[llvm] [WIP][X86] combineX86ShufflesRecursively - attempt to combine shuffles with larger types from EXTRACT_SUBVECTOR nodes (PR #133947)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Wed Apr 2 02:10:11 PDT 2025
https://github.com/RKSimon updated https://github.com/llvm/llvm-project/pull/133947
>From b54089c363d54bbb754cb9de5b184d483953e5d2 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Tue, 1 Apr 2025 18:30:11 +0100
Subject: [PATCH] [WIP][X86] combineX86ShufflesRecursively - attempt to combine
shuffles with larger types from EXTRACT_SUBVECTOR nodes
This replaces the rather limited combineX86ShuffleChainWithExtract function with handling for EXTRACT_SUBVECTOR node as we recurse down the shuffle chain, widening the shuffle mask to accommodate the larger value type.
This will mainly help AVX2/AVX512 cases with cross-lane shuffles, but it also helps collapse some cases where the same subvector has gotten reused in multiple lanes.
Exposed missing DemandedElts handling inside ISD::TRUNCATE nodes for ComputeNumSignBits
---
.../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 3 +-
llvm/lib/Target/X86/X86ISelLowering.cpp | 230 +-
.../any_extend_vector_inreg_of_broadcast.ll | 432 +-
...d_vector_inreg_of_broadcast_from_memory.ll | 60 +-
llvm/test/CodeGen/X86/avx512-cvt.ll | 8 +-
llvm/test/CodeGen/X86/avx512-hadd-hsub.ll | 7 +-
.../X86/avx512-shuffles/partial_permute.ll | 995 +-
llvm/test/CodeGen/X86/bfloat.ll | 53 +-
llvm/test/CodeGen/X86/combine-pmuldq.ll | 26 +-
llvm/test/CodeGen/X86/combine-sra.ll | 60 +-
.../copy-low-subvec-elt-to-high-subvec-elt.ll | 38 +-
llvm/test/CodeGen/X86/haddsub-undef.ll | 93 +-
llvm/test/CodeGen/X86/haddsub.ll | 66 +-
.../test/CodeGen/X86/known-signbits-vector.ll | 66 +-
llvm/test/CodeGen/X86/madd.ll | 6 +-
llvm/test/CodeGen/X86/masked_compressstore.ll | 453 +-
.../test/CodeGen/X86/masked_gather_scatter.ll | 38 +-
llvm/test/CodeGen/X86/masked_store.ll | 44 +-
llvm/test/CodeGen/X86/matrix-multiply.ll | 54 +-
.../CodeGen/X86/min-legal-vector-width.ll | 10 +-
llvm/test/CodeGen/X86/oddshuffles.ll | 8 +-
llvm/test/CodeGen/X86/pmul.ll | 35 +-
llvm/test/CodeGen/X86/pr132844.ll | 10 +-
llvm/test/CodeGen/X86/pr29112.ll | 4 +-
llvm/test/CodeGen/X86/pr62286.ll | 5 +-
llvm/test/CodeGen/X86/pr95278.ll | 4 +-
llvm/test/CodeGen/X86/pr97968.ll | 4 +-
llvm/test/CodeGen/X86/shift-i512.ll | 18 +-
llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll | 384 +-
llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll | 24 +-
llvm/test/CodeGen/X86/trunc-subvector.ll | 19 +-
.../CodeGen/X86/vec-strict-fptoint-256.ll | 70 +-
.../CodeGen/X86/vec-strict-fptoint-512.ll | 33 +-
.../CodeGen/X86/vec-strict-inttofp-256.ll | 844 +-
.../CodeGen/X86/vec-strict-inttofp-512.ll | 152 +-
llvm/test/CodeGen/X86/vec_fp_to_int.ll | 52 +-
llvm/test/CodeGen/X86/vector-compress.ll | 39 +-
.../CodeGen/X86/vector-half-conversions.ll | 624 +-
.../vector-interleaved-load-i16-stride-3.ll | 3766 ++--
.../vector-interleaved-load-i16-stride-4.ll | 2028 +--
.../vector-interleaved-load-i16-stride-5.ll | 9492 +++++-----
.../vector-interleaved-load-i16-stride-6.ll | 8731 +++++----
.../vector-interleaved-load-i16-stride-7.ll | 7156 ++++----
.../vector-interleaved-load-i16-stride-8.ll | 6688 +++----
.../vector-interleaved-load-i32-stride-3.ll | 566 +-
.../vector-interleaved-load-i32-stride-5.ll | 390 +-
.../vector-interleaved-load-i32-stride-6.ll | 2544 +--
.../vector-interleaved-load-i32-stride-7.ll | 3506 ++--
.../vector-interleaved-load-i64-stride-3.ll | 128 +-
.../vector-interleaved-load-i64-stride-4.ll | 88 +-
.../vector-interleaved-load-i64-stride-5.ll | 368 +-
.../vector-interleaved-load-i64-stride-6.ll | 132 +-
.../vector-interleaved-load-i64-stride-7.ll | 14817 ++++++++--------
.../vector-interleaved-load-i64-stride-8.ll | 220 +-
.../vector-interleaved-load-i8-stride-3.ll | 71 +-
.../vector-interleaved-load-i8-stride-5.ll | 296 +-
.../vector-interleaved-load-i8-stride-7.ll | 8000 ++++-----
.../vector-interleaved-load-i8-stride-8.ll | 10748 +++++------
.../vector-interleaved-store-i16-stride-3.ll | 1492 +-
.../vector-interleaved-store-i16-stride-5.ll | 188 +-
.../vector-interleaved-store-i16-stride-6.ll | 2250 +--
.../vector-interleaved-store-i16-stride-7.ll | 8552 +++++----
.../vector-interleaved-store-i32-stride-6.ll | 2094 +--
.../vector-interleaved-store-i32-stride-7.ll | 946 +-
.../vector-interleaved-store-i32-stride-8.ll | 2 +-
.../vector-interleaved-store-i64-stride-5.ll | 631 +-
.../vector-interleaved-store-i8-stride-5.ll | 808 +-
.../vector-interleaved-store-i8-stride-6.ll | 486 +-
.../vector-interleaved-store-i8-stride-7.ll | 5512 +++---
.../vector-interleaved-store-i8-stride-8.ll | 645 +-
llvm/test/CodeGen/X86/vector-llrint.ll | 16 +-
llvm/test/CodeGen/X86/vector-lrint.ll | 16 +-
llvm/test/CodeGen/X86/vector-reduce-fadd.ll | 558 +-
llvm/test/CodeGen/X86/vector-reduce-fmax.ll | 101 +-
llvm/test/CodeGen/X86/vector-reduce-fmin.ll | 101 +-
llvm/test/CodeGen/X86/vector-reduce-fmul.ll | 222 +-
.../CodeGen/X86/vector-shuffle-128-unpck.ll | 60 +-
.../CodeGen/X86/vector-shuffle-256-v16.ll | 105 +-
.../CodeGen/X86/vector-shuffle-256-v32.ll | 52 +-
.../test/CodeGen/X86/vector-shuffle-256-v4.ll | 56 +-
.../test/CodeGen/X86/vector-shuffle-256-v8.ll | 353 +-
.../CodeGen/X86/vector-shuffle-512-v16.ll | 41 +-
.../CodeGen/X86/vector-shuffle-512-v64.ll | 124 +-
.../test/CodeGen/X86/vector-shuffle-512-v8.ll | 3 +-
.../test/CodeGen/X86/vector-shuffle-avx512.ll | 93 +-
.../X86/vector-shuffle-combining-avx.ll | 76 +-
.../CodeGen/X86/vector-shuffle-combining.ll | 25 +-
llvm/test/CodeGen/X86/vector-shuffle-v192.ll | 62 +-
llvm/test/CodeGen/X86/vector-trunc.ll | 56 +-
llvm/test/CodeGen/X86/vselect-packss.ll | 6 +-
llvm/test/CodeGen/X86/widen_fadd.ll | 31 +-
llvm/test/CodeGen/X86/widen_fdiv.ll | 48 +-
llvm/test/CodeGen/X86/widen_fmul.ll | 31 +-
llvm/test/CodeGen/X86/widen_fsub.ll | 31 +-
.../CodeGen/X86/x86-interleaved-access.ll | 196 +-
.../CodeGen/X86/zero_extend_vector_inreg.ll | 70 +-
.../zero_extend_vector_inreg_of_broadcast.ll | 911 +-
...d_vector_inreg_of_broadcast_from_memory.ll | 191 +-
98 files changed, 56574 insertions(+), 55473 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 3526beeb312ce..2da00a42f0f9c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5067,7 +5067,8 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
case ISD::TRUNCATE: {
// Check if the sign bits of source go down as far as the truncated value.
unsigned NumSrcBits = Op.getOperand(0).getScalarValueSizeInBits();
- unsigned NumSrcSignBits = ComputeNumSignBits(Op.getOperand(0), Depth + 1);
+ unsigned NumSrcSignBits =
+ ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
if (NumSrcSignBits > (NumSrcBits - VTBits))
return NumSrcSignBits - (NumSrcBits - VTBits);
break;
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 8e6a891444bf1..6b7d82bdcedf1 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -39653,13 +39653,6 @@ static bool matchBinaryPermuteShuffle(
return false;
}
-static SDValue combineX86ShuffleChainWithExtract(
- ArrayRef<SDValue> Inputs, unsigned RootOpcode, MVT RootVT,
- ArrayRef<int> BaseMask, int Depth, ArrayRef<const SDNode *> SrcNodes,
- bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask,
- bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL,
- const X86Subtarget &Subtarget);
-
/// Combine an arbitrary chain of shuffles into a single instruction if
/// possible.
///
@@ -40203,14 +40196,6 @@ static SDValue combineX86ShuffleChain(
return DAG.getBitcast(RootVT, Res);
}
- // If that failed and either input is extracted then try to combine as a
- // shuffle with the larger type.
- if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
- Inputs, RootOpc, RootVT, BaseMask, Depth, SrcNodes,
- AllowVariableCrossLaneMask, AllowVariablePerLaneMask,
- IsMaskedShuffle, DAG, DL, Subtarget))
- return WideShuffle;
-
// If we have a dual input lane-crossing shuffle then lower to VPERMV3,
// (non-VLX will pad to 512-bit shuffles).
if (AllowVariableCrossLaneMask && !MaskContainsZeros &&
@@ -40376,14 +40361,6 @@ static SDValue combineX86ShuffleChain(
return DAG.getBitcast(RootVT, Res);
}
- // If that failed and either input is extracted then try to combine as a
- // shuffle with the larger type.
- if (SDValue WideShuffle = combineX86ShuffleChainWithExtract(
- Inputs, RootOpc, RootVT, BaseMask, Depth, SrcNodes,
- AllowVariableCrossLaneMask, AllowVariablePerLaneMask, IsMaskedShuffle,
- DAG, DL, Subtarget))
- return WideShuffle;
-
// If we have a dual input shuffle then lower to VPERMV3,
// (non-VLX will pad to 512-bit shuffles)
if (!UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros &&
@@ -40409,154 +40386,6 @@ static SDValue combineX86ShuffleChain(
return SDValue();
}
-// Combine an arbitrary chain of shuffles + extract_subvectors into a single
-// instruction if possible.
-//
-// Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger
-// type size to attempt to combine:
-// shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1)
-// -->
-// extract_subvector(shuffle(x,y,m2),0)
-static SDValue combineX86ShuffleChainWithExtract(
- ArrayRef<SDValue> Inputs, unsigned RootOpcode, MVT RootVT,
- ArrayRef<int> BaseMask, int Depth, ArrayRef<const SDNode *> SrcNodes,
- bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask,
- bool IsMaskedShuffle, SelectionDAG &DAG, const SDLoc &DL,
- const X86Subtarget &Subtarget) {
- unsigned NumMaskElts = BaseMask.size();
- unsigned NumInputs = Inputs.size();
- if (NumInputs == 0)
- return SDValue();
-
- unsigned RootSizeInBits = RootVT.getSizeInBits();
- unsigned RootEltSizeInBits = RootSizeInBits / NumMaskElts;
- assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask");
-
- // Peek through subvectors to find widest legal vector.
- // TODO: Handle ISD::TRUNCATE
- unsigned WideSizeInBits = RootSizeInBits;
- for (SDValue Input : Inputs) {
- Input = peekThroughBitcasts(Input);
- while (1) {
- if (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
- Input = peekThroughBitcasts(Input.getOperand(0));
- continue;
- }
- if (Input.getOpcode() == ISD::INSERT_SUBVECTOR &&
- Input.getOperand(0).isUndef()) {
- Input = peekThroughBitcasts(Input.getOperand(1));
- continue;
- }
- break;
- }
- if (DAG.getTargetLoweringInfo().isTypeLegal(Input.getValueType()) &&
- WideSizeInBits < Input.getValueSizeInBits())
- WideSizeInBits = Input.getValueSizeInBits();
- }
-
- // Bail if we fail to find a source larger than the existing root.
- unsigned Scale = WideSizeInBits / RootSizeInBits;
- if (WideSizeInBits <= RootSizeInBits ||
- (WideSizeInBits % RootSizeInBits) != 0)
- return SDValue();
-
- // Create new mask for larger type.
- SmallVector<int, 64> WideMask(BaseMask);
- for (int &M : WideMask) {
- if (M < 0)
- continue;
- M = (M % NumMaskElts) + ((M / NumMaskElts) * Scale * NumMaskElts);
- }
- WideMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef);
-
- // Attempt to peek through inputs and adjust mask when we extract from an
- // upper subvector.
- int AdjustedMasks = 0;
- SmallVector<SDValue, 4> WideInputs(Inputs);
- for (unsigned I = 0; I != NumInputs; ++I) {
- SDValue &Input = WideInputs[I];
- Input = peekThroughBitcasts(Input);
- while (1) {
- if (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
- Input.getOperand(0).getValueSizeInBits() <= WideSizeInBits) {
- uint64_t Idx = Input.getConstantOperandVal(1);
- if (Idx != 0) {
- ++AdjustedMasks;
- unsigned InputEltSizeInBits = Input.getScalarValueSizeInBits();
- Idx = (Idx * InputEltSizeInBits) / RootEltSizeInBits;
-
- int lo = I * WideMask.size();
- int hi = (I + 1) * WideMask.size();
- for (int &M : WideMask)
- if (lo <= M && M < hi)
- M += Idx;
- }
- Input = peekThroughBitcasts(Input.getOperand(0));
- continue;
- }
- // TODO: Handle insertions into upper subvectors.
- if (Input.getOpcode() == ISD::INSERT_SUBVECTOR &&
- Input.getOperand(0).isUndef() &&
- isNullConstant(Input.getOperand(2))) {
- Input = peekThroughBitcasts(Input.getOperand(1));
- continue;
- }
- break;
- }
- }
-
- // Remove unused/repeated shuffle source ops.
- resolveTargetShuffleInputsAndMask(WideInputs, WideMask);
- assert(!WideInputs.empty() && "Shuffle with no inputs detected");
-
- // Bail if we're always extracting from the lowest subvectors,
- // combineX86ShuffleChain should match this for the current width, or the
- // shuffle still references too many inputs.
- if (AdjustedMasks == 0 || WideInputs.size() > 2)
- return SDValue();
-
- // Minor canonicalization of the accumulated shuffle mask to make it easier
- // to match below. All this does is detect masks with sequential pairs of
- // elements, and shrink them to the half-width mask. It does this in a loop
- // so it will reduce the size of the mask to the minimal width mask which
- // performs an equivalent shuffle.
- while (WideMask.size() > 1) {
- SmallVector<int, 64> WidenedMask;
- if (!canWidenShuffleElements(WideMask, WidenedMask))
- break;
- WideMask = std::move(WidenedMask);
- }
-
- // Canonicalization of binary shuffle masks to improve pattern matching by
- // commuting the inputs.
- if (WideInputs.size() == 2 && canonicalizeShuffleMaskWithCommute(WideMask)) {
- ShuffleVectorSDNode::commuteMask(WideMask);
- std::swap(WideInputs[0], WideInputs[1]);
- }
-
- // Increase depth for every upper subvector we've peeked through.
- Depth += AdjustedMasks;
-
- // Attempt to combine wider chain.
- // TODO: Can we use a better Root?
- SDValue WideRoot = WideInputs.front().getValueSizeInBits() >
- WideInputs.back().getValueSizeInBits()
- ? WideInputs.front()
- : WideInputs.back();
- assert(WideRoot.getValueSizeInBits() == WideSizeInBits &&
- "WideRootSize mismatch");
-
- if (SDValue WideShuffle = combineX86ShuffleChain(
- WideInputs, RootOpcode, WideRoot.getSimpleValueType(), WideMask,
- Depth, SrcNodes, AllowVariableCrossLaneMask, AllowVariablePerLaneMask,
- IsMaskedShuffle, DAG, SDLoc(WideRoot), Subtarget)) {
- WideShuffle = extractSubVector(WideShuffle, 0, DAG, DL, RootSizeInBits);
- return DAG.getBitcast(RootVT, WideShuffle);
- }
-
- return SDValue();
-}
-
// Canonicalize the combined shuffle mask chain with horizontal ops.
// NOTE: This may update the Ops and Mask.
static SDValue canonicalizeShuffleMaskWithHorizOp(
@@ -40969,6 +40798,57 @@ static SDValue combineX86ShufflesRecursively(
OpMask.assign(NumElts, SM_SentinelUndef);
std::iota(OpMask.begin(), OpMask.end(), ExtractIdx);
OpZero = OpUndef = APInt::getZero(NumElts);
+ } else if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ TLI.isTypeLegal(Op.getOperand(0).getValueType()) &&
+ Op.getOperand(0).getValueSizeInBits() > RootSizeInBits &&
+ (Op.getOperand(0).getValueSizeInBits() % RootSizeInBits) == 0) {
+ // Extracting from vector larger than RootVT - scale the mask and attempt to
+ // fold the shuffle with the larger root type, then extract the lower
+ // elements.
+ unsigned Scale = Op.getOperand(0).getValueSizeInBits() / RootSizeInBits;
+ MVT NewRootVT = MVT::getVectorVT(RootVT.getScalarType(),
+ Scale * RootVT.getVectorNumElements());
+ SmallVector<int, 64> NewRootMask(RootMask);
+ NewRootMask.append((Scale - 1) * RootMask.size(), SM_SentinelUndef);
+ for (int &M : NewRootMask)
+ if (0 <= M)
+ M = (M % RootMask.size()) +
+ ((M / RootMask.size()) * NewRootMask.size());
+ // If we're using the lowest subvector, just replace it directly in the src
+ // ops/nodes.
+ SmallVector<SDValue, 16> NewSrcOps(SrcOps);
+ SmallVector<const SDNode *, 16> NewSrcNodes(SrcNodes);
+ if (isNullConstant(Op.getOperand(1))) {
+ NewSrcOps[SrcOpIndex] = Op.getOperand(0);
+ NewSrcNodes.push_back(Op.getNode());
+ }
+ // Don't increase the combine depth - we're effectively working on the same
+ // nodes, just with a wider type.
+ if (SDValue WideShuffle = combineX86ShufflesRecursively(
+ NewSrcOps, SrcOpIndex, RootOpc, NewRootVT, NewRootMask, NewSrcNodes,
+ Depth, MaxDepth, AllowVariableCrossLaneMask,
+ AllowVariablePerLaneMask, IsMaskedShuffle, DAG, DL, Subtarget))
+ return DAG.getBitcast(
+ RootVT, extractSubVector(WideShuffle, 0, DAG, DL, RootSizeInBits));
+ return SDValue();
+ } else if (Op.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ Op.getOperand(1).getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ Op.getOperand(1).getOperand(0).getValueSizeInBits() >
+ RootSizeInBits) {
+    // If we're inserting a subvector extracted from a vector larger than
+    // RootVT, then combine the insert_subvector as a shuffle; the
+    // extract_subvector will be folded in a later recursion.
+ SDValue BaseVec = Op.getOperand(0);
+ SDValue SubVec = Op.getOperand(1);
+ int InsertIdx = Op.getConstantOperandVal(2);
+ unsigned NumBaseElts = VT.getVectorNumElements();
+ unsigned NumSubElts = SubVec.getValueType().getVectorNumElements();
+ OpInputs.assign({BaseVec, SubVec});
+ OpMask.assign(NumBaseElts, SM_SentinelUndef);
+ std::iota(OpMask.begin(), OpMask.end(), 0);
+ std::iota(OpMask.begin() + InsertIdx,
+ OpMask.begin() + InsertIdx + NumSubElts, NumBaseElts);
+ OpZero = OpUndef = APInt::getZero(NumBaseElts);
} else {
return SDValue();
}
@@ -41324,12 +41204,7 @@ static SDValue combineX86ShufflesRecursively(
return SDValue();
}
- // If that failed and any input is extracted then try to combine as a
- // shuffle with the larger type.
- return combineX86ShuffleChainWithExtract(
- Ops, RootOpc, RootVT, Mask, Depth, CombinedNodes,
- AllowVariableCrossLaneMask, AllowVariablePerLaneMask, IsMaskedShuffle,
- DAG, DL, Subtarget);
+ return SDValue();
}
/// Helper entry wrapper to combineX86ShufflesRecursively.
@@ -43866,6 +43741,7 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
case X86ISD::UNPCKL:
case X86ISD::UNPCKH:
case X86ISD::BLENDI:
+ case X86ISD::SHUFP:
// Integer ops.
case X86ISD::PACKSS:
case X86ISD::PACKUS:
diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
index 6f4e7abda8b00..cb7da854e26d4 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
@@ -747,31 +747,23 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in.
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
-; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
-; AVX512BW-SLOW: # %bb.0:
-; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,0,11,0,13,0,15]
-; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm1, %zmm0
-; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-SLOW-NEXT: vzeroupper
-; AVX512BW-SLOW-NEXT: retq
-;
-; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
-; AVX512BW-FAST: # %bb.0:
-; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,0,11,0,13,6,7]
-; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vpermw %zmm0, %zmm1, %zmm1
-; AVX512BW-FAST-NEXT: vmovd %xmm0, %eax
-; AVX512BW-FAST-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
-; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7]
-; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-FAST-NEXT: vzeroupper
-; AVX512BW-FAST-NEXT: retq
+; AVX512BW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
+; AVX512BW-NEXT: vmovd %xmm0, %eax
+; AVX512BW-NEXT: vpinsrw $2, %eax, %xmm2, %xmm0
+; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
+; AVX512BW-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
+; AVX512BW-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
+; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
+; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
%in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
%in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
%in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
@@ -865,31 +857,19 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in.
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
-; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
-; AVX512BW-SLOW: # %bb.0:
-; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,10,11,0,13,6,7]
-; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm1, %zmm1
-; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-SLOW-NEXT: vzeroupper
-; AVX512BW-SLOW-NEXT: retq
-;
-; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
-; AVX512BW-FAST: # %bb.0:
-; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,10,11,0,5,6,7]
-; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vpermw %zmm0, %zmm1, %zmm1
-; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7]
-; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-FAST-NEXT: vzeroupper
-; AVX512BW-FAST-NEXT: retq
+; AVX512BW-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1,2,3],xmm0[4,5,6,7]
+; AVX512BW-NEXT: vmovd %xmm0, %eax
+; AVX512BW-NEXT: vpinsrw $4, %eax, %xmm2, %xmm0
+; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
+; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
%in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
%in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
%in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
@@ -941,73 +921,91 @@ define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in.
; AVX-NEXT: vmovdqa %xmm0, (%rcx)
; AVX-NEXT: retq
;
-; AVX2-SLOW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-PERLANE-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
-; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastd %xmm0, %xmm0
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX2-FAST-PERLANE-NEXT: vzeroupper
-; AVX2-FAST-PERLANE-NEXT: retq
-;
-; AVX2-FAST-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-FAST-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,0,7]
-; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0
-; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX2-FAST-NEXT: vzeroupper
-; AVX2-FAST-NEXT: retq
-;
-; AVX512F-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,0,7]
-; AVX512F-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1
-; AVX512F-NEXT: vpermd %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
+; AVX2-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,0,7]
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
;
-; AVX512DQ-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,0,7]
-; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1
-; AVX512DQ-NEXT: vpermd %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
-; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
+; AVX512F-SLOW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
+; AVX512F-SLOW: # %bb.0:
+; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512F-SLOW-NEXT: vmovd %xmm0, %eax
+; AVX512F-SLOW-NEXT: vpinsrd $2, %eax, %xmm2, %xmm0
+; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
+; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0
+; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rcx)
+; AVX512F-SLOW-NEXT: vzeroupper
+; AVX512F-SLOW-NEXT: retq
;
-; AVX512BW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,0,7]
-; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
+; AVX512F-FAST-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
+; AVX512F-FAST: # %bb.0:
+; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX512F-FAST-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,0,7]
+; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0
+; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx)
+; AVX512F-FAST-NEXT: vzeroupper
+; AVX512F-FAST-NEXT: retq
+;
+; AVX512DQ-SLOW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
+; AVX512DQ-SLOW: # %bb.0:
+; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512DQ-SLOW-NEXT: vmovd %xmm0, %eax
+; AVX512DQ-SLOW-NEXT: vpinsrd $2, %eax, %xmm2, %xmm0
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
+; AVX512DQ-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0
+; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, (%rcx)
+; AVX512DQ-SLOW-NEXT: vzeroupper
+; AVX512DQ-SLOW-NEXT: retq
+;
+; AVX512DQ-FAST-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
+; AVX512DQ-FAST: # %bb.0:
+; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,0,7]
+; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0
+; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rcx)
+; AVX512DQ-FAST-NEXT: vzeroupper
+; AVX512DQ-FAST-NEXT: retq
+;
+; AVX512BW-SLOW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
+; AVX512BW-SLOW: # %bb.0:
+; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512BW-SLOW-NEXT: vmovd %xmm0, %eax
+; AVX512BW-SLOW-NEXT: vpinsrd $2, %eax, %xmm2, %xmm0
+; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
+; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-SLOW-NEXT: vzeroupper
+; AVX512BW-SLOW-NEXT: retq
+;
+; AVX512BW-FAST-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
+; AVX512BW-FAST: # %bb.0:
+; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,0,7]
+; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0
+; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-FAST-NEXT: vzeroupper
+; AVX512BW-FAST-NEXT: retq
%in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
%in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
%in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
@@ -2011,28 +2009,16 @@ define void @vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4(ptr %in.
; AVX512DQ-FAST-NEXT: vzeroupper
; AVX512DQ-FAST-NEXT: retq
;
-; AVX512BW-SLOW-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4:
-; AVX512BW-SLOW: # %bb.0:
-; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-SLOW-NEXT: vpbroadcastq %xmm0, %ymm0
-; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
-; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-SLOW-NEXT: vzeroupper
-; AVX512BW-SLOW-NEXT: retq
-;
-; AVX512BW-FAST-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4:
-; AVX512BW-FAST: # %bb.0:
-; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,9,0,11,0,13,0,15]
-; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0
-; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-FAST-NEXT: vzeroupper
-; AVX512BW-FAST-NEXT: retq
+; AVX512BW-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,9,0,11,0,13,0,15]
+; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
%in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
%in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
%in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
@@ -2163,28 +2149,16 @@ define void @vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2(ptr %i
; AVX512DQ-FAST-NEXT: vzeroupper
; AVX512DQ-FAST-NEXT: retq
;
-; AVX512BW-SLOW-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2:
-; AVX512BW-SLOW: # %bb.0:
-; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
-; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-SLOW-NEXT: vzeroupper
-; AVX512BW-SLOW-NEXT: retq
-;
-; AVX512BW-FAST-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2:
-; AVX512BW-FAST: # %bb.0:
-; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,9,10,11,0,13,14,15]
-; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0
-; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-FAST-NEXT: vzeroupper
-; AVX512BW-FAST-NEXT: retq
+; AVX512BW-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,9,10,11,0,13,14,15]
+; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
%in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
%in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
%in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
@@ -2315,28 +2289,16 @@ define void @vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2(ptr %i
; AVX512DQ-FAST-NEXT: vzeroupper
; AVX512DQ-FAST-NEXT: retq
;
-; AVX512BW-SLOW-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2:
-; AVX512BW-SLOW: # %bb.0:
-; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-SLOW-NEXT: vpbroadcastq %xmm0, %ymm0
-; AVX512BW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-SLOW-NEXT: vzeroupper
-; AVX512BW-SLOW-NEXT: retq
-;
-; AVX512BW-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2:
-; AVX512BW-FAST: # %bb.0:
-; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,5,0,7]
-; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vpermq %zmm0, %zmm1, %zmm0
-; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-FAST-NEXT: vzeroupper
-; AVX512BW-FAST-NEXT: retq
+; AVX512BW-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,5,0,7]
+; AVX512BW-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
%in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
%in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
%in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
@@ -4512,30 +4474,17 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
-; AVX512BW-SLOW-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
-; AVX512BW-SLOW: # %bb.0:
-; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
-; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1]
-; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4,5,6,7]
-; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-SLOW-NEXT: vzeroupper
-; AVX512BW-SLOW-NEXT: retq
-;
-; AVX512BW-FAST-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
-; AVX512BW-FAST: # %bb.0:
-; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,13,14,15,0,1,2,3]
-; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1
-; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-FAST-NEXT: vzeroupper
-; AVX512BW-FAST-NEXT: retq
+; AVX512BW-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,13,14,15,0,1,2,3]
+; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm1
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
%in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
%in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
%in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
@@ -4646,10 +4595,9 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i
; AVX512BW-FAST-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2:
; AVX512BW-FAST: # %bb.0:
; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,13,14,15,0,1,2,3]
; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,13,14,15,0,1,0,1]
; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0
-; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,2]
; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-FAST-NEXT: vzeroupper
@@ -4769,30 +4717,17 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
-; AVX512BW-SLOW-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
-; AVX512BW-SLOW: # %bb.0:
-; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
-; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,0,3]
-; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-SLOW-NEXT: vzeroupper
-; AVX512BW-SLOW-NEXT: retq
-;
-; AVX512BW-FAST-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
-; AVX512BW-FAST: # %bb.0:
-; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,7,0,7]
-; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vpermq %zmm0, %zmm1, %zmm1
-; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-FAST-NEXT: vzeroupper
-; AVX512BW-FAST-NEXT: retq
+; AVX512BW-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,7,0,7]
+; AVX512BW-NEXT: vpermq %zmm0, %zmm1, %zmm1
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
%in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
%in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
%in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
@@ -4890,28 +4825,16 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
-; AVX512BW-SLOW-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
-; AVX512BW-SLOW: # %bb.0:
-; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT: vpmovsxbq {{.*#+}} xmm1 = [0,7]
-; AVX512BW-SLOW-NEXT: vpermq %zmm0, %zmm1, %zmm0
-; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-SLOW-NEXT: vzeroupper
-; AVX512BW-SLOW-NEXT: retq
-;
-; AVX512BW-FAST-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
-; AVX512BW-FAST: # %bb.0:
-; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,7,0,7]
-; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vpermq %zmm0, %zmm1, %zmm0
-; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,2]
-; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-FAST-NEXT: vzeroupper
-; AVX512BW-FAST-NEXT: retq
+; AVX512BW-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm1 = [0,7]
+; AVX512BW-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
%in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
%in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
%in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
@@ -5576,6 +5499,9 @@ define void @vec512_i128_widen_to_i256_factor2_broadcast_to_v2i256_factor2(ptr %
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; AVX1-ONLY: {{.*}}
+; AVX2-FAST: {{.*}}
+; AVX2-FAST-PERLANE: {{.*}}
+; AVX2-SLOW: {{.*}}
; FALLBACK0: {{.*}}
; FALLBACK1: {{.*}}
; FALLBACK10: {{.*}}
diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
index 52f856befa130..9dbfe3bbeb9e4 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll
@@ -641,25 +641,16 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in.
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
-; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
-; AVX512BW-SLOW: # %bb.0:
-; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,0,15]
-; AVX512BW-SLOW-NEXT: vpermw (%rdi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rdx)
-; AVX512BW-SLOW-NEXT: vzeroupper
-; AVX512BW-SLOW-NEXT: retq
-;
-; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
-; AVX512BW-FAST: # %bb.0:
-; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,6,7]
-; AVX512BW-FAST-NEXT: vpermw (%rdi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vpinsrw $6, (%rdi), %xmm0, %xmm0
-; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],mem[7]
-; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rdx)
-; AVX512BW-FAST-NEXT: vzeroupper
-; AVX512BW-FAST-NEXT: retq
+; AVX512BW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [16,25,16,27,16,29,0,23]
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm1
+; AVX512BW-NEXT: vpermt2w (%rdi), %ymm0, %ymm1
+; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],mem[7]
+; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
%in.vec = load <64 x i8>, ptr %in.elt.ptr, align 64
%in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
%broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> poison, <8 x i32> <i32 0, i32 9, i32 0, i32 11, i32 0, i32 13, i32 0, i32 15>
@@ -735,25 +726,18 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in.
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
-; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
-; AVX512BW-SLOW: # %bb.0:
-; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,10,11,0,13,6,7]
-; AVX512BW-SLOW-NEXT: vpermw (%rdi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3]
-; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rdx)
-; AVX512BW-SLOW-NEXT: vzeroupper
-; AVX512BW-SLOW-NEXT: retq
-;
-; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
-; AVX512BW-FAST: # %bb.0:
-; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,10,11,0,5,6,7]
-; AVX512BW-FAST-NEXT: vpermw (%rdi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],mem[5,6,7]
-; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rdx)
-; AVX512BW-FAST-NEXT: vzeroupper
-; AVX512BW-FAST-NEXT: retq
+; AVX512BW-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512BW-NEXT: vmovd %xmm0, %eax
+; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4,5,6,7]
+; AVX512BW-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
+; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
%in.vec = load <64 x i8>, ptr %in.elt.ptr, align 64
%in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
%broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> poison, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 0, i32 13, i32 14, i32 15>
diff --git a/llvm/test/CodeGen/X86/avx512-cvt.ll b/llvm/test/CodeGen/X86/avx512-cvt.ll
index a78d97782e6a3..b25fb7631fadd 100644
--- a/llvm/test/CodeGen/X86/avx512-cvt.ll
+++ b/llvm/test/CodeGen/X86/avx512-cvt.ll
@@ -179,11 +179,11 @@ define <4 x i64> @f64to4sl(<4 x double> %a) {
; NODQ: # %bb.0:
; NODQ-NEXT: vextractf128 $1, %ymm0, %xmm1
; NODQ-NEXT: vcvttsd2si %xmm1, %rax
-; NODQ-NEXT: vmovq %rax, %xmm2
-; NODQ-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; NODQ-NEXT: vcvttsd2si %xmm1, %rax
; NODQ-NEXT: vmovq %rax, %xmm1
-; NODQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; NODQ-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[3,2,2,3]
+; NODQ-NEXT: vcvttsd2si %xmm2, %rax
+; NODQ-NEXT: vmovq %rax, %xmm2
+; NODQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; NODQ-NEXT: vcvttsd2si %xmm0, %rax
; NODQ-NEXT: vmovq %rax, %xmm2
; NODQ-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
diff --git a/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll b/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll
index ae3777453c68d..c9c5666079e58 100644
--- a/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll
+++ b/llvm/test/CodeGen/X86/avx512-hadd-hsub.ll
@@ -221,9 +221,10 @@ define double @fsub_noundef_ee (<8 x double> %x225, <8 x double> %x227) {
;
; SKX-LABEL: fsub_noundef_ee:
; SKX: # %bb.0:
-; SKX-NEXT: vextractf32x4 $2, %zmm1, %xmm0
-; SKX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
-; SKX-NEXT: vsubsd %xmm1, %xmm0, %xmm0
+; SKX-NEXT: vpmovsxbq {{.*#+}} xmm0 = [5,4]
+; SKX-NEXT: vpermpd %zmm1, %zmm0, %zmm0
+; SKX-NEXT: vextractf32x4 $2, %zmm1, %xmm1
+; SKX-NEXT: vsubsd %xmm0, %xmm1, %xmm0
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
%x226 = shufflevector <8 x double> %x225, <8 x double> %x227, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
index a84424bf7dea9..7e2364a1350bc 100644
--- a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
+++ b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
@@ -1374,11 +1374,10 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask0(<16 x i32> %vec, <4 x
define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask1(<16 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
-; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [5,1,3,4]
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
+; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [13,9,11,12]
; CHECK-NEXT: vptestnmd %xmm2, %xmm2, %k1
-; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1}
+; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1}
; CHECK-NEXT: vmovdqa %xmm1, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
@@ -1391,11 +1390,10 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mask1(<16 x i32> %vec, <4 x i
define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mask1(<16 x i32> %vec, <4 x i32> %mask) {
; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [5,1,3,4]
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [13,9,11,12]
; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1
-; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z}
-; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 13, i32 9, i32 11, i32 12>
@@ -1773,24 +1771,13 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask3(ptr %vp, <4 x i32
}
define <4 x i32> @test_16xi32_to_4xi32_perm_mask9(<16 x i32> %vec) {
-; CHECK-FAST-LABEL: test_16xi32_to_4xi32_perm_mask9:
-; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: vpmovsxbd {{.*#+}} xmm1 = [12,9,4,10]
-; CHECK-FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0
-; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; CHECK-FAST-NEXT: vzeroupper
-; CHECK-FAST-NEXT: retq
-;
-; CHECK-FAST-PERLANE-LABEL: test_16xi32_to_4xi32_perm_mask9:
-; CHECK-FAST-PERLANE: # %bb.0:
-; CHECK-FAST-PERLANE-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,1,0,2]
-; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm1, %ymm1
-; CHECK-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2
-; CHECK-FAST-PERLANE-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,1,4,3]
-; CHECK-FAST-PERLANE-NEXT: vpermi2d %xmm2, %xmm1, %xmm0
-; CHECK-FAST-PERLANE-NEXT: vzeroupper
-; CHECK-FAST-PERLANE-NEXT: retq
+; CHECK-LABEL: test_16xi32_to_4xi32_perm_mask9:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [12,9,4,10]
+; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> <i32 12, i32 9, i32 4, i32 10>
ret <4 x i32> %res
}
@@ -1930,103 +1917,101 @@ define <2 x i64> @test_masked_z_4xi64_to_2xi64_perm_mem_mask1(ptr %vp, <2 x i64>
}
define <4 x i64> @test_8xi64_to_4xi64_perm_mask0(<8 x i64> %vec) {
-; CHECK-LABEL: test_8xi64_to_4xi64_perm_mask0:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm0
-; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,1]
-; CHECK-NEXT: retq
+; CHECK-FAST-LABEL: test_8xi64_to_4xi64_perm_mask0:
+; CHECK-FAST: # %bb.0:
+; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [6,7,6,5]
+; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; CHECK-FAST-NEXT: retq
+;
+; CHECK-FAST-PERLANE-LABEL: test_8xi64_to_4xi64_perm_mask0:
+; CHECK-FAST-PERLANE: # %bb.0:
+; CHECK-FAST-PERLANE-NEXT: vextractf64x4 $1, %zmm0, %ymm0
+; CHECK-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,1]
+; CHECK-FAST-PERLANE-NEXT: retq
%res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 7, i32 6, i32 5>
ret <4 x i64> %res
}
define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask0(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
-; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask0:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
-; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
-; CHECK-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,1]
-; CHECK-NEXT: vmovdqa %ymm1, %ymm0
-; CHECK-NEXT: retq
- %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 7, i32 6, i32 5>
- %cmp = icmp eq <4 x i64> %mask, zeroinitializer
- %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
- ret <4 x i64> %res
-}
-
-define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask0(<8 x i64> %vec, <4 x i64> %mask) {
-; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask0:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
-; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
-; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,1]
-; CHECK-NEXT: retq
- %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 7, i32 6, i32 5>
- %cmp = icmp eq <4 x i64> %mask, zeroinitializer
- %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
- ret <4 x i64> %res
-}
-define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
-; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mask1:
+; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mask0:
; CHECK-FAST: # %bb.0:
; CHECK-FAST-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
-; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [6,4,6,1]
+; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [6,7,6,5]
; CHECK-FAST-NEXT: vptestnmq %ymm2, %ymm2, %k1
; CHECK-FAST-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1}
; CHECK-FAST-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-FAST-NEXT: retq
;
-; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mask1:
+; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mask0:
; CHECK-FAST-PERLANE: # %bb.0:
-; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; CHECK-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7]
+; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm2, %ymm2, %k1
-; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,2,1]
+; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,1]
; CHECK-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-FAST-PERLANE-NEXT: retq
- %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 4, i32 6, i32 1>
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 7, i32 6, i32 5>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
%res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
ret <4 x i64> %res
}
-define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i64> %mask) {
-; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask1:
+define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask0(<8 x i64> %vec, <4 x i64> %mask) {
+; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask0:
; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [6,4,6,1]
+; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [6,7,6,5]
; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1
; CHECK-FAST-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; CHECK-FAST-NEXT: retq
;
-; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask1:
+; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask0:
; CHECK-FAST-PERLANE: # %bb.0:
-; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7]
+; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1
-; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,2,1]
+; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,1]
; CHECK-FAST-PERLANE-NEXT: retq
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 7, i32 6, i32 5>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
+ ret <4 x i64> %res
+}
+define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
+; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm3 = [6,4,6,1]
+; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
+; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
+ %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 4, i32 6, i32 1>
+ %cmp = icmp eq <4 x i64> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
+ ret <4 x i64> %res
+}
+
+define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i64> %mask) {
+; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm2 = [6,4,6,1]
+; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
+; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; CHECK-NEXT: retq
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 4, i32 6, i32 1>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
%res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
ret <4 x i64> %res
}
define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
-; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mask2:
-; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
-; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [6,3,6,3]
-; CHECK-FAST-NEXT: vptestnmq %ymm2, %ymm2, %k1
-; CHECK-FAST-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1}
-; CHECK-FAST-NEXT: vmovdqa %ymm1, %ymm0
-; CHECK-FAST-NEXT: retq
-;
-; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mask2:
-; CHECK-FAST-PERLANE: # %bb.0:
-; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; CHECK-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7]
-; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm2, %ymm2, %k1
-; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,3]
-; CHECK-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm0
-; CHECK-FAST-PERLANE-NEXT: retq
+; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
+; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm3 = [6,3,6,3]
+; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
+; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 3, i32 6, i32 3>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
%res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
@@ -2034,21 +2019,13 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64
}
define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64> %mask) {
-; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask2:
-; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [6,3,6,3]
-; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1
-; CHECK-FAST-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
-; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; CHECK-FAST-NEXT: retq
-;
-; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask2:
-; CHECK-FAST-PERLANE: # %bb.0:
-; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7]
-; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1
-; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,3]
-; CHECK-FAST-PERLANE-NEXT: retq
+; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm2 = [6,3,6,3]
+; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
+; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; CHECK-NEXT: retq
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 3, i32 6, i32 3>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
%res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
@@ -2064,9 +2041,8 @@ define <4 x i64> @test_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec) {
;
; CHECK-FAST-PERLANE-LABEL: test_8xi64_to_4xi64_perm_mask3:
; CHECK-FAST-PERLANE: # %bb.0:
-; CHECK-FAST-PERLANE-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; CHECK-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; CHECK-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,0,3]
+; CHECK-FAST-PERLANE-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,6,7,4,5,6,7]
+; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,0,0,3]
; CHECK-FAST-PERLANE-NEXT: retq
%res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 0, i32 0, i32 7>
ret <4 x i64> %res
@@ -2083,8 +2059,7 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec, <4 x i64
;
; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mask3:
; CHECK-FAST-PERLANE: # %bb.0:
-; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; CHECK-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; CHECK-FAST-PERLANE-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,6,7,4,5,6,7]
; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm2, %ymm2, %k1
; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,0,3]
; CHECK-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm0
@@ -2106,8 +2081,7 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec, <4 x i
;
; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask3:
; CHECK-FAST-PERLANE: # %bb.0:
-; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; CHECK-FAST-PERLANE-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,6,7,4,5,6,7]
; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1
; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,0,3]
; CHECK-FAST-PERLANE-NEXT: retq
@@ -2117,23 +2091,14 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec, <4 x i
ret <4 x i64> %res
}
define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
-; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mask4:
-; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
-; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [3,7,7,5]
-; CHECK-FAST-NEXT: vptestnmq %ymm2, %ymm2, %k1
-; CHECK-FAST-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1}
-; CHECK-FAST-NEXT: vmovdqa %ymm1, %ymm0
-; CHECK-FAST-NEXT: retq
-;
-; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mask4:
-; CHECK-FAST-PERLANE: # %bb.0:
-; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; CHECK-FAST-PERLANE-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3]
-; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm2, %ymm2, %k1
-; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,3,3,1]
-; CHECK-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm0
-; CHECK-FAST-PERLANE-NEXT: retq
+; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
+; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm3 = [3,7,7,5]
+; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
+; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 3, i32 7, i32 7, i32 5>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
%res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
@@ -2141,21 +2106,13 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i64
}
define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i64> %mask) {
-; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask4:
-; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [3,7,7,5]
-; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1
-; CHECK-FAST-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
-; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; CHECK-FAST-NEXT: retq
-;
-; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask4:
-; CHECK-FAST-PERLANE: # %bb.0:
-; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-FAST-PERLANE-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
-; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1
-; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,3,1]
-; CHECK-FAST-PERLANE-NEXT: retq
+; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm2 = [3,7,7,5]
+; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
+; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; CHECK-NEXT: retq
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 3, i32 7, i32 7, i32 5>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
%res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
@@ -2190,40 +2147,24 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask5(<8 x i64> %vec, <4 x i
ret <4 x i64> %res
}
define <4 x i64> @test_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec) {
-; CHECK-FAST-LABEL: test_8xi64_to_4xi64_perm_mask6:
-; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [7,6,5,3]
-; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm1, %zmm0
-; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; CHECK-FAST-NEXT: retq
-;
-; CHECK-FAST-PERLANE-LABEL: test_8xi64_to_4xi64_perm_mask6:
-; CHECK-FAST-PERLANE: # %bb.0:
-; CHECK-FAST-PERLANE-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; CHECK-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,2,1,3]
-; CHECK-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; CHECK-FAST-PERLANE-NEXT: retq
+; CHECK-LABEL: test_8xi64_to_4xi64_perm_mask6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm1 = [7,6,5,3]
+; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; CHECK-NEXT: retq
%res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 6, i32 5, i32 3>
ret <4 x i64> %res
}
define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) {
-; CHECK-FAST-LABEL: test_masked_8xi64_to_4xi64_perm_mask6:
-; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
-; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [7,6,5,3]
-; CHECK-FAST-NEXT: vptestnmq %ymm2, %ymm2, %k1
-; CHECK-FAST-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1}
-; CHECK-FAST-NEXT: vmovdqa %ymm1, %ymm0
-; CHECK-FAST-NEXT: retq
-;
-; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_4xi64_perm_mask6:
-; CHECK-FAST-PERLANE: # %bb.0:
-; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[3,2,1,3]
-; CHECK-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
-; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm2, %ymm2, %k1
-; CHECK-FAST-PERLANE-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
-; CHECK-FAST-PERLANE-NEXT: retq
+; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
+; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm3 = [7,6,5,3]
+; CHECK-NEXT: vptestnmq %ymm2, %ymm2, %k1
+; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
+; CHECK-NEXT: retq
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 6, i32 5, i32 3>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
%res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2
@@ -2231,22 +2172,13 @@ define <4 x i64> @test_masked_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec, <4 x i64
}
define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec, <4 x i64> %mask) {
-; CHECK-FAST-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask6:
-; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [7,6,5,3]
-; CHECK-FAST-NEXT: vptestnmq %ymm1, %ymm1, %k1
-; CHECK-FAST-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
-; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; CHECK-FAST-NEXT: retq
-;
-; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask6:
-; CHECK-FAST-PERLANE: # %bb.0:
-; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[3,2,1,3]
-; CHECK-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
-; CHECK-FAST-PERLANE-NEXT: vptestnmq %ymm1, %ymm1, %k1
-; CHECK-FAST-PERLANE-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
-; CHECK-FAST-PERLANE-NEXT: retq
+; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm2 = [7,6,5,3]
+; CHECK-NEXT: vptestnmq %ymm1, %ymm1, %k1
+; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; CHECK-NEXT: retq
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 7, i32 6, i32 5, i32 3>
%cmp = icmp eq <4 x i64> %mask, zeroinitializer
%res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer
@@ -2319,29 +2251,48 @@ define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mask0(<8 x i64> %vec, <2 x i
}
define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mask1(<8 x i64> %vec, <2 x i64> %vec2, <2 x i64> %mask) {
-; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mask1:
-; CHECK: # %bb.0:
-; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
-; CHECK-NEXT: vptestnmq %xmm2, %xmm2, %k1
-; CHECK-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,1,2,3]
-; CHECK-NEXT: vmovdqa %xmm1, %xmm0
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; CHECK-FAST-LABEL: test_masked_8xi64_to_2xi64_perm_mask1:
+; CHECK-FAST: # %bb.0:
+; CHECK-FAST-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
+; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [6,5,6,7]
+; CHECK-FAST-NEXT: vptestnmq %xmm2, %xmm2, %k1
+; CHECK-FAST-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-FAST-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-FAST-NEXT: vzeroupper
+; CHECK-FAST-NEXT: retq
+;
+; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_2xi64_perm_mask1:
+; CHECK-FAST-PERLANE: # %bb.0:
+; CHECK-FAST-PERLANE-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
+; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; CHECK-FAST-PERLANE-NEXT: vptestnmq %xmm2, %xmm2, %k1
+; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,1,2,3]
+; CHECK-FAST-PERLANE-NEXT: vmovdqa %xmm1, %xmm0
+; CHECK-FAST-PERLANE-NEXT: vzeroupper
+; CHECK-FAST-PERLANE-NEXT: retq
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 5>
%cmp = icmp eq <2 x i64> %mask, zeroinitializer
%res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> %vec2
ret <2 x i64> %res
}
define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mask1(<8 x i64> %vec, <2 x i64> %mask) {
-; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mask1:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0
-; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
-; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,2,3]
-; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; CHECK-FAST-LABEL: test_masked_z_8xi64_to_2xi64_perm_mask1:
+; CHECK-FAST: # %bb.0:
+; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [6,5,6,7]
+; CHECK-FAST-NEXT: vptestnmq %xmm1, %xmm1, %k1
+; CHECK-FAST-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-FAST-NEXT: vzeroupper
+; CHECK-FAST-NEXT: retq
+;
+; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_2xi64_perm_mask1:
+; CHECK-FAST-PERLANE: # %bb.0:
+; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; CHECK-FAST-PERLANE-NEXT: vptestnmq %xmm1, %xmm1, %k1
+; CHECK-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,2,3]
+; CHECK-FAST-PERLANE-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-FAST-PERLANE-NEXT: vzeroupper
+; CHECK-FAST-PERLANE-NEXT: retq
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 5>
%cmp = icmp eq <2 x i64> %mask, zeroinitializer
%res = select <2 x i1> %cmp, <2 x i64> %shuf, <2 x i64> zeroinitializer
@@ -2709,41 +2660,27 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask7(ptr %vp, <4 x i64>
}
define <2 x i64> @test_8xi64_to_2xi64_perm_mem_mask0(ptr %vp) {
-; CHECK-FAST-LABEL: test_8xi64_to_2xi64_perm_mem_mask0:
-; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,1]
-; CHECK-FAST-NEXT: vpermpd (%rdi), %zmm0, %zmm0
-; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; CHECK-FAST-NEXT: vzeroupper
-; CHECK-FAST-NEXT: retq
-;
-; CHECK-FAST-PERLANE-LABEL: test_8xi64_to_2xi64_perm_mem_mask0:
-; CHECK-FAST-PERLANE: # %bb.0:
-; CHECK-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %xmm0
-; CHECK-FAST-PERLANE-NEXT: vblendps $12, (%rdi), %xmm0, %xmm0 # xmm0 = xmm0[0,1],mem[2,3]
-; CHECK-FAST-PERLANE-NEXT: retq
+; CHECK-LABEL: test_8xi64_to_2xi64_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,1]
+; CHECK-NEXT: vpermpd (%rdi), %zmm0, %zmm0
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%vec = load <8 x i64>, ptr %vp
%res = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1>
ret <2 x i64> %res
}
define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64> %vec2, <2 x i64> %mask) {
-; CHECK-FAST-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask0:
-; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} xmm2 = [4,1]
-; CHECK-FAST-NEXT: vptestnmq %xmm1, %xmm1, %k1
-; CHECK-FAST-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1}
-; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; CHECK-FAST-NEXT: vzeroupper
-; CHECK-FAST-NEXT: retq
-;
-; CHECK-FAST-PERLANE-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask0:
-; CHECK-FAST-PERLANE: # %bb.0:
-; CHECK-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm2
-; CHECK-FAST-PERLANE-NEXT: vpblendd $12, (%rdi), %xmm2, %xmm2 # xmm2 = xmm2[0,1],mem[2,3]
-; CHECK-FAST-PERLANE-NEXT: vptestnmq %xmm1, %xmm1, %k1
-; CHECK-FAST-PERLANE-NEXT: vmovdqa64 %xmm2, %xmm0 {%k1}
-; CHECK-FAST-PERLANE-NEXT: retq
+; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; CHECK-NEXT: vpmovsxbq {{.*#+}} xmm2 = [4,1]
+; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
+; CHECK-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%vec = load <8 x i64>, ptr %vp
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1>
%cmp = icmp eq <2 x i64> %mask, zeroinitializer
@@ -2752,22 +2689,14 @@ define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64> %
}
define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64> %mask) {
-; CHECK-FAST-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask0:
-; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} xmm1 = [4,1]
-; CHECK-FAST-NEXT: vptestnmq %xmm0, %xmm0, %k1
-; CHECK-FAST-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z}
-; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; CHECK-FAST-NEXT: vzeroupper
-; CHECK-FAST-NEXT: retq
-;
-; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask0:
-; CHECK-FAST-PERLANE: # %bb.0:
-; CHECK-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm1
-; CHECK-FAST-PERLANE-NEXT: vpblendd $12, (%rdi), %xmm1, %xmm1 # xmm1 = xmm1[0,1],mem[2,3]
-; CHECK-FAST-PERLANE-NEXT: vptestnmq %xmm0, %xmm0, %k1
-; CHECK-FAST-PERLANE-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
-; CHECK-FAST-PERLANE-NEXT: retq
+; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovsxbq {{.*#+}} xmm1 = [4,1]
+; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1
+; CHECK-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%vec = load <8 x i64>, ptr %vp
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 4, i32 1>
%cmp = icmp eq <2 x i64> %mask, zeroinitializer
@@ -2778,9 +2707,12 @@ define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask0(ptr %vp, <2 x i64>
define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask1(ptr %vp, <2 x i64> %vec2, <2 x i64> %mask) {
; CHECK-LABEL: test_masked_8xi64_to_2xi64_perm_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa 48(%rdi), %xmm2
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; CHECK-NEXT: vpmovsxbq {{.*#+}} xmm2 = [6,2]
; CHECK-NEXT: vptestnmq %xmm1, %xmm1, %k1
-; CHECK-NEXT: vpunpcklqdq 16(%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[0],mem[0]
+; CHECK-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%vec = load <8 x i64>, ptr %vp
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 2>
@@ -2792,9 +2724,11 @@ define <2 x i64> @test_masked_8xi64_to_2xi64_perm_mem_mask1(ptr %vp, <2 x i64> %
define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask1(ptr %vp, <2 x i64> %mask) {
; CHECK-LABEL: test_masked_z_8xi64_to_2xi64_perm_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovdqa 48(%rdi), %xmm1
+; CHECK-NEXT: vpmovsxbq {{.*#+}} xmm1 = [6,2]
; CHECK-NEXT: vptestnmq %xmm0, %xmm0, %k1
-; CHECK-NEXT: vpunpcklqdq 16(%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[0],mem[0]
+; CHECK-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%vec = load <8 x i64>, ptr %vp
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <2 x i32> <i32 6, i32 2>
@@ -2804,25 +2738,44 @@ define <2 x i64> @test_masked_z_8xi64_to_2xi64_perm_mem_mask1(ptr %vp, <2 x i64>
}
define <4 x float> @test_8xfloat_to_4xfloat_perm_mask0(<8 x float> %vec) {
-; CHECK-LABEL: test_8xfloat_to_4xfloat_perm_mask0:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[0,1]
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; CHECK-FAST-LABEL: test_8xfloat_to_4xfloat_perm_mask0:
+; CHECK-FAST: # %bb.0:
+; CHECK-FAST-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,3,4,5]
+; CHECK-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-FAST-NEXT: vzeroupper
+; CHECK-FAST-NEXT: retq
+;
+; CHECK-FAST-PERLANE-LABEL: test_8xfloat_to_4xfloat_perm_mask0:
+; CHECK-FAST-PERLANE: # %bb.0:
+; CHECK-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
+; CHECK-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[0,1]
+; CHECK-FAST-PERLANE-NEXT: vzeroupper
+; CHECK-FAST-PERLANE-NEXT: retq
%res = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 0, i32 3, i32 4, i32 5>
ret <4 x float> %res
}
define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask0(<8 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
-; CHECK-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask0:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3
-; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4
-; CHECK-NEXT: vcmpeqps %xmm4, %xmm2, %k1
-; CHECK-NEXT: vshufps {{.*#+}} xmm1 {%k1} = xmm0[0,3],xmm3[0,1]
-; CHECK-NEXT: vmovaps %xmm1, %xmm0
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; CHECK-FAST-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask0:
+; CHECK-FAST: # %bb.0:
+; CHECK-FAST-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
+; CHECK-FAST-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,3,4,5]
+; CHECK-FAST-NEXT: vxorps %xmm4, %xmm4, %xmm4
+; CHECK-FAST-NEXT: vcmpeqps %xmm4, %xmm2, %k1
+; CHECK-FAST-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1}
+; CHECK-FAST-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-FAST-NEXT: vzeroupper
+; CHECK-FAST-NEXT: retq
+;
+; CHECK-FAST-PERLANE-LABEL: test_masked_8xfloat_to_4xfloat_perm_mask0:
+; CHECK-FAST-PERLANE: # %bb.0:
+; CHECK-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm3
+; CHECK-FAST-PERLANE-NEXT: vxorps %xmm4, %xmm4, %xmm4
+; CHECK-FAST-PERLANE-NEXT: vcmpeqps %xmm4, %xmm2, %k1
+; CHECK-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 {%k1} = xmm0[0,3],xmm3[0,1]
+; CHECK-FAST-PERLANE-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-FAST-PERLANE-NEXT: vzeroupper
+; CHECK-FAST-PERLANE-NEXT: retq
%shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 0, i32 3, i32 4, i32 5>
%cmp = fcmp oeq <4 x float> %mask, zeroinitializer
%res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
@@ -2830,14 +2783,24 @@ define <4 x float> @test_masked_8xfloat_to_4xfloat_perm_mask0(<8 x float> %vec,
}
define <4 x float> @test_masked_z_8xfloat_to_4xfloat_perm_mask0(<8 x float> %vec, <4 x float> %mask) {
-; CHECK-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask0:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2
-; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
-; CHECK-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[0,3],xmm2[0,1]
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; CHECK-FAST-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask0:
+; CHECK-FAST: # %bb.0:
+; CHECK-FAST-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,3,4,5]
+; CHECK-FAST-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; CHECK-FAST-NEXT: vcmpeqps %xmm3, %xmm1, %k1
+; CHECK-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z}
+; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-FAST-NEXT: vzeroupper
+; CHECK-FAST-NEXT: retq
+;
+; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xfloat_to_4xfloat_perm_mask0:
+; CHECK-FAST-PERLANE: # %bb.0:
+; CHECK-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm2
+; CHECK-FAST-PERLANE-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; CHECK-FAST-PERLANE-NEXT: vcmpeqps %xmm3, %xmm1, %k1
+; CHECK-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 {%k1} {z} = xmm0[0,3],xmm2[0,1]
+; CHECK-FAST-PERLANE-NEXT: vzeroupper
+; CHECK-FAST-PERLANE-NEXT: retq
%shuf = shufflevector <8 x float> %vec, <8 x float> undef, <4 x i32> <i32 0, i32 3, i32 4, i32 5>
%cmp = fcmp oeq <4 x float> %mask, zeroinitializer
%res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
@@ -3298,10 +3261,42 @@ define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask0(<16 x float> %v
ret <4 x float> %res
}
define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
-; CHECK-FAST-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask1:
+; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
+; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [8,6,10,6]
+; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqps %xmm4, %xmm2, %k1
+; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 6, i32 10, i32 6>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
+ ret <4 x float> %res
+}
+
+define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec, <4 x float> %mask) {
+; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [8,6,10,6]
+; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
+; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 6, i32 10, i32 6>
+ %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
+ %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
+ ret <4 x float> %res
+}
+define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask2(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
+; CHECK-FAST-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask2:
; CHECK-FAST: # %bb.0:
; CHECK-FAST-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; CHECK-FAST-NEXT: vpmovsxbd {{.*#+}} xmm3 = [8,6,10,6]
+; CHECK-FAST-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm3
; CHECK-FAST-NEXT: vxorps %xmm4, %xmm4, %xmm4
; CHECK-FAST-NEXT: vcmpeqps %xmm4, %xmm2, %k1
; CHECK-FAST-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1}
@@ -3309,27 +3304,26 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec
; CHECK-FAST-NEXT: vzeroupper
; CHECK-FAST-NEXT: retq
;
-; CHECK-FAST-PERLANE-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask1:
+; CHECK-FAST-PERLANE-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask2:
; CHECK-FAST-PERLANE: # %bb.0:
-; CHECK-FAST-PERLANE-NEXT: vextractf32x4 $2, %zmm0, %xmm3
-; CHECK-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm0
-; CHECK-FAST-PERLANE-NEXT: vpmovsxbd {{.*#+}} xmm4 = [0,6,2,6]
-; CHECK-FAST-PERLANE-NEXT: vpermi2ps %xmm0, %xmm3, %xmm4
-; CHECK-FAST-PERLANE-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; CHECK-FAST-PERLANE-NEXT: vcmpeqps %xmm0, %xmm2, %k1
-; CHECK-FAST-PERLANE-NEXT: vblendmps %xmm4, %xmm1, %xmm0 {%k1}
+; CHECK-FAST-PERLANE-NEXT: vextractf64x4 $1, %zmm0, %ymm3
+; CHECK-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,0],ymm0[0,1],ymm3[4,4],ymm0[4,5]
+; CHECK-FAST-PERLANE-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; CHECK-FAST-PERLANE-NEXT: vcmpeqps %xmm3, %xmm2, %k1
+; CHECK-FAST-PERLANE-NEXT: vextractf32x4 $1, %ymm0, %xmm1 {%k1}
+; CHECK-FAST-PERLANE-NEXT: vmovaps %xmm1, %xmm0
; CHECK-FAST-PERLANE-NEXT: vzeroupper
; CHECK-FAST-PERLANE-NEXT: retq
- %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 6, i32 10, i32 6>
+ %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 12, i32 12, i32 4, i32 5>
%cmp = fcmp oeq <4 x float> %mask, zeroinitializer
%res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
ret <4 x float> %res
}
-define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask1(<16 x float> %vec, <4 x float> %mask) {
-; CHECK-FAST-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask1:
+define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask2(<16 x float> %vec, <4 x float> %mask) {
+; CHECK-FAST-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask2:
; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: vpmovsxbd {{.*#+}} xmm2 = [8,6,10,6]
+; CHECK-FAST-NEXT: vmovaps {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm2
; CHECK-FAST-NEXT: vxorps %xmm3, %xmm3, %xmm3
; CHECK-FAST-NEXT: vcmpeqps %xmm3, %xmm1, %k1
; CHECK-FAST-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z}
@@ -3337,48 +3331,15 @@ define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask1(<16 x float> %v
; CHECK-FAST-NEXT: vzeroupper
; CHECK-FAST-NEXT: retq
;
-; CHECK-FAST-PERLANE-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask1:
+; CHECK-FAST-PERLANE-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask2:
; CHECK-FAST-PERLANE: # %bb.0:
-; CHECK-FAST-PERLANE-NEXT: vextractf32x4 $2, %zmm0, %xmm2
-; CHECK-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm3
-; CHECK-FAST-PERLANE-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,6,2,6]
-; CHECK-FAST-PERLANE-NEXT: vxorps %xmm4, %xmm4, %xmm4
-; CHECK-FAST-PERLANE-NEXT: vcmpeqps %xmm4, %xmm1, %k1
-; CHECK-FAST-PERLANE-NEXT: vpermi2ps %xmm3, %xmm2, %xmm0 {%k1} {z}
+; CHECK-FAST-PERLANE-NEXT: vextractf64x4 $1, %zmm0, %ymm2
+; CHECK-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,0],ymm0[0,1],ymm2[4,4],ymm0[4,5]
+; CHECK-FAST-PERLANE-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; CHECK-FAST-PERLANE-NEXT: vcmpeqps %xmm2, %xmm1, %k1
+; CHECK-FAST-PERLANE-NEXT: vextractf32x4 $1, %ymm0, %xmm0 {%k1} {z}
; CHECK-FAST-PERLANE-NEXT: vzeroupper
; CHECK-FAST-PERLANE-NEXT: retq
- %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 8, i32 6, i32 10, i32 6>
- %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
- %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
- ret <4 x float> %res
-}
-define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask2(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) {
-; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask2:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3
-; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,0],ymm0[0,1],ymm3[4,4],ymm0[4,5]
-; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1
-; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm1 {%k1}
-; CHECK-NEXT: vmovaps %xmm1, %xmm0
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
- %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 12, i32 12, i32 4, i32 5>
- %cmp = fcmp oeq <4 x float> %mask, zeroinitializer
- %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec2
- ret <4 x float> %res
-}
-
-define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask2(<16 x float> %vec, <4 x float> %mask) {
-; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask2:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,0],ymm0[0,1],ymm2[4,4],ymm0[4,5]
-; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
-; CHECK-NEXT: vextractf32x4 $1, %ymm0, %xmm0 {%k1} {z}
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
%shuf = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> <i32 12, i32 12, i32 4, i32 5>
%cmp = fcmp oeq <4 x float> %mask, zeroinitializer
%res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer
@@ -3604,9 +3565,9 @@ define <8 x float> @test_masked_z_16xfloat_to_8xfloat_perm_mem_mask3(ptr %vp, <8
define <4 x float> @test_16xfloat_to_4xfloat_perm_mem_mask0(ptr %vp) {
; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpermpd $231, 32(%rdi), %ymm1 # ymm1 = mem[3,1,2,3]
-; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,6,7,3]
-; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm1, %xmm0
+; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm0 = [14,6,7,11]
+; CHECK-NEXT: vpermps (%rdi), %zmm0, %zmm0
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%vec = load <16 x float>, ptr %vp
@@ -3616,12 +3577,12 @@ define <4 x float> @test_16xfloat_to_4xfloat_perm_mem_mask0(ptr %vp) {
define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask0(ptr %vp, <4 x float> %vec2, <4 x float> %mask) {
; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpermpd $231, 32(%rdi), %ymm2 # ymm2 = mem[3,1,2,3]
-; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,6,7,3]
-; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm3
-; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1
-; CHECK-NEXT: vmovaps %xmm3, %xmm0 {%k1}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm2 = [14,6,7,11]
+; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqps %xmm3, %xmm1, %k1
+; CHECK-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%vec = load <16 x float>, ptr %vp
@@ -3634,12 +3595,11 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mem_mask0(ptr %vp, <4 x
define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mem_mask0(ptr %vp, <4 x float> %mask) {
; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mem_mask0:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpermpd $231, 32(%rdi), %ymm2 # ymm2 = mem[3,1,2,3]
-; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,6,7,3]
-; CHECK-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqps %xmm3, %xmm0, %k1
-; CHECK-NEXT: vpermi2ps 16(%rdi), %xmm2, %xmm1 {%k1} {z}
-; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [14,6,7,11]
+; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqps %xmm2, %xmm0, %k1
+; CHECK-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%vec = load <16 x float>, ptr %vp
@@ -4045,25 +4005,15 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask3(<8 x double>
ret <4 x double> %res
}
define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask4(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
-; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask4:
-; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} xmm3 = [1,5]
-; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm3, %zmm0
-; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
-; CHECK-FAST-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[0,0,1,1]
-; CHECK-FAST-NEXT: vmovapd %ymm1, %ymm0
-; CHECK-FAST-NEXT: retq
-;
-; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask4:
-; CHECK-FAST-PERLANE: # %bb.0:
-; CHECK-FAST-PERLANE-NEXT: vextractf32x4 $2, %zmm0, %xmm3
-; CHECK-FAST-PERLANE-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1]
-; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm3, %ymm2, %k1
-; CHECK-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[0,0,1,1]
-; CHECK-FAST-PERLANE-NEXT: vmovapd %ymm1, %ymm0
-; CHECK-FAST-PERLANE-NEXT: retq
+; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
+; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,1,5,5]
+; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %ymm4, %ymm2, %k1
+; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-NEXT: retq
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
%res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
@@ -4071,23 +4021,14 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask4(<8 x double> %v
}
define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask4(<8 x double> %vec, <4 x double> %mask) {
-; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask4:
-; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} xmm2 = [1,5]
-; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm2, %zmm0
-; CHECK-FAST-NEXT: vxorpd %xmm2, %xmm2, %xmm2
-; CHECK-FAST-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
-; CHECK-FAST-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,1,1]
-; CHECK-FAST-NEXT: retq
-;
-; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask4:
-; CHECK-FAST-PERLANE: # %bb.0:
-; CHECK-FAST-PERLANE-NEXT: vextractf32x4 $2, %zmm0, %xmm2
-; CHECK-FAST-PERLANE-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
-; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm2, %xmm2, %xmm2
-; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm2, %ymm1, %k1
-; CHECK-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,0,1,1]
-; CHECK-FAST-PERLANE-NEXT: retq
+; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,1,5,5]
+; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
+; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; CHECK-NEXT: retq
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 1, i32 1, i32 5, i32 5>
%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
%res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
@@ -4124,42 +4065,25 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask5(<8 x double>
ret <4 x double> %res
}
define <4 x double> @test_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec) {
-; CHECK-FAST-LABEL: test_8xdouble_to_4xdouble_perm_mask6:
-; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [5,0,7,0]
-; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm1, %zmm0
-; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; CHECK-FAST-NEXT: retq
-;
-; CHECK-FAST-PERLANE-LABEL: test_8xdouble_to_4xdouble_perm_mask6:
-; CHECK-FAST-PERLANE: # %bb.0:
-; CHECK-FAST-PERLANE-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; CHECK-FAST-PERLANE-NEXT: vbroadcastsd %xmm0, %ymm0
-; CHECK-FAST-PERLANE-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; CHECK-FAST-PERLANE-NEXT: retq
+; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mask6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm1 = [5,0,7,0]
+; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; CHECK-NEXT: retq
%res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 5, i32 0, i32 7, i32 0>
ret <4 x double> %res
}
define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
-; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask6:
-; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
-; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [5,0,7,0]
-; CHECK-FAST-NEXT: vxorpd %xmm4, %xmm4, %xmm4
-; CHECK-FAST-NEXT: vcmpeqpd %ymm4, %ymm2, %k1
-; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1}
-; CHECK-FAST-NEXT: vmovapd %ymm1, %ymm0
-; CHECK-FAST-NEXT: retq
-;
-; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask6:
-; CHECK-FAST-PERLANE: # %bb.0:
-; CHECK-FAST-PERLANE-NEXT: vextractf64x4 $1, %zmm0, %ymm3
-; CHECK-FAST-PERLANE-NEXT: vbroadcastsd %xmm0, %ymm0
-; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm4, %xmm4, %xmm4
-; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm4, %ymm2, %k1
-; CHECK-FAST-PERLANE-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm3[1],ymm0[1],ymm3[3],ymm0[3]
-; CHECK-FAST-PERLANE-NEXT: vmovapd %ymm1, %ymm0
-; CHECK-FAST-PERLANE-NEXT: retq
+; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
+; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm3 = [5,0,7,0]
+; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %ymm4, %ymm2, %k1
+; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-NEXT: retq
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 5, i32 0, i32 7, i32 0>
%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
%res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
@@ -4167,48 +4091,29 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask6(<8 x double> %v
}
define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec, <4 x double> %mask) {
-; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask6:
-; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [5,0,7,0]
-; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
-; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
-; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; CHECK-FAST-NEXT: retq
-;
-; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask6:
-; CHECK-FAST-PERLANE: # %bb.0:
-; CHECK-FAST-PERLANE-NEXT: vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-FAST-PERLANE-NEXT: vbroadcastsd %xmm0, %ymm0
-; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
-; CHECK-FAST-PERLANE-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm2[1],ymm0[1],ymm2[3],ymm0[3]
-; CHECK-FAST-PERLANE-NEXT: retq
+; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask6:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm2 = [5,0,7,0]
+; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
+; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; CHECK-NEXT: retq
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 5, i32 0, i32 7, i32 0>
%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
%res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
ret <4 x double> %res
}
define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask7(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) {
-; CHECK-FAST-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask7:
-; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
-; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm3 = [3,5,0,6]
-; CHECK-FAST-NEXT: vxorpd %xmm4, %xmm4, %xmm4
-; CHECK-FAST-NEXT: vcmpeqpd %ymm4, %ymm2, %k1
-; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1}
-; CHECK-FAST-NEXT: vmovapd %ymm1, %ymm0
-; CHECK-FAST-NEXT: retq
-;
-; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask7:
-; CHECK-FAST-PERLANE: # %bb.0:
-; CHECK-FAST-PERLANE-NEXT: vextractf64x4 $1, %zmm0, %ymm3
-; CHECK-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,0,3]
-; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm4, %xmm4, %xmm4
-; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm4, %ymm2, %k1
-; CHECK-FAST-PERLANE-NEXT: vshufpd {{.*#+}} ymm1 {%k1} = ymm0[1],ymm3[1],ymm0[2],ymm3[2]
-; CHECK-FAST-PERLANE-NEXT: vmovapd %ymm1, %ymm0
-; CHECK-FAST-PERLANE-NEXT: retq
+; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
+; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm3 = [3,5,0,6]
+; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %ymm4, %ymm2, %k1
+; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
+; CHECK-NEXT: retq
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 5, i32 0, i32 6>
%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
%res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2
@@ -4216,67 +4121,41 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask7(<8 x double> %v
}
define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask7(<8 x double> %vec, <4 x double> %mask) {
-; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask7:
-; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} ymm2 = [3,5,0,6]
-; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-FAST-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
-; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
-; CHECK-FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; CHECK-FAST-NEXT: retq
-;
-; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask7:
-; CHECK-FAST-PERLANE: # %bb.0:
-; CHECK-FAST-PERLANE-NEXT: vextractf64x4 $1, %zmm0, %ymm2
-; CHECK-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,0,3]
-; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
-; CHECK-FAST-PERLANE-NEXT: vshufpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm2[1],ymm0[2],ymm2[2]
-; CHECK-FAST-PERLANE-NEXT: retq
+; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovsxbq {{.*#+}} ymm2 = [3,5,0,6]
+; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1
+; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; CHECK-NEXT: retq
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 3, i32 5, i32 0, i32 6>
%cmp = fcmp oeq <4 x double> %mask, zeroinitializer
%res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer
ret <4 x double> %res
}
define <2 x double> @test_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec) {
-; CHECK-FAST-LABEL: test_8xdouble_to_2xdouble_perm_mask0:
-; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} xmm1 = [0,6]
-; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm1, %zmm0
-; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; CHECK-FAST-NEXT: vzeroupper
-; CHECK-FAST-NEXT: retq
-;
-; CHECK-FAST-PERLANE-LABEL: test_8xdouble_to_2xdouble_perm_mask0:
-; CHECK-FAST-PERLANE: # %bb.0:
-; CHECK-FAST-PERLANE-NEXT: vextractf32x4 $3, %zmm0, %xmm1
-; CHECK-FAST-PERLANE-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; CHECK-FAST-PERLANE-NEXT: vzeroupper
-; CHECK-FAST-PERLANE-NEXT: retq
+; CHECK-LABEL: test_8xdouble_to_2xdouble_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovsxbq {{.*#+}} xmm1 = [0,6]
+; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%res = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 0, i32 6>
ret <2 x double> %res
}
define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec, <2 x double> %vec2, <2 x double> %mask) {
-; CHECK-FAST-LABEL: test_masked_8xdouble_to_2xdouble_perm_mask0:
-; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} xmm3 = [0,6]
-; CHECK-FAST-NEXT: vxorpd %xmm4, %xmm4, %xmm4
-; CHECK-FAST-NEXT: vcmpeqpd %xmm4, %xmm2, %k1
-; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1}
-; CHECK-FAST-NEXT: vmovapd %xmm1, %xmm0
-; CHECK-FAST-NEXT: vzeroupper
-; CHECK-FAST-NEXT: retq
-;
-; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_2xdouble_perm_mask0:
-; CHECK-FAST-PERLANE: # %bb.0:
-; CHECK-FAST-PERLANE-NEXT: vextractf32x4 $3, %zmm0, %xmm3
-; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm4, %xmm4, %xmm4
-; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %xmm4, %xmm2, %k1
-; CHECK-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],xmm3[0]
-; CHECK-FAST-PERLANE-NEXT: vmovapd %xmm1, %xmm0
-; CHECK-FAST-PERLANE-NEXT: vzeroupper
-; CHECK-FAST-PERLANE-NEXT: retq
+; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
+; CHECK-NEXT: vpmovsxbq {{.*#+}} xmm3 = [0,6]
+; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
+; CHECK-NEXT: vcmpeqpd %xmm4, %xmm2, %k1
+; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1}
+; CHECK-NEXT: vmovapd %xmm1, %xmm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 0, i32 6>
%cmp = fcmp oeq <2 x double> %mask, zeroinitializer
%res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec2
@@ -4284,24 +4163,15 @@ define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mask0(<8 x double> %v
}
define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec, <2 x double> %mask) {
-; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mask0:
-; CHECK-FAST: # %bb.0:
-; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} xmm2 = [0,6]
-; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-FAST-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
-; CHECK-FAST-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
-; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; CHECK-FAST-NEXT: vzeroupper
-; CHECK-FAST-NEXT: retq
-;
-; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mask0:
-; CHECK-FAST-PERLANE: # %bb.0:
-; CHECK-FAST-PERLANE-NEXT: vextractf32x4 $3, %zmm0, %xmm2
-; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
-; CHECK-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm2[0]
-; CHECK-FAST-PERLANE-NEXT: vzeroupper
-; CHECK-FAST-PERLANE-NEXT: retq
+; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mask0:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovsxbq {{.*#+}} xmm2 = [0,6]
+; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
+; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z}
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 0, i32 6>
%cmp = fcmp oeq <2 x double> %mask, zeroinitializer
%res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer
@@ -4668,23 +4538,42 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask7(ptr %vp,
}
define <2 x double> @test_8xdouble_to_2xdouble_perm_mem_mask0(ptr %vp) {
-; CHECK-LABEL: test_8xdouble_to_2xdouble_perm_mem_mask0:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd (%rdi), %xmm0
-; CHECK-NEXT: vshufpd $1, 48(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[1],mem[0]
-; CHECK-NEXT: retq
+; CHECK-FAST-LABEL: test_8xdouble_to_2xdouble_perm_mem_mask0:
+; CHECK-FAST: # %bb.0:
+; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} xmm0 = [1,6]
+; CHECK-FAST-NEXT: vpermpd (%rdi), %zmm0, %zmm0
+; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-FAST-NEXT: vzeroupper
+; CHECK-FAST-NEXT: retq
+;
+; CHECK-FAST-PERLANE-LABEL: test_8xdouble_to_2xdouble_perm_mem_mask0:
+; CHECK-FAST-PERLANE: # %bb.0:
+; CHECK-FAST-PERLANE-NEXT: vmovapd (%rdi), %xmm0
+; CHECK-FAST-PERLANE-NEXT: vshufpd $1, 48(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[1],mem[0]
+; CHECK-FAST-PERLANE-NEXT: retq
%vec = load <8 x double>, ptr %vp
%res = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 6>
ret <2 x double> %res
}
define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask0(ptr %vp, <2 x double> %vec2, <2 x double> %mask) {
-; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mem_mask0:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd (%rdi), %xmm2
-; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
-; CHECK-NEXT: vshufpd $1, 48(%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[1],mem[0]
-; CHECK-NEXT: retq
+; CHECK-FAST-LABEL: test_masked_8xdouble_to_2xdouble_perm_mem_mask0:
+; CHECK-FAST: # %bb.0:
+; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} xmm2 = [1,6]
+; CHECK-FAST-NEXT: vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-FAST-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
+; CHECK-FAST-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1}
+; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-FAST-NEXT: vzeroupper
+; CHECK-FAST-NEXT: retq
+;
+; CHECK-FAST-PERLANE-LABEL: test_masked_8xdouble_to_2xdouble_perm_mem_mask0:
+; CHECK-FAST-PERLANE: # %bb.0:
+; CHECK-FAST-PERLANE-NEXT: vmovapd (%rdi), %xmm2
+; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
+; CHECK-FAST-PERLANE-NEXT: vshufpd $1, 48(%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[1],mem[0]
+; CHECK-FAST-PERLANE-NEXT: retq
%vec = load <8 x double>, ptr %vp
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 6>
%cmp = fcmp oeq <2 x double> %mask, zeroinitializer
@@ -4693,13 +4582,23 @@ define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask0(ptr %vp, <2
}
define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mem_mask0(ptr %vp, <2 x double> %mask) {
-; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mem_mask0:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd (%rdi), %xmm1
-; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpeqpd %xmm2, %xmm0, %k1
-; CHECK-NEXT: vshufpd $1, 48(%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[1],mem[0]
-; CHECK-NEXT: retq
+; CHECK-FAST-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mem_mask0:
+; CHECK-FAST: # %bb.0:
+; CHECK-FAST-NEXT: vpmovsxbq {{.*#+}} xmm1 = [1,6]
+; CHECK-FAST-NEXT: vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-FAST-NEXT: vcmpeqpd %xmm2, %xmm0, %k1
+; CHECK-FAST-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; CHECK-FAST-NEXT: vzeroupper
+; CHECK-FAST-NEXT: retq
+;
+; CHECK-FAST-PERLANE-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mem_mask0:
+; CHECK-FAST-PERLANE: # %bb.0:
+; CHECK-FAST-PERLANE-NEXT: vmovapd (%rdi), %xmm1
+; CHECK-FAST-PERLANE-NEXT: vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-FAST-PERLANE-NEXT: vcmpeqpd %xmm2, %xmm0, %k1
+; CHECK-FAST-PERLANE-NEXT: vshufpd $1, 48(%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[1],mem[0]
+; CHECK-FAST-PERLANE-NEXT: retq
%vec = load <8 x double>, ptr %vp
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 6>
%cmp = fcmp oeq <2 x double> %mask, zeroinitializer
@@ -4710,10 +4609,13 @@ define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mem_mask0(ptr %vp,
define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask1(ptr %vp, <2 x double> %vec2, <2 x double> %mask) {
; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mem_mask1:
; CHECK: # %bb.0:
-; CHECK-NEXT: vmovddup 8(%rdi), %xmm2 # xmm2 = mem[0,0]
-; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1
-; CHECK-NEXT: vunpcklpd 32(%rdi), %xmm2, %xmm0 {%k1} # xmm0 {%k1} = xmm2[0],mem[0]
+; CHECK-NEXT: vpmovsxbq {{.*#+}} xmm2 = [0,12]
+; CHECK-NEXT: vmovddup 8(%rdi), %xmm3 # xmm3 = mem[0,0]
+; CHECK-NEXT: vpermt2pd (%rdi), %zmm2, %zmm3
+; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vcmpeqpd %xmm2, %xmm1, %k1
+; CHECK-NEXT: vmovapd %xmm3, %xmm0 {%k1}
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%vec = load <8 x double>, ptr %vp
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 4>
@@ -4725,10 +4627,13 @@ define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mem_mask1(ptr %vp, <2
define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mem_mask1(ptr %vp, <2 x double> %mask) {
; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mem_mask1:
; CHECK: # %bb.0:
+; CHECK-NEXT: vpmovsxbq {{.*#+}} xmm2 = [0,12]
; CHECK-NEXT: vmovddup 8(%rdi), %xmm1 # xmm1 = mem[0,0]
-; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vcmpeqpd %xmm2, %xmm0, %k1
-; CHECK-NEXT: vunpcklpd 32(%rdi), %xmm1, %xmm0 {%k1} {z} # xmm0 {%k1} {z} = xmm1[0],mem[0]
+; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vcmpeqpd %xmm3, %xmm0, %k1
+; CHECK-NEXT: vpermt2pd (%rdi), %zmm2, %zmm1 {%k1} {z}
+; CHECK-NEXT: vmovapd %xmm1, %xmm0
+; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%vec = load <8 x double>, ptr %vp
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> <i32 1, i32 4>
diff --git a/llvm/test/CodeGen/X86/bfloat.ll b/llvm/test/CodeGen/X86/bfloat.ll
index 8449107f39e91..9eb92b297777f 100644
--- a/llvm/test/CodeGen/X86/bfloat.ll
+++ b/llvm/test/CodeGen/X86/bfloat.ll
@@ -1548,14 +1548,15 @@ define <8 x bfloat> @fptrunc_v8f64(<8 x double> %a) nounwind {
; X86: # %bb.0:
; X86-NEXT: subl $204, %esp
; X86-NEXT: vmovups %zmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 64-byte Spill
-; X86-NEXT: vextractf128 $1, %ymm0, %xmm0
-; X86-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT: vmovlps %xmm0, (%esp)
+; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-NEXT: vmovlps %xmm1, (%esp)
; X86-NEXT: vzeroupper
; X86-NEXT: calll __truncdfbf2
; X86-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-NEXT: vmovhps %xmm0, (%esp)
+; X86-NEXT: vpermpd $235, {{[-0-9]+}}(%e{{[sb]}}p), %zmm0 # 64-byte Folded Reload
+; X86-NEXT: # zmm0 = mem[3,2,2,3,7,6,6,7]
+; X86-NEXT: vmovlps %xmm0, (%esp)
+; X86-NEXT: vzeroupper
; X86-NEXT: calll __truncdfbf2
; X86-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -1672,20 +1673,22 @@ define <8 x bfloat> @fptrunc_v8f64(<8 x double> %a) nounwind {
; FP16: # %bb.0:
; FP16-NEXT: subq $184, %rsp
; FP16-NEXT: vmovupd %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; FP16-NEXT: vextractf128 $1, %ymm0, %xmm0
-; FP16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; FP16-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; FP16-NEXT: vzeroupper
; FP16-NEXT: callq __truncdfbf2 at PLT
; FP16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; FP16-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; FP16-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; FP16-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; FP16-NEXT: vzeroupper
; FP16-NEXT: callq __truncdfbf2 at PLT
; FP16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; FP16-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; FP16-NEXT: # xmm0 = mem[1,0]
-; FP16-NEXT: callq __truncdfbf2 at PLT
-; FP16-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; FP16-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; FP16-NEXT: vextractf128 $1, %ymm0, %xmm0
+; FP16-NEXT: vzeroupper
+; FP16-NEXT: callq __truncdfbf2 at PLT
+; FP16-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; FP16-NEXT: vpermpd $235, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
+; FP16-NEXT: # zmm0 = mem[3,2,2,3,7,6,6,7]
; FP16-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; FP16-NEXT: vzeroupper
; FP16-NEXT: callq __truncdfbf2 at PLT
@@ -1721,7 +1724,7 @@ define <8 x bfloat> @fptrunc_v8f64(<8 x double> %a) nounwind {
; FP16-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; FP16-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
; FP16-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
-; FP16-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; FP16-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; FP16-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; FP16-NEXT: addq $184, %rsp
; FP16-NEXT: retq
@@ -1745,13 +1748,12 @@ define <8 x bfloat> @fptrunc_v8f64(<8 x double> %a) nounwind {
; AVXNC-NEXT: # xmm0 = mem[1,0]
; AVXNC-NEXT: callq __truncdfbf2 at PLT
; AVXNC-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVXNC-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVXNC-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVXNC-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVXNC-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVXNC-NEXT: vpermpd $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; AVXNC-NEXT: # ymm0 = mem[3,2,2,3]
+; AVXNC-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVXNC-NEXT: vzeroupper
; AVXNC-NEXT: callq __truncdfbf2 at PLT
-; AVXNC-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVXNC-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVXNC-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVXNC-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVXNC-NEXT: vzeroupper
@@ -1763,15 +1765,16 @@ define <8 x bfloat> @fptrunc_v8f64(<8 x double> %a) nounwind {
; AVXNC-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVXNC-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVXNC-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVXNC-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVXNC-NEXT: vzeroupper
; AVXNC-NEXT: callq __truncdfbf2 at PLT
-; AVXNC-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVXNC-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVXNC-NEXT: # xmm0 = mem[1,0]
+; AVXNC-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVXNC-NEXT: vpermpd $235, (%rsp), %ymm0 # 32-byte Folded Reload
+; AVXNC-NEXT: # ymm0 = mem[3,2,2,3]
+; AVXNC-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVXNC-NEXT: vzeroupper
; AVXNC-NEXT: callq __truncdfbf2 at PLT
; AVXNC-NEXT: vpextrw $0, %xmm0, %ebx
-; AVXNC-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
+; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVXNC-NEXT: vpextrw $0, %xmm0, %ebp
; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVXNC-NEXT: vpextrw $0, %xmm0, %r14d
@@ -1781,7 +1784,9 @@ define <8 x bfloat> @fptrunc_v8f64(<8 x double> %a) nounwind {
; AVXNC-NEXT: vpextrw $0, %xmm0, %r12d
; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVXNC-NEXT: vpextrw $0, %xmm0, %r13d
-; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVXNC-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVXNC-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVXNC-NEXT: vzeroupper
; AVXNC-NEXT: callq __truncdfbf2 at PLT
; AVXNC-NEXT: vpextrw $0, %xmm0, %eax
; AVXNC-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
diff --git a/llvm/test/CodeGen/X86/combine-pmuldq.ll b/llvm/test/CodeGen/X86/combine-pmuldq.ll
index 0748ca626bcf8..c100f15a68ca2 100644
--- a/llvm/test/CodeGen/X86/combine-pmuldq.ll
+++ b/llvm/test/CodeGen/X86/combine-pmuldq.ll
@@ -389,17 +389,18 @@ define <8 x i32> @PR49658_zext(ptr %ptr, i32 %mul) {
; AVX2-NEXT: vpbroadcastq %xmm0, %ymm1
; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX2-NEXT: movq $-2097152, %rax # imm = 0xFFE00000
+; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,3,5,7]
; AVX2-NEXT: .p2align 4
; AVX2-NEXT: .LBB7_1: # %loop
; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; AVX2-NEXT: vpmuludq %ymm4, %ymm1, %ymm4
; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm3
-; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm2
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm3[2,3],ymm2[2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
-; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,3],ymm4[1,3],ymm2[5,7],ymm4[5,7]
-; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpermd %ymm3, %ymm2, %ymm3
+; AVX2-NEXT: vpermd %ymm4, %ymm2, %ymm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: vpaddd %ymm0, %ymm3, %ymm0
; AVX2-NEXT: subq $-128, %rax
; AVX2-NEXT: jne .LBB7_1
; AVX2-NEXT: # %bb.2: # %end
@@ -565,17 +566,18 @@ define <8 x i32> @PR49658_sext(ptr %ptr, i32 %mul) {
; AVX2-NEXT: vpbroadcastq %xmm0, %ymm1
; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX2-NEXT: movq $-2097152, %rax # imm = 0xFFE00000
+; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,3,5,7]
; AVX2-NEXT: .p2align 4
; AVX2-NEXT: .LBB8_1: # %loop
; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX2-NEXT: vpmovsxdq 2097168(%rdi,%rax), %ymm2
; AVX2-NEXT: vpmovsxdq 2097152(%rdi,%rax), %ymm3
+; AVX2-NEXT: vpmovsxdq 2097168(%rdi,%rax), %ymm4
+; AVX2-NEXT: vpmuldq %ymm4, %ymm1, %ymm4
; AVX2-NEXT: vpmuldq %ymm3, %ymm1, %ymm3
-; AVX2-NEXT: vpmuldq %ymm2, %ymm1, %ymm2
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm3[2,3],ymm2[2,3]
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
-; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,3],ymm4[1,3],ymm2[5,7],ymm4[5,7]
-; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpermd %ymm3, %ymm2, %ymm3
+; AVX2-NEXT: vpermd %ymm4, %ymm2, %ymm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: vpaddd %ymm0, %ymm3, %ymm0
; AVX2-NEXT: subq $-128, %rax
; AVX2-NEXT: jne .LBB8_1
; AVX2-NEXT: # %bb.2: # %end
diff --git a/llvm/test/CodeGen/X86/combine-sra.ll b/llvm/test/CodeGen/X86/combine-sra.ll
index c982884314f62..c7ce895853f46 100644
--- a/llvm/test/CodeGen/X86/combine-sra.ll
+++ b/llvm/test/CodeGen/X86/combine-sra.ll
@@ -288,29 +288,13 @@ define <4 x i32> @combine_vec_ashr_trunc_lshr(<4 x i64> %x) {
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
; SSE41-NEXT: retq
;
-; AVX2-SLOW-LABEL: combine_vec_ashr_trunc_lshr:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
-; AVX2-SLOW-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-ALL-LABEL: combine_vec_ashr_trunc_lshr:
-; AVX2-FAST-ALL: # %bb.0:
-; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} xmm1 = [1,3,5,7]
-; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-FAST-ALL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-FAST-ALL-NEXT: vzeroupper
-; AVX2-FAST-ALL-NEXT: retq
-;
-; AVX2-FAST-PERLANE-LABEL: combine_vec_ashr_trunc_lshr:
-; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
-; AVX2-FAST-PERLANE-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-FAST-PERLANE-NEXT: vzeroupper
-; AVX2-FAST-PERLANE-NEXT: retq
+; AVX2-LABEL: combine_vec_ashr_trunc_lshr:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm1 = [1,3,5,7]
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
;
; AVX512-LABEL: combine_vec_ashr_trunc_lshr:
; AVX512: # %bb.0:
@@ -389,29 +373,13 @@ define <4 x i32> @combine_vec_ashr_trunc_ashr(<4 x i64> %x) {
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
; SSE41-NEXT: retq
;
-; AVX2-SLOW-LABEL: combine_vec_ashr_trunc_ashr:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
-; AVX2-SLOW-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-ALL-LABEL: combine_vec_ashr_trunc_ashr:
-; AVX2-FAST-ALL: # %bb.0:
-; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} xmm1 = [1,3,5,7]
-; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-FAST-ALL-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-FAST-ALL-NEXT: vzeroupper
-; AVX2-FAST-ALL-NEXT: retq
-;
-; AVX2-FAST-PERLANE-LABEL: combine_vec_ashr_trunc_ashr:
-; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
-; AVX2-FAST-PERLANE-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-FAST-PERLANE-NEXT: vzeroupper
-; AVX2-FAST-PERLANE-NEXT: retq
+; AVX2-LABEL: combine_vec_ashr_trunc_ashr:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm1 = [1,3,5,7]
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
;
; AVX512-LABEL: combine_vec_ashr_trunc_ashr:
; AVX512: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll b/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll
index d9393ba9febb2..0628db719c5bd 100644
--- a/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll
+++ b/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll
@@ -419,12 +419,19 @@ define <8 x float> @vec256_eltty_float_source_subvec_1_target_subvec_mask_1_unar
}
define <8 x float> @vec256_eltty_float_source_subvec_1_target_subvec_mask_1_binary(<8 x float> %x, <8 x float> %y) nounwind {
-; CHECK-LABEL: vec256_eltty_float_source_subvec_1_target_subvec_mask_1_binary:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm1
-; CHECK-NEXT: vbroadcastss %xmm1, %xmm1
-; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7]
-; CHECK-NEXT: retq
+; CHECK-SLOW-LABEL: vec256_eltty_float_source_subvec_1_target_subvec_mask_1_binary:
+; CHECK-SLOW: # %bb.0:
+; CHECK-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1
+; CHECK-SLOW-NEXT: vbroadcastss %xmm1, %xmm1
+; CHECK-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7]
+; CHECK-SLOW-NEXT: retq
+;
+; CHECK-FAST-LABEL: vec256_eltty_float_source_subvec_1_target_subvec_mask_1_binary:
+; CHECK-FAST: # %bb.0:
+; CHECK-FAST-NEXT: vbroadcastss {{.*#+}} xmm2 = [4,4,4,4]
+; CHECK-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1
+; CHECK-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7]
+; CHECK-FAST-NEXT: retq
%r = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 0, i32 1, i32 2, i32 12, i32 4, i32 5, i32 6, i32 7>
ret <8 x float> %r
}
@@ -547,12 +554,19 @@ define <8 x i32> @vec256_eltty_i32_source_subvec_1_target_subvec_mask_1_unary(<8
}
define <8 x i32> @vec256_eltty_i32_source_subvec_1_target_subvec_mask_1_binary(<8 x i32> %x, <8 x i32> %y) nounwind {
-; CHECK-LABEL: vec256_eltty_i32_source_subvec_1_target_subvec_mask_1_binary:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm1
-; CHECK-NEXT: vbroadcastss %xmm1, %xmm1
-; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7]
-; CHECK-NEXT: retq
+; CHECK-SLOW-LABEL: vec256_eltty_i32_source_subvec_1_target_subvec_mask_1_binary:
+; CHECK-SLOW: # %bb.0:
+; CHECK-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1
+; CHECK-SLOW-NEXT: vbroadcastss %xmm1, %xmm1
+; CHECK-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7]
+; CHECK-SLOW-NEXT: retq
+;
+; CHECK-FAST-LABEL: vec256_eltty_i32_source_subvec_1_target_subvec_mask_1_binary:
+; CHECK-FAST: # %bb.0:
+; CHECK-FAST-NEXT: vbroadcastss {{.*#+}} xmm2 = [4,4,4,4]
+; CHECK-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1
+; CHECK-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7]
+; CHECK-FAST-NEXT: retq
%r = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 0, i32 1, i32 2, i32 12, i32 4, i32 5, i32 6, i32 7>
ret <8 x i32> %r
}
diff --git a/llvm/test/CodeGen/X86/haddsub-undef.ll b/llvm/test/CodeGen/X86/haddsub-undef.ll
index 94fa81742ba71..cc7d8e61505f5 100644
--- a/llvm/test/CodeGen/X86/haddsub-undef.ll
+++ b/llvm/test/CodeGen/X86/haddsub-undef.ll
@@ -1110,17 +1110,17 @@ define <4 x double> @PR34724_add_v4f64_u123(<4 x double> %0, <4 x double> %1) {
; SSE-FAST-NEXT: movapd %xmm2, %xmm1
; SSE-FAST-NEXT: retq
;
-; AVX-SLOW-LABEL: PR34724_add_v4f64_u123:
-; AVX-SLOW: # %bb.0:
-; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX-SLOW-NEXT: vhaddpd %xmm1, %xmm0, %xmm0
-; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1
-; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1],xmm1[0]
-; AVX-SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
-; AVX-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX-SLOW-NEXT: retq
+; AVX1-SLOW-LABEL: PR34724_add_v4f64_u123:
+; AVX1-SLOW: # %bb.0:
+; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-SLOW-NEXT: vhaddpd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX1-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1
+; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1],xmm1[0]
+; AVX1-SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: PR34724_add_v4f64_u123:
; AVX-FAST: # %bb.0:
@@ -1129,6 +1129,18 @@ define <4 x double> @PR34724_add_v4f64_u123(<4 x double> %0, <4 x double> %1) {
; AVX-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX-FAST-NEXT: vhaddpd %ymm0, %ymm1, %ymm0
; AVX-FAST-NEXT: retq
+;
+; AVX512-SLOW-LABEL: PR34724_add_v4f64_u123:
+; AVX512-SLOW: # %bb.0:
+; AVX512-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512-SLOW-NEXT: vhaddpd %xmm1, %xmm0, %xmm0
+; AVX512-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX512-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,2,2,3]
+; AVX512-SLOW-NEXT: vaddsd %xmm1, %xmm2, %xmm1
+; AVX512-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1],xmm1[0]
+; AVX512-SLOW-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX512-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-SLOW-NEXT: retq
%3 = shufflevector <4 x double> %0, <4 x double> %1, <2 x i32> <i32 2, i32 4>
%4 = shufflevector <4 x double> %0, <4 x double> %1, <2 x i32> <i32 3, i32 5>
%5 = fadd <2 x double> %3, %4
@@ -1161,21 +1173,31 @@ define <4 x double> @PR34724_add_v4f64_0u23(<4 x double> %0, <4 x double> %1) {
; SSE-FAST-NEXT: haddpd %xmm3, %xmm1
; SSE-FAST-NEXT: retq
;
-; AVX-SLOW-LABEL: PR34724_add_v4f64_0u23:
-; AVX-SLOW: # %bb.0:
-; AVX-SLOW-NEXT: vhaddpd %xmm1, %xmm0, %xmm0
-; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1
-; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1],xmm1[0]
-; AVX-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX-SLOW-NEXT: retq
+; AVX1-SLOW-LABEL: PR34724_add_v4f64_0u23:
+; AVX1-SLOW: # %bb.0:
+; AVX1-SLOW-NEXT: vhaddpd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX1-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1
+; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1],xmm1[0]
+; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: PR34724_add_v4f64_0u23:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-FAST-NEXT: vhaddpd %ymm1, %ymm0, %ymm0
; AVX-FAST-NEXT: retq
+;
+; AVX512-SLOW-LABEL: PR34724_add_v4f64_0u23:
+; AVX512-SLOW: # %bb.0:
+; AVX512-SLOW-NEXT: vhaddpd %xmm1, %xmm0, %xmm0
+; AVX512-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX512-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,2,2,3]
+; AVX512-SLOW-NEXT: vaddsd %xmm1, %xmm2, %xmm1
+; AVX512-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1],xmm1[0]
+; AVX512-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-SLOW-NEXT: retq
%3 = shufflevector <4 x double> %0, <4 x double> %1, <2 x i32> <i32 0, i32 4>
%4 = shufflevector <4 x double> %0, <4 x double> %1, <2 x i32> <i32 1, i32 5>
%5 = fadd <2 x double> %3, %4
@@ -1207,16 +1229,16 @@ define <4 x double> @PR34724_add_v4f64_01u3(<4 x double> %0, <4 x double> %1) {
; SSE-FAST-NEXT: movapd %xmm3, %xmm1
; SSE-FAST-NEXT: retq
;
-; AVX-SLOW-LABEL: PR34724_add_v4f64_01u3:
-; AVX-SLOW: # %bb.0:
-; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX-SLOW-NEXT: vhaddpd %xmm2, %xmm0, %xmm0
-; AVX-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1
-; AVX-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
-; AVX-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX-SLOW-NEXT: retq
+; AVX1-SLOW-LABEL: PR34724_add_v4f64_01u3:
+; AVX1-SLOW: # %bb.0:
+; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-SLOW-NEXT: vhaddpd %xmm2, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX1-SLOW-NEXT: vaddsd %xmm2, %xmm1, %xmm1
+; AVX1-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
+; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-SLOW-NEXT: retq
;
; AVX1-FAST-LABEL: PR34724_add_v4f64_01u3:
; AVX1-FAST: # %bb.0:
@@ -1225,6 +1247,17 @@ define <4 x double> @PR34724_add_v4f64_01u3(<4 x double> %0, <4 x double> %1) {
; AVX1-FAST-NEXT: vhaddpd %ymm2, %ymm0, %ymm0
; AVX1-FAST-NEXT: retq
;
+; AVX512-SLOW-LABEL: PR34724_add_v4f64_01u3:
+; AVX512-SLOW: # %bb.0:
+; AVX512-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX512-SLOW-NEXT: vhaddpd %xmm2, %xmm0, %xmm0
+; AVX512-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX512-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,2,2,3]
+; AVX512-SLOW-NEXT: vaddsd %xmm1, %xmm2, %xmm1
+; AVX512-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
+; AVX512-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512-SLOW-NEXT: retq
+;
; AVX512-FAST-LABEL: PR34724_add_v4f64_01u3:
; AVX512-FAST: # %bb.0:
; AVX512-FAST-NEXT: vhaddpd %ymm1, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/haddsub.ll b/llvm/test/CodeGen/X86/haddsub.ll
index a0778195b5c73..fbc3e20365b8b 100644
--- a/llvm/test/CodeGen/X86/haddsub.ll
+++ b/llvm/test/CodeGen/X86/haddsub.ll
@@ -1,11 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3 | FileCheck %s --check-prefixes=SSE3,SSE3-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3,fast-hops | FileCheck %s --check-prefixes=SSE3,SSE3-FAST
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX1-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX-SLOW,AVX512-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f,fast-hops | FileCheck %s --check-prefixes=AVX,AVX-FAST
define <2 x double> @haddpd1(<2 x double> %x, <2 x double> %y) {
@@ -1038,13 +1038,13 @@ define double @extract_extract23_v4f64_fadd_f64(<4 x double> %x) {
; SSE3-FAST-NEXT: haddpd %xmm1, %xmm0
; SSE3-FAST-NEXT: retq
;
-; AVX-SLOW-LABEL: extract_extract23_v4f64_fadd_f64:
-; AVX-SLOW: # %bb.0:
-; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
-; AVX-SLOW-NEXT: vzeroupper
-; AVX-SLOW-NEXT: retq
+; AVX1-SLOW-LABEL: extract_extract23_v4f64_fadd_f64:
+; AVX1-SLOW: # %bb.0:
+; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vzeroupper
+; AVX1-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract23_v4f64_fadd_f64:
; AVX-FAST: # %bb.0:
@@ -1052,6 +1052,22 @@ define double @extract_extract23_v4f64_fadd_f64(<4 x double> %x) {
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
+;
+; AVX2-SLOW-LABEL: extract_extract23_v4f64_fadd_f64:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3]
+; AVX2-SLOW-NEXT: vaddsd %xmm0, %xmm1, %xmm0
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX512-SLOW-LABEL: extract_extract23_v4f64_fadd_f64:
+; AVX512-SLOW: # %bb.0:
+; AVX512-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3]
+; AVX512-SLOW-NEXT: vaddsd %xmm0, %xmm1, %xmm0
+; AVX512-SLOW-NEXT: vzeroupper
+; AVX512-SLOW-NEXT: retq
%x0 = extractelement <4 x double> %x, i32 2
%x1 = extractelement <4 x double> %x, i32 3
%x01 = fadd double %x0, %x1
@@ -1103,13 +1119,13 @@ define double @extract_extract23_v4f64_fadd_f64_commute(<4 x double> %x) {
; SSE3-FAST-NEXT: haddpd %xmm1, %xmm0
; SSE3-FAST-NEXT: retq
;
-; AVX-SLOW-LABEL: extract_extract23_v4f64_fadd_f64_commute:
-; AVX-SLOW: # %bb.0:
-; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-SLOW-NEXT: vaddsd %xmm0, %xmm1, %xmm0
-; AVX-SLOW-NEXT: vzeroupper
-; AVX-SLOW-NEXT: retq
+; AVX1-SLOW-LABEL: extract_extract23_v4f64_fadd_f64_commute:
+; AVX1-SLOW: # %bb.0:
+; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX1-SLOW-NEXT: vaddsd %xmm0, %xmm1, %xmm0
+; AVX1-SLOW-NEXT: vzeroupper
+; AVX1-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: extract_extract23_v4f64_fadd_f64_commute:
; AVX-FAST: # %bb.0:
@@ -1117,6 +1133,22 @@ define double @extract_extract23_v4f64_fadd_f64_commute(<4 x double> %x) {
; AVX-FAST-NEXT: vhaddpd %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
; AVX-FAST-NEXT: retq
+;
+; AVX2-SLOW-LABEL: extract_extract23_v4f64_fadd_f64_commute:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3]
+; AVX2-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX512-SLOW-LABEL: extract_extract23_v4f64_fadd_f64_commute:
+; AVX512-SLOW: # %bb.0:
+; AVX512-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3]
+; AVX512-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX512-SLOW-NEXT: vzeroupper
+; AVX512-SLOW-NEXT: retq
%x0 = extractelement <4 x double> %x, i32 2
%x1 = extractelement <4 x double> %x, i32 3
%x01 = fadd double %x1, %x0
diff --git a/llvm/test/CodeGen/X86/known-signbits-vector.ll b/llvm/test/CodeGen/X86/known-signbits-vector.ll
index 45b61155fe626..07abc1bb14010 100644
--- a/llvm/test/CodeGen/X86/known-signbits-vector.ll
+++ b/llvm/test/CodeGen/X86/known-signbits-vector.ll
@@ -248,36 +248,11 @@ define float @signbits_ashr_insert_ashr_extract_sitofp(i64 %a0, i64 %a1) nounwin
}
define <4 x double> @signbits_sext_shuffle_sitofp(<4 x i32> %a0, <4 x i64> %a1) nounwind {
-; X86-LABEL: signbits_sext_shuffle_sitofp:
-; X86: # %bb.0:
-; X86-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
-; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; X86-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
-; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
-; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; X86-NEXT: vcvtdq2pd %xmm0, %ymm0
-; X86-NEXT: retl
-;
-; X64-AVX1-LABEL: signbits_sext_shuffle_sitofp:
-; X64-AVX1: # %bb.0:
-; X64-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; X64-AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
-; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; X64-AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; X64-AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
-; X64-AVX1-NEXT: retq
-;
-; X64-AVX2-LABEL: signbits_sext_shuffle_sitofp:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X64-AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,1,0]
-; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; X64-AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; X64-AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
-; X64-AVX2-NEXT: retq
+; CHECK-LABEL: signbits_sext_shuffle_sitofp:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
+; CHECK-NEXT: vcvtdq2pd %xmm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
%1 = sext <4 x i32> %a0 to <4 x i64>
%2 = shufflevector <4 x i64> %1, <4 x i64>%a1, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
%3 = sitofp <4 x i64> %2 to <4 x double>
@@ -405,15 +380,11 @@ define <4 x float> @signbits_ashr_sext_select_shuffle_sitofp(<4 x i64> %a0, <4 x
; X86-NEXT: andl $-16, %esp
; X86-NEXT: subl $16, %esp
; X86-NEXT: vmovapd 8(%ebp), %xmm3
-; X86-NEXT: vpsrad $31, %xmm2, %xmm4
-; X86-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
-; X86-NEXT: vpsrad $1, %xmm5, %xmm5
-; X86-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7]
+; X86-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; X86-NEXT: vpsrad $1, %xmm4, %xmm4
; X86-NEXT: vextractf128 $1, %ymm2, %xmm2
-; X86-NEXT: vpsrad $31, %xmm2, %xmm5
; X86-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; X86-NEXT: vpsrad $1, %xmm2, %xmm2
-; X86-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7]
; X86-NEXT: vshufps {{.*#+}} xmm5 = xmm3[2,2,3,3]
; X86-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm6
; X86-NEXT: vextractf128 $1, %ymm1, %xmm1
@@ -421,10 +392,7 @@ define <4 x float> @signbits_ashr_sext_select_shuffle_sitofp(<4 x i64> %a0, <4 x
; X86-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
; X86-NEXT: vblendvpd %xmm0, %xmm2, %xmm5, %xmm0
; X86-NEXT: vblendvpd %xmm6, %xmm4, %xmm3, %xmm1
-; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; X86-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
-; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
-; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,0],xmm0[0,0]
; X86-NEXT: vcvtdq2ps %xmm0, %xmm0
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
@@ -433,15 +401,11 @@ define <4 x float> @signbits_ashr_sext_select_shuffle_sitofp(<4 x i64> %a0, <4 x
;
; X64-AVX1-LABEL: signbits_ashr_sext_select_shuffle_sitofp:
; X64-AVX1: # %bb.0:
-; X64-AVX1-NEXT: vpsrad $31, %xmm2, %xmm4
-; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
-; X64-AVX1-NEXT: vpsrad $1, %xmm5, %xmm5
-; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7]
+; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; X64-AVX1-NEXT: vpsrad $1, %xmm4, %xmm4
; X64-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
-; X64-AVX1-NEXT: vpsrad $31, %xmm2, %xmm5
; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; X64-AVX1-NEXT: vpsrad $1, %xmm2, %xmm2
-; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7]
; X64-AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm3[2,2,3,3]
; X64-AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm6
; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
@@ -449,10 +413,7 @@ define <4 x float> @signbits_ashr_sext_select_shuffle_sitofp(<4 x i64> %a0, <4 x
; X64-AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
; X64-AVX1-NEXT: vblendvpd %xmm0, %xmm2, %xmm5, %xmm0
; X64-AVX1-NEXT: vblendvpd %xmm6, %xmm4, %xmm3, %xmm1
-; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; X64-AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
-; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; X64-AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X64-AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,0],xmm0[0,0]
; X64-AVX1-NEXT: vcvtdq2ps %xmm0, %xmm0
; X64-AVX1-NEXT: vzeroupper
; X64-AVX1-NEXT: retq
@@ -464,9 +425,8 @@ define <4 x float> @signbits_ashr_sext_select_shuffle_sitofp(<4 x i64> %a0, <4 x
; X64-AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
; X64-AVX2-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vblendvpd %ymm0, %ymm2, %ymm3, %ymm0
-; X64-AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5]
-; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
-; X64-AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X64-AVX2-NEXT: vmovapd {{.*#+}} xmm1 = [0,0,4,4]
+; X64-AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: vcvtdq2ps %xmm0, %xmm0
; X64-AVX2-NEXT: vzeroupper
; X64-AVX2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/madd.ll b/llvm/test/CodeGen/X86/madd.ll
index bdb7c307a5759..cb3016c603a0a 100644
--- a/llvm/test/CodeGen/X86/madd.ll
+++ b/llvm/test/CodeGen/X86/madd.ll
@@ -1892,8 +1892,10 @@ define <4 x i32> @larger_mul(<16 x i16> %A, <16 x i16> %B) {
; AVX512-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512-NEXT: vpmovsxwd %ymm1, %zmm1
; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vphaddd %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm1
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
+; AVX512-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%a = sext <16 x i16> %A to <16 x i32>
diff --git a/llvm/test/CodeGen/X86/masked_compressstore.ll b/llvm/test/CodeGen/X86/masked_compressstore.ll
index 3187bf6448690..1afb26adaf323 100644
--- a/llvm/test/CodeGen/X86/masked_compressstore.ll
+++ b/llvm/test/CodeGen/X86/masked_compressstore.ll
@@ -155,21 +155,14 @@ define void @compressstore_v8f64_v8i1(ptr %base, <8 x double> %V, <8 x i1> %mask
; AVX2-NEXT: vpacksswb %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpmovmskb %xmm2, %eax
; AVX2-NEXT: testb $1, %al
-; AVX2-NEXT: je LBB0_2
-; AVX2-NEXT: ## %bb.1: ## %cond.store
-; AVX2-NEXT: vmovq %xmm0, (%rdi)
-; AVX2-NEXT: addq $8, %rdi
-; AVX2-NEXT: LBB0_2: ## %else
+; AVX2-NEXT: jne LBB0_1
+; AVX2-NEXT: ## %bb.2: ## %else
; AVX2-NEXT: testb $2, %al
-; AVX2-NEXT: je LBB0_4
-; AVX2-NEXT: ## %bb.3: ## %cond.store1
-; AVX2-NEXT: vmovhpd %xmm0, (%rdi)
-; AVX2-NEXT: addq $8, %rdi
+; AVX2-NEXT: jne LBB0_3
; AVX2-NEXT: LBB0_4: ## %else2
; AVX2-NEXT: testb $4, %al
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: jne LBB0_5
-; AVX2-NEXT: ## %bb.6: ## %else5
+; AVX2-NEXT: LBB0_6: ## %else5
; AVX2-NEXT: testb $8, %al
; AVX2-NEXT: jne LBB0_7
; AVX2-NEXT: LBB0_8: ## %else8
@@ -177,27 +170,35 @@ define void @compressstore_v8f64_v8i1(ptr %base, <8 x double> %V, <8 x i1> %mask
; AVX2-NEXT: jne LBB0_9
; AVX2-NEXT: LBB0_10: ## %else11
; AVX2-NEXT: testb $32, %al
-; AVX2-NEXT: je LBB0_12
-; AVX2-NEXT: LBB0_11: ## %cond.store13
-; AVX2-NEXT: vmovhps %xmm1, (%rdi)
-; AVX2-NEXT: addq $8, %rdi
+; AVX2-NEXT: jne LBB0_11
; AVX2-NEXT: LBB0_12: ## %else14
; AVX2-NEXT: testb $64, %al
-; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX2-NEXT: jne LBB0_13
-; AVX2-NEXT: ## %bb.14: ## %else17
+; AVX2-NEXT: LBB0_14: ## %else17
; AVX2-NEXT: testb $-128, %al
; AVX2-NEXT: jne LBB0_15
; AVX2-NEXT: LBB0_16: ## %else20
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
-; AVX2-NEXT: LBB0_5: ## %cond.store4
+; AVX2-NEXT: LBB0_1: ## %cond.store
; AVX2-NEXT: vmovq %xmm0, (%rdi)
; AVX2-NEXT: addq $8, %rdi
+; AVX2-NEXT: testb $2, %al
+; AVX2-NEXT: je LBB0_4
+; AVX2-NEXT: LBB0_3: ## %cond.store1
+; AVX2-NEXT: vmovhpd %xmm0, (%rdi)
+; AVX2-NEXT: addq $8, %rdi
+; AVX2-NEXT: testb $4, %al
+; AVX2-NEXT: je LBB0_6
+; AVX2-NEXT: LBB0_5: ## %cond.store4
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vmovq %xmm2, (%rdi)
+; AVX2-NEXT: addq $8, %rdi
; AVX2-NEXT: testb $8, %al
; AVX2-NEXT: je LBB0_8
; AVX2-NEXT: LBB0_7: ## %cond.store7
-; AVX2-NEXT: vmovhpd %xmm0, (%rdi)
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,2,3]
+; AVX2-NEXT: vmovq %xmm0, (%rdi)
; AVX2-NEXT: addq $8, %rdi
; AVX2-NEXT: testb $16, %al
; AVX2-NEXT: je LBB0_10
@@ -205,15 +206,21 @@ define void @compressstore_v8f64_v8i1(ptr %base, <8 x double> %V, <8 x i1> %mask
; AVX2-NEXT: vmovlps %xmm1, (%rdi)
; AVX2-NEXT: addq $8, %rdi
; AVX2-NEXT: testb $32, %al
-; AVX2-NEXT: jne LBB0_11
-; AVX2-NEXT: jmp LBB0_12
+; AVX2-NEXT: je LBB0_12
+; AVX2-NEXT: LBB0_11: ## %cond.store13
+; AVX2-NEXT: vmovhps %xmm1, (%rdi)
+; AVX2-NEXT: addq $8, %rdi
+; AVX2-NEXT: testb $64, %al
+; AVX2-NEXT: je LBB0_14
; AVX2-NEXT: LBB0_13: ## %cond.store16
+; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm0
; AVX2-NEXT: vmovlps %xmm0, (%rdi)
; AVX2-NEXT: addq $8, %rdi
; AVX2-NEXT: testb $-128, %al
; AVX2-NEXT: je LBB0_16
; AVX2-NEXT: LBB0_15: ## %cond.store19
-; AVX2-NEXT: vmovhps %xmm0, (%rdi)
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm1[3,2,2,3]
+; AVX2-NEXT: vmovlps %xmm0, (%rdi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -380,136 +387,278 @@ define void @compressstore_v16f64_v16i1(ptr %base, <16 x double> %V, <16 x i1> %
; SSE-NEXT: movhps %xmm7, (%rdi)
; SSE-NEXT: retq
;
-; AVX1OR2-LABEL: compressstore_v16f64_v16i1:
-; AVX1OR2: ## %bb.0:
-; AVX1OR2-NEXT: vpsllw $7, %xmm4, %xmm4
-; AVX1OR2-NEXT: vpmovmskb %xmm4, %eax
-; AVX1OR2-NEXT: testb $1, %al
-; AVX1OR2-NEXT: je LBB1_2
-; AVX1OR2-NEXT: ## %bb.1: ## %cond.store
-; AVX1OR2-NEXT: vmovlps %xmm0, (%rdi)
-; AVX1OR2-NEXT: addq $8, %rdi
-; AVX1OR2-NEXT: LBB1_2: ## %else
-; AVX1OR2-NEXT: testb $2, %al
-; AVX1OR2-NEXT: je LBB1_4
-; AVX1OR2-NEXT: ## %bb.3: ## %cond.store1
-; AVX1OR2-NEXT: vmovhps %xmm0, (%rdi)
-; AVX1OR2-NEXT: addq $8, %rdi
-; AVX1OR2-NEXT: LBB1_4: ## %else2
-; AVX1OR2-NEXT: testb $4, %al
-; AVX1OR2-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1OR2-NEXT: jne LBB1_5
-; AVX1OR2-NEXT: ## %bb.6: ## %else5
-; AVX1OR2-NEXT: testb $8, %al
-; AVX1OR2-NEXT: jne LBB1_7
-; AVX1OR2-NEXT: LBB1_8: ## %else8
-; AVX1OR2-NEXT: testb $16, %al
-; AVX1OR2-NEXT: jne LBB1_9
-; AVX1OR2-NEXT: LBB1_10: ## %else11
-; AVX1OR2-NEXT: testb $32, %al
-; AVX1OR2-NEXT: je LBB1_12
-; AVX1OR2-NEXT: LBB1_11: ## %cond.store13
-; AVX1OR2-NEXT: vmovhps %xmm1, (%rdi)
-; AVX1OR2-NEXT: addq $8, %rdi
-; AVX1OR2-NEXT: LBB1_12: ## %else14
-; AVX1OR2-NEXT: testb $64, %al
-; AVX1OR2-NEXT: vextractf128 $1, %ymm1, %xmm0
-; AVX1OR2-NEXT: jne LBB1_13
-; AVX1OR2-NEXT: ## %bb.14: ## %else17
-; AVX1OR2-NEXT: testb %al, %al
-; AVX1OR2-NEXT: js LBB1_15
-; AVX1OR2-NEXT: LBB1_16: ## %else20
-; AVX1OR2-NEXT: testl $256, %eax ## imm = 0x100
-; AVX1OR2-NEXT: jne LBB1_17
-; AVX1OR2-NEXT: LBB1_18: ## %else23
-; AVX1OR2-NEXT: testl $512, %eax ## imm = 0x200
-; AVX1OR2-NEXT: je LBB1_20
-; AVX1OR2-NEXT: LBB1_19: ## %cond.store25
-; AVX1OR2-NEXT: vmovhps %xmm2, (%rdi)
-; AVX1OR2-NEXT: addq $8, %rdi
-; AVX1OR2-NEXT: LBB1_20: ## %else26
-; AVX1OR2-NEXT: testl $1024, %eax ## imm = 0x400
-; AVX1OR2-NEXT: vextractf128 $1, %ymm2, %xmm0
-; AVX1OR2-NEXT: jne LBB1_21
-; AVX1OR2-NEXT: ## %bb.22: ## %else29
-; AVX1OR2-NEXT: testl $2048, %eax ## imm = 0x800
-; AVX1OR2-NEXT: jne LBB1_23
-; AVX1OR2-NEXT: LBB1_24: ## %else32
-; AVX1OR2-NEXT: testl $4096, %eax ## imm = 0x1000
-; AVX1OR2-NEXT: jne LBB1_25
-; AVX1OR2-NEXT: LBB1_26: ## %else35
-; AVX1OR2-NEXT: testl $8192, %eax ## imm = 0x2000
-; AVX1OR2-NEXT: je LBB1_28
-; AVX1OR2-NEXT: LBB1_27: ## %cond.store37
-; AVX1OR2-NEXT: vmovhps %xmm3, (%rdi)
-; AVX1OR2-NEXT: addq $8, %rdi
-; AVX1OR2-NEXT: LBB1_28: ## %else38
-; AVX1OR2-NEXT: testl $16384, %eax ## imm = 0x4000
-; AVX1OR2-NEXT: vextractf128 $1, %ymm3, %xmm0
-; AVX1OR2-NEXT: jne LBB1_29
-; AVX1OR2-NEXT: ## %bb.30: ## %else41
-; AVX1OR2-NEXT: testl $32768, %eax ## imm = 0x8000
-; AVX1OR2-NEXT: jne LBB1_31
-; AVX1OR2-NEXT: LBB1_32: ## %else44
-; AVX1OR2-NEXT: vzeroupper
-; AVX1OR2-NEXT: retq
-; AVX1OR2-NEXT: LBB1_5: ## %cond.store4
-; AVX1OR2-NEXT: vmovlps %xmm0, (%rdi)
-; AVX1OR2-NEXT: addq $8, %rdi
-; AVX1OR2-NEXT: testb $8, %al
-; AVX1OR2-NEXT: je LBB1_8
-; AVX1OR2-NEXT: LBB1_7: ## %cond.store7
-; AVX1OR2-NEXT: vmovhps %xmm0, (%rdi)
-; AVX1OR2-NEXT: addq $8, %rdi
-; AVX1OR2-NEXT: testb $16, %al
-; AVX1OR2-NEXT: je LBB1_10
-; AVX1OR2-NEXT: LBB1_9: ## %cond.store10
-; AVX1OR2-NEXT: vmovlps %xmm1, (%rdi)
-; AVX1OR2-NEXT: addq $8, %rdi
-; AVX1OR2-NEXT: testb $32, %al
-; AVX1OR2-NEXT: jne LBB1_11
-; AVX1OR2-NEXT: jmp LBB1_12
-; AVX1OR2-NEXT: LBB1_13: ## %cond.store16
-; AVX1OR2-NEXT: vmovlps %xmm0, (%rdi)
-; AVX1OR2-NEXT: addq $8, %rdi
-; AVX1OR2-NEXT: testb %al, %al
-; AVX1OR2-NEXT: jns LBB1_16
-; AVX1OR2-NEXT: LBB1_15: ## %cond.store19
-; AVX1OR2-NEXT: vmovhps %xmm0, (%rdi)
-; AVX1OR2-NEXT: addq $8, %rdi
-; AVX1OR2-NEXT: testl $256, %eax ## imm = 0x100
-; AVX1OR2-NEXT: je LBB1_18
-; AVX1OR2-NEXT: LBB1_17: ## %cond.store22
-; AVX1OR2-NEXT: vmovlps %xmm2, (%rdi)
-; AVX1OR2-NEXT: addq $8, %rdi
-; AVX1OR2-NEXT: testl $512, %eax ## imm = 0x200
-; AVX1OR2-NEXT: jne LBB1_19
-; AVX1OR2-NEXT: jmp LBB1_20
-; AVX1OR2-NEXT: LBB1_21: ## %cond.store28
-; AVX1OR2-NEXT: vmovlps %xmm0, (%rdi)
-; AVX1OR2-NEXT: addq $8, %rdi
-; AVX1OR2-NEXT: testl $2048, %eax ## imm = 0x800
-; AVX1OR2-NEXT: je LBB1_24
-; AVX1OR2-NEXT: LBB1_23: ## %cond.store31
-; AVX1OR2-NEXT: vmovhps %xmm0, (%rdi)
-; AVX1OR2-NEXT: addq $8, %rdi
-; AVX1OR2-NEXT: testl $4096, %eax ## imm = 0x1000
-; AVX1OR2-NEXT: je LBB1_26
-; AVX1OR2-NEXT: LBB1_25: ## %cond.store34
-; AVX1OR2-NEXT: vmovlps %xmm3, (%rdi)
-; AVX1OR2-NEXT: addq $8, %rdi
-; AVX1OR2-NEXT: testl $8192, %eax ## imm = 0x2000
-; AVX1OR2-NEXT: jne LBB1_27
-; AVX1OR2-NEXT: jmp LBB1_28
-; AVX1OR2-NEXT: LBB1_29: ## %cond.store40
-; AVX1OR2-NEXT: vmovlps %xmm0, (%rdi)
-; AVX1OR2-NEXT: addq $8, %rdi
-; AVX1OR2-NEXT: testl $32768, %eax ## imm = 0x8000
-; AVX1OR2-NEXT: je LBB1_32
-; AVX1OR2-NEXT: LBB1_31: ## %cond.store43
-; AVX1OR2-NEXT: vmovhps %xmm0, (%rdi)
-; AVX1OR2-NEXT: vzeroupper
-; AVX1OR2-NEXT: retq
+; AVX1-LABEL: compressstore_v16f64_v16i1:
+; AVX1: ## %bb.0:
+; AVX1-NEXT: vpsllw $7, %xmm4, %xmm4
+; AVX1-NEXT: vpmovmskb %xmm4, %eax
+; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: je LBB1_2
+; AVX1-NEXT: ## %bb.1: ## %cond.store
+; AVX1-NEXT: vmovlps %xmm0, (%rdi)
+; AVX1-NEXT: addq $8, %rdi
+; AVX1-NEXT: LBB1_2: ## %else
+; AVX1-NEXT: testb $2, %al
+; AVX1-NEXT: je LBB1_4
+; AVX1-NEXT: ## %bb.3: ## %cond.store1
+; AVX1-NEXT: vmovhps %xmm0, (%rdi)
+; AVX1-NEXT: addq $8, %rdi
+; AVX1-NEXT: LBB1_4: ## %else2
+; AVX1-NEXT: testb $4, %al
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: jne LBB1_5
+; AVX1-NEXT: ## %bb.6: ## %else5
+; AVX1-NEXT: testb $8, %al
+; AVX1-NEXT: jne LBB1_7
+; AVX1-NEXT: LBB1_8: ## %else8
+; AVX1-NEXT: testb $16, %al
+; AVX1-NEXT: jne LBB1_9
+; AVX1-NEXT: LBB1_10: ## %else11
+; AVX1-NEXT: testb $32, %al
+; AVX1-NEXT: je LBB1_12
+; AVX1-NEXT: LBB1_11: ## %cond.store13
+; AVX1-NEXT: vmovhps %xmm1, (%rdi)
+; AVX1-NEXT: addq $8, %rdi
+; AVX1-NEXT: LBB1_12: ## %else14
+; AVX1-NEXT: testb $64, %al
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: jne LBB1_13
+; AVX1-NEXT: ## %bb.14: ## %else17
+; AVX1-NEXT: testb %al, %al
+; AVX1-NEXT: js LBB1_15
+; AVX1-NEXT: LBB1_16: ## %else20
+; AVX1-NEXT: testl $256, %eax ## imm = 0x100
+; AVX1-NEXT: jne LBB1_17
+; AVX1-NEXT: LBB1_18: ## %else23
+; AVX1-NEXT: testl $512, %eax ## imm = 0x200
+; AVX1-NEXT: je LBB1_20
+; AVX1-NEXT: LBB1_19: ## %cond.store25
+; AVX1-NEXT: vmovhps %xmm2, (%rdi)
+; AVX1-NEXT: addq $8, %rdi
+; AVX1-NEXT: LBB1_20: ## %else26
+; AVX1-NEXT: testl $1024, %eax ## imm = 0x400
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm0
+; AVX1-NEXT: jne LBB1_21
+; AVX1-NEXT: ## %bb.22: ## %else29
+; AVX1-NEXT: testl $2048, %eax ## imm = 0x800
+; AVX1-NEXT: jne LBB1_23
+; AVX1-NEXT: LBB1_24: ## %else32
+; AVX1-NEXT: testl $4096, %eax ## imm = 0x1000
+; AVX1-NEXT: jne LBB1_25
+; AVX1-NEXT: LBB1_26: ## %else35
+; AVX1-NEXT: testl $8192, %eax ## imm = 0x2000
+; AVX1-NEXT: je LBB1_28
+; AVX1-NEXT: LBB1_27: ## %cond.store37
+; AVX1-NEXT: vmovhps %xmm3, (%rdi)
+; AVX1-NEXT: addq $8, %rdi
+; AVX1-NEXT: LBB1_28: ## %else38
+; AVX1-NEXT: testl $16384, %eax ## imm = 0x4000
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm0
+; AVX1-NEXT: jne LBB1_29
+; AVX1-NEXT: ## %bb.30: ## %else41
+; AVX1-NEXT: testl $32768, %eax ## imm = 0x8000
+; AVX1-NEXT: jne LBB1_31
+; AVX1-NEXT: LBB1_32: ## %else44
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+; AVX1-NEXT: LBB1_5: ## %cond.store4
+; AVX1-NEXT: vmovlps %xmm0, (%rdi)
+; AVX1-NEXT: addq $8, %rdi
+; AVX1-NEXT: testb $8, %al
+; AVX1-NEXT: je LBB1_8
+; AVX1-NEXT: LBB1_7: ## %cond.store7
+; AVX1-NEXT: vmovhps %xmm0, (%rdi)
+; AVX1-NEXT: addq $8, %rdi
+; AVX1-NEXT: testb $16, %al
+; AVX1-NEXT: je LBB1_10
+; AVX1-NEXT: LBB1_9: ## %cond.store10
+; AVX1-NEXT: vmovlps %xmm1, (%rdi)
+; AVX1-NEXT: addq $8, %rdi
+; AVX1-NEXT: testb $32, %al
+; AVX1-NEXT: jne LBB1_11
+; AVX1-NEXT: jmp LBB1_12
+; AVX1-NEXT: LBB1_13: ## %cond.store16
+; AVX1-NEXT: vmovlps %xmm0, (%rdi)
+; AVX1-NEXT: addq $8, %rdi
+; AVX1-NEXT: testb %al, %al
+; AVX1-NEXT: jns LBB1_16
+; AVX1-NEXT: LBB1_15: ## %cond.store19
+; AVX1-NEXT: vmovhps %xmm0, (%rdi)
+; AVX1-NEXT: addq $8, %rdi
+; AVX1-NEXT: testl $256, %eax ## imm = 0x100
+; AVX1-NEXT: je LBB1_18
+; AVX1-NEXT: LBB1_17: ## %cond.store22
+; AVX1-NEXT: vmovlps %xmm2, (%rdi)
+; AVX1-NEXT: addq $8, %rdi
+; AVX1-NEXT: testl $512, %eax ## imm = 0x200
+; AVX1-NEXT: jne LBB1_19
+; AVX1-NEXT: jmp LBB1_20
+; AVX1-NEXT: LBB1_21: ## %cond.store28
+; AVX1-NEXT: vmovlps %xmm0, (%rdi)
+; AVX1-NEXT: addq $8, %rdi
+; AVX1-NEXT: testl $2048, %eax ## imm = 0x800
+; AVX1-NEXT: je LBB1_24
+; AVX1-NEXT: LBB1_23: ## %cond.store31
+; AVX1-NEXT: vmovhps %xmm0, (%rdi)
+; AVX1-NEXT: addq $8, %rdi
+; AVX1-NEXT: testl $4096, %eax ## imm = 0x1000
+; AVX1-NEXT: je LBB1_26
+; AVX1-NEXT: LBB1_25: ## %cond.store34
+; AVX1-NEXT: vmovlps %xmm3, (%rdi)
+; AVX1-NEXT: addq $8, %rdi
+; AVX1-NEXT: testl $8192, %eax ## imm = 0x2000
+; AVX1-NEXT: jne LBB1_27
+; AVX1-NEXT: jmp LBB1_28
+; AVX1-NEXT: LBB1_29: ## %cond.store40
+; AVX1-NEXT: vmovlps %xmm0, (%rdi)
+; AVX1-NEXT: addq $8, %rdi
+; AVX1-NEXT: testl $32768, %eax ## imm = 0x8000
+; AVX1-NEXT: je LBB1_32
+; AVX1-NEXT: LBB1_31: ## %cond.store43
+; AVX1-NEXT: vmovhps %xmm0, (%rdi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: compressstore_v16f64_v16i1:
+; AVX2: ## %bb.0:
+; AVX2-NEXT: vpsllw $7, %xmm4, %xmm4
+; AVX2-NEXT: vpmovmskb %xmm4, %eax
+; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: jne LBB1_1
+; AVX2-NEXT: ## %bb.2: ## %else
+; AVX2-NEXT: testb $2, %al
+; AVX2-NEXT: jne LBB1_3
+; AVX2-NEXT: LBB1_4: ## %else2
+; AVX2-NEXT: testb $4, %al
+; AVX2-NEXT: jne LBB1_5
+; AVX2-NEXT: LBB1_6: ## %else5
+; AVX2-NEXT: testb $8, %al
+; AVX2-NEXT: jne LBB1_7
+; AVX2-NEXT: LBB1_8: ## %else8
+; AVX2-NEXT: testb $16, %al
+; AVX2-NEXT: jne LBB1_9
+; AVX2-NEXT: LBB1_10: ## %else11
+; AVX2-NEXT: testb $32, %al
+; AVX2-NEXT: jne LBB1_11
+; AVX2-NEXT: LBB1_12: ## %else14
+; AVX2-NEXT: testb $64, %al
+; AVX2-NEXT: jne LBB1_13
+; AVX2-NEXT: LBB1_14: ## %else17
+; AVX2-NEXT: testb %al, %al
+; AVX2-NEXT: js LBB1_15
+; AVX2-NEXT: LBB1_16: ## %else20
+; AVX2-NEXT: testl $256, %eax ## imm = 0x100
+; AVX2-NEXT: jne LBB1_17
+; AVX2-NEXT: LBB1_18: ## %else23
+; AVX2-NEXT: testl $512, %eax ## imm = 0x200
+; AVX2-NEXT: jne LBB1_19
+; AVX2-NEXT: LBB1_20: ## %else26
+; AVX2-NEXT: testl $1024, %eax ## imm = 0x400
+; AVX2-NEXT: jne LBB1_21
+; AVX2-NEXT: LBB1_22: ## %else29
+; AVX2-NEXT: testl $2048, %eax ## imm = 0x800
+; AVX2-NEXT: jne LBB1_23
+; AVX2-NEXT: LBB1_24: ## %else32
+; AVX2-NEXT: testl $4096, %eax ## imm = 0x1000
+; AVX2-NEXT: jne LBB1_25
+; AVX2-NEXT: LBB1_26: ## %else35
+; AVX2-NEXT: testl $8192, %eax ## imm = 0x2000
+; AVX2-NEXT: jne LBB1_27
+; AVX2-NEXT: LBB1_28: ## %else38
+; AVX2-NEXT: testl $16384, %eax ## imm = 0x4000
+; AVX2-NEXT: jne LBB1_29
+; AVX2-NEXT: LBB1_30: ## %else41
+; AVX2-NEXT: testl $32768, %eax ## imm = 0x8000
+; AVX2-NEXT: jne LBB1_31
+; AVX2-NEXT: LBB1_32: ## %else44
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+; AVX2-NEXT: LBB1_1: ## %cond.store
+; AVX2-NEXT: vmovlps %xmm0, (%rdi)
+; AVX2-NEXT: addq $8, %rdi
+; AVX2-NEXT: testb $2, %al
+; AVX2-NEXT: je LBB1_4
+; AVX2-NEXT: LBB1_3: ## %cond.store1
+; AVX2-NEXT: vmovhps %xmm0, (%rdi)
+; AVX2-NEXT: addq $8, %rdi
+; AVX2-NEXT: testb $4, %al
+; AVX2-NEXT: je LBB1_6
+; AVX2-NEXT: LBB1_5: ## %cond.store4
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX2-NEXT: vmovlps %xmm4, (%rdi)
+; AVX2-NEXT: addq $8, %rdi
+; AVX2-NEXT: testb $8, %al
+; AVX2-NEXT: je LBB1_8
+; AVX2-NEXT: LBB1_7: ## %cond.store7
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3]
+; AVX2-NEXT: vmovlps %xmm0, (%rdi)
+; AVX2-NEXT: addq $8, %rdi
+; AVX2-NEXT: testb $16, %al
+; AVX2-NEXT: je LBB1_10
+; AVX2-NEXT: LBB1_9: ## %cond.store10
+; AVX2-NEXT: vmovlps %xmm1, (%rdi)
+; AVX2-NEXT: addq $8, %rdi
+; AVX2-NEXT: testb $32, %al
+; AVX2-NEXT: je LBB1_12
+; AVX2-NEXT: LBB1_11: ## %cond.store13
+; AVX2-NEXT: vmovhps %xmm1, (%rdi)
+; AVX2-NEXT: addq $8, %rdi
+; AVX2-NEXT: testb $64, %al
+; AVX2-NEXT: je LBB1_14
+; AVX2-NEXT: LBB1_13: ## %cond.store16
+; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX2-NEXT: vmovlps %xmm0, (%rdi)
+; AVX2-NEXT: addq $8, %rdi
+; AVX2-NEXT: testb %al, %al
+; AVX2-NEXT: jns LBB1_16
+; AVX2-NEXT: LBB1_15: ## %cond.store19
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm1[3,2,2,3]
+; AVX2-NEXT: vmovlps %xmm0, (%rdi)
+; AVX2-NEXT: addq $8, %rdi
+; AVX2-NEXT: testl $256, %eax ## imm = 0x100
+; AVX2-NEXT: je LBB1_18
+; AVX2-NEXT: LBB1_17: ## %cond.store22
+; AVX2-NEXT: vmovlps %xmm2, (%rdi)
+; AVX2-NEXT: addq $8, %rdi
+; AVX2-NEXT: testl $512, %eax ## imm = 0x200
+; AVX2-NEXT: je LBB1_20
+; AVX2-NEXT: LBB1_19: ## %cond.store25
+; AVX2-NEXT: vmovhps %xmm2, (%rdi)
+; AVX2-NEXT: addq $8, %rdi
+; AVX2-NEXT: testl $1024, %eax ## imm = 0x400
+; AVX2-NEXT: je LBB1_22
+; AVX2-NEXT: LBB1_21: ## %cond.store28
+; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm0
+; AVX2-NEXT: vmovlps %xmm0, (%rdi)
+; AVX2-NEXT: addq $8, %rdi
+; AVX2-NEXT: testl $2048, %eax ## imm = 0x800
+; AVX2-NEXT: je LBB1_24
+; AVX2-NEXT: LBB1_23: ## %cond.store31
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[3,2,2,3]
+; AVX2-NEXT: vmovlps %xmm0, (%rdi)
+; AVX2-NEXT: addq $8, %rdi
+; AVX2-NEXT: testl $4096, %eax ## imm = 0x1000
+; AVX2-NEXT: je LBB1_26
+; AVX2-NEXT: LBB1_25: ## %cond.store34
+; AVX2-NEXT: vmovlps %xmm3, (%rdi)
+; AVX2-NEXT: addq $8, %rdi
+; AVX2-NEXT: testl $8192, %eax ## imm = 0x2000
+; AVX2-NEXT: je LBB1_28
+; AVX2-NEXT: LBB1_27: ## %cond.store37
+; AVX2-NEXT: vmovhps %xmm3, (%rdi)
+; AVX2-NEXT: addq $8, %rdi
+; AVX2-NEXT: testl $16384, %eax ## imm = 0x4000
+; AVX2-NEXT: je LBB1_30
+; AVX2-NEXT: LBB1_29: ## %cond.store40
+; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm0
+; AVX2-NEXT: vmovlps %xmm0, (%rdi)
+; AVX2-NEXT: addq $8, %rdi
+; AVX2-NEXT: testl $32768, %eax ## imm = 0x8000
+; AVX2-NEXT: je LBB1_32
+; AVX2-NEXT: LBB1_31: ## %cond.store43
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm3[3,2,2,3]
+; AVX2-NEXT: vmovlps %xmm0, (%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
;
; AVX512F-LABEL: compressstore_v16f64_v16i1:
; AVX512F: ## %bb.0:
diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
index 1289eef7795dc..4cd6bcf6efcb2 100644
--- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll
+++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
@@ -1334,7 +1334,6 @@ define void @test19(<4 x double>%a1, ptr %ptr, <4 x i1>%mask, <4 x i64> %ind) {
; KNL_64-NEXT: vmovhps %xmm0, (%rcx)
; KNL_64-NEXT: .LBB18_4: # %else2
; KNL_64-NEXT: testb $4, %al
-; KNL_64-NEXT: vextractf128 $1, %ymm0, %xmm0
; KNL_64-NEXT: vextracti128 $1, %ymm1, %xmm1
; KNL_64-NEXT: jne .LBB18_5
; KNL_64-NEXT: # %bb.6: # %else4
@@ -1344,13 +1343,15 @@ define void @test19(<4 x double>%a1, ptr %ptr, <4 x i1>%mask, <4 x i64> %ind) {
; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
; KNL_64-NEXT: .LBB18_5: # %cond.store3
+; KNL_64-NEXT: vextractf128 $1, %ymm0, %xmm2
; KNL_64-NEXT: vmovq %xmm1, %rcx
-; KNL_64-NEXT: vmovlps %xmm0, (%rcx)
+; KNL_64-NEXT: vmovlps %xmm2, (%rcx)
; KNL_64-NEXT: testb $8, %al
; KNL_64-NEXT: je .LBB18_8
; KNL_64-NEXT: .LBB18_7: # %cond.store5
+; KNL_64-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3]
; KNL_64-NEXT: vpextrq $1, %xmm1, %rax
-; KNL_64-NEXT: vmovhps %xmm0, (%rax)
+; KNL_64-NEXT: vmovlps %xmm0, (%rax)
; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
@@ -1365,34 +1366,39 @@ define void @test19(<4 x double>%a1, ptr %ptr, <4 x i1>%mask, <4 x i64> %ind) {
; KNL_32-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; KNL_32-NEXT: kmovw %k0, %eax
; KNL_32-NEXT: testb $1, %al
-; KNL_32-NEXT: je .LBB18_2
-; KNL_32-NEXT: # %bb.1: # %cond.store
-; KNL_32-NEXT: vmovd %xmm1, %ecx
-; KNL_32-NEXT: vmovlps %xmm0, (%ecx)
-; KNL_32-NEXT: .LBB18_2: # %else
+; KNL_32-NEXT: jne .LBB18_1
+; KNL_32-NEXT: # %bb.2: # %else
; KNL_32-NEXT: testb $2, %al
-; KNL_32-NEXT: je .LBB18_4
-; KNL_32-NEXT: # %bb.3: # %cond.store1
-; KNL_32-NEXT: vpextrd $1, %xmm1, %ecx
-; KNL_32-NEXT: vmovhps %xmm0, (%ecx)
+; KNL_32-NEXT: jne .LBB18_3
; KNL_32-NEXT: .LBB18_4: # %else2
; KNL_32-NEXT: testb $4, %al
-; KNL_32-NEXT: vextractf128 $1, %ymm0, %xmm0
; KNL_32-NEXT: jne .LBB18_5
-; KNL_32-NEXT: # %bb.6: # %else4
+; KNL_32-NEXT: .LBB18_6: # %else4
; KNL_32-NEXT: testb $8, %al
; KNL_32-NEXT: jne .LBB18_7
; KNL_32-NEXT: .LBB18_8: # %else6
; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
+; KNL_32-NEXT: .LBB18_1: # %cond.store
+; KNL_32-NEXT: vmovd %xmm1, %ecx
+; KNL_32-NEXT: vmovlps %xmm0, (%ecx)
+; KNL_32-NEXT: testb $2, %al
+; KNL_32-NEXT: je .LBB18_4
+; KNL_32-NEXT: .LBB18_3: # %cond.store1
+; KNL_32-NEXT: vpextrd $1, %xmm1, %ecx
+; KNL_32-NEXT: vmovhps %xmm0, (%ecx)
+; KNL_32-NEXT: testb $4, %al
+; KNL_32-NEXT: je .LBB18_6
; KNL_32-NEXT: .LBB18_5: # %cond.store3
+; KNL_32-NEXT: vextractf128 $1, %ymm0, %xmm2
; KNL_32-NEXT: vpextrd $2, %xmm1, %ecx
-; KNL_32-NEXT: vmovlps %xmm0, (%ecx)
+; KNL_32-NEXT: vmovlps %xmm2, (%ecx)
; KNL_32-NEXT: testb $8, %al
; KNL_32-NEXT: je .LBB18_8
; KNL_32-NEXT: .LBB18_7: # %cond.store5
+; KNL_32-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3]
; KNL_32-NEXT: vpextrd $3, %xmm1, %eax
-; KNL_32-NEXT: vmovhps %xmm0, (%eax)
+; KNL_32-NEXT: vmovlps %xmm0, (%eax)
; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
diff --git a/llvm/test/CodeGen/X86/masked_store.ll b/llvm/test/CodeGen/X86/masked_store.ll
index c7320275091c6..271c7d15e4532 100644
--- a/llvm/test/CodeGen/X86/masked_store.ll
+++ b/llvm/test/CodeGen/X86/masked_store.ll
@@ -4892,18 +4892,32 @@ define void @one_mask_bit_set4(ptr %addr, <4 x double> %val) nounwind {
; SSE-NEXT: movhps %xmm1, 24(%rdi)
; SSE-NEXT: retq
;
-; AVX-LABEL: one_mask_bit_set4:
-; AVX: ## %bb.0:
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT: vmovhps %xmm0, 24(%rdi)
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; AVX1-LABEL: one_mask_bit_set4:
+; AVX1: ## %bb.0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vmovhps %xmm0, 24(%rdi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: one_mask_bit_set4:
+; AVX2: ## %bb.0:
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3]
+; AVX2-NEXT: vmovlps %xmm0, 24(%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: one_mask_bit_set4:
+; AVX512: ## %bb.0:
+; AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3]
+; AVX512-NEXT: vmovlps %xmm0, 24(%rdi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
;
; X86-AVX512-LABEL: one_mask_bit_set4:
; X86-AVX512: ## %bb.0:
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
-; X86-AVX512-NEXT: vmovhps %xmm0, 24(%eax)
+; X86-AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3]
+; X86-AVX512-NEXT: vmovlps %xmm0, 24(%eax)
; X86-AVX512-NEXT: vzeroupper
; X86-AVX512-NEXT: retl
call void @llvm.masked.store.v4f64.p0(<4 x double> %val, ptr %addr, i32 4, <4 x i1><i1 false, i1 false, i1 false, i1 true>)
@@ -4990,8 +5004,7 @@ define void @one_mask_bit_set6(ptr %addr, <16 x i64> %val) nounwind {
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0
; X86-AVX512-NEXT: vmovlps %xmm0, 48(%eax)
-; X86-AVX512-NEXT: vextractf128 $1, %ymm1, %xmm0
-; X86-AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X86-AVX512-NEXT: vpermpd {{.*#+}} zmm0 = zmm1[3,2,2,3,7,6,6,7]
; X86-AVX512-NEXT: vmovlps %xmm0, 88(%eax)
; X86-AVX512-NEXT: vzeroupper
; X86-AVX512-NEXT: retl
@@ -5934,18 +5947,19 @@ define void @store_v24i32_v24i32_stride6_vf4_only_even_numbered_elts(ptr %trigge
; AVX2-NEXT: vpcmpgtd %ymm5, %ymm3, %ymm3
; AVX2-NEXT: vpacksswb %ymm3, %ymm4, %ymm3
; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm3[0,2,1,3]
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4
; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
; AVX2-NEXT: vpslld $31, %ymm3, %ymm3
; AVX2-NEXT: vpmaskmovd %ymm0, %ymm3, (%rdx)
+; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT: vpslld $31, %ymm0, %ymm0
+; AVX2-NEXT: vpmaskmovd %ymm1, %ymm0, 32(%rdx)
; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm0
+; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; AVX2-NEXT: vpslld $31, %ymm0, %ymm0
; AVX2-NEXT: vpmaskmovd %ymm2, %ymm0, 64(%rdx)
-; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT: vpslld $31, %ymm0, %ymm0
-; AVX2-NEXT: vpmaskmovd %ymm1, %ymm0, 32(%rdx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/matrix-multiply.ll b/llvm/test/CodeGen/X86/matrix-multiply.ll
index 9b8816bd11f70..cf6e0f38dd2f2 100644
--- a/llvm/test/CodeGen/X86/matrix-multiply.ll
+++ b/llvm/test/CodeGen/X86/matrix-multiply.ll
@@ -357,14 +357,14 @@ define <9 x float> @test_mul3x3_f32(<9 x float> %a0, <9 x float> %a1) nounwind {
;
; AVX512F-LABEL: test_mul3x3_f32:
; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: valignd {{.*#+}} zmm2 = zmm0[3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2]
-; AVX512F-NEXT: vbroadcastss %xmm1, %xmm3
-; AVX512F-NEXT: vmulps %xmm3, %xmm0, %xmm3
+; AVX512F-NEXT: vbroadcastss %xmm1, %xmm2
+; AVX512F-NEXT: vmulps %xmm2, %xmm0, %xmm3
; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX512F-NEXT: valignd {{.*#+}} zmm2 = zmm0[3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2]
; AVX512F-NEXT: vmovshdup {{.*#+}} xmm6 = xmm1[1,1,3,3]
; AVX512F-NEXT: vmulps %xmm6, %xmm2, %xmm4
; AVX512F-NEXT: vaddps %xmm4, %xmm3, %xmm4
-; AVX512F-NEXT: vshufpd {{.*#+}} xmm3 = xmm5[1,0]
+; AVX512F-NEXT: vpermpd {{.*#+}} zmm3 = zmm0[3,3,3,3,7,7,7,7]
; AVX512F-NEXT: vshufps {{.*#+}} xmm7 = xmm1[3,3,3,3]
; AVX512F-NEXT: vshufpd {{.*#+}} xmm8 = xmm1[1,0]
; AVX512F-NEXT: vshufps {{.*#+}} xmm9 = xmm1[2,2,2,2]
@@ -416,38 +416,38 @@ define <9 x float> @test_mul3x3_f32(<9 x float> %a0, <9 x float> %a1) nounwind {
;
; AVX512VL-LABEL: test_mul3x3_f32:
; AVX512VL: # %bb.0: # %entry
-; AVX512VL-NEXT: valignd {{.*#+}} zmm2 = zmm0[3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2]
-; AVX512VL-NEXT: vbroadcastss %xmm1, %xmm3
-; AVX512VL-NEXT: vmulps %xmm3, %xmm0, %xmm3
-; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX512VL-NEXT: vbroadcastss %xmm1, %xmm2
+; AVX512VL-NEXT: vmulps %xmm2, %xmm0, %xmm2
+; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX512VL-NEXT: valignd {{.*#+}} zmm4 = zmm0[3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2]
; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm5 = xmm1[1,1,3,3]
-; AVX512VL-NEXT: vmulps %xmm5, %xmm2, %xmm6
-; AVX512VL-NEXT: vaddps %xmm6, %xmm3, %xmm3
-; AVX512VL-NEXT: vshufpd {{.*#+}} xmm6 = xmm4[1,0]
+; AVX512VL-NEXT: vmulps %xmm5, %xmm4, %xmm6
+; AVX512VL-NEXT: vaddps %xmm6, %xmm2, %xmm2
+; AVX512VL-NEXT: vpermpd {{.*#+}} zmm6 = zmm0[3,3,3,3,7,7,7,7]
; AVX512VL-NEXT: vshufps {{.*#+}} xmm7 = xmm1[3,3,3,3]
; AVX512VL-NEXT: vshufpd {{.*#+}} xmm8 = xmm1[1,0]
; AVX512VL-NEXT: vshufps {{.*#+}} xmm9 = xmm1[2,2,2,2]
; AVX512VL-NEXT: vmulps %xmm6, %xmm9, %xmm9
-; AVX512VL-NEXT: vaddps %xmm3, %xmm9, %xmm3
+; AVX512VL-NEXT: vaddps %xmm2, %xmm9, %xmm2
; AVX512VL-NEXT: vshufpd {{.*#+}} xmm9 = xmm0[1,0]
; AVX512VL-NEXT: vmulss %xmm1, %xmm9, %xmm10
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; AVX512VL-NEXT: vmulss %xmm5, %xmm4, %xmm5
+; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; AVX512VL-NEXT: vmulss %xmm5, %xmm3, %xmm5
; AVX512VL-NEXT: vaddss %xmm5, %xmm10, %xmm5
; AVX512VL-NEXT: vextractf32x4 $2, %zmm0, %xmm10
; AVX512VL-NEXT: vmulss %xmm8, %xmm10, %xmm8
; AVX512VL-NEXT: vaddss %xmm5, %xmm8, %xmm5
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm5[0],xmm3[3]
+; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm5[0],xmm2[3]
; AVX512VL-NEXT: vmulps %xmm7, %xmm0, %xmm5
; AVX512VL-NEXT: vextractf128 $1, %ymm1, %xmm8
; AVX512VL-NEXT: vmovsldup {{.*#+}} xmm11 = xmm8[0,0,2,2]
-; AVX512VL-NEXT: vmulps %xmm2, %xmm11, %xmm11
+; AVX512VL-NEXT: vmulps %xmm4, %xmm11, %xmm11
; AVX512VL-NEXT: vaddps %xmm5, %xmm11, %xmm5
; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm11 = xmm8[1,1,3,3]
; AVX512VL-NEXT: vmulps %xmm6, %xmm11, %xmm12
; AVX512VL-NEXT: vaddps %xmm5, %xmm12, %xmm5
; AVX512VL-NEXT: vmulss %xmm7, %xmm9, %xmm7
-; AVX512VL-NEXT: vmulss %xmm4, %xmm8, %xmm12
+; AVX512VL-NEXT: vmulss %xmm3, %xmm8, %xmm12
; AVX512VL-NEXT: vaddss %xmm7, %xmm12, %xmm7
; AVX512VL-NEXT: vmulss %xmm11, %xmm10, %xmm11
; AVX512VL-NEXT: vaddss %xmm7, %xmm11, %xmm7
@@ -456,19 +456,19 @@ define <9 x float> @test_mul3x3_f32(<9 x float> %a0, <9 x float> %a1) nounwind {
; AVX512VL-NEXT: vshufpd {{.*#+}} xmm11 = xmm8[1,0]
; AVX512VL-NEXT: vshufps {{.*#+}} xmm8 = xmm8[2,2,2,2]
; AVX512VL-NEXT: vmulps %xmm0, %xmm8, %xmm0
-; AVX512VL-NEXT: vmulps %xmm7, %xmm2, %xmm2
-; AVX512VL-NEXT: vaddps %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT: vmulps %xmm7, %xmm4, %xmm4
+; AVX512VL-NEXT: vaddps %xmm4, %xmm0, %xmm0
; AVX512VL-NEXT: vextractf32x4 $2, %zmm1, %xmm1
-; AVX512VL-NEXT: vbroadcastss %xmm1, %xmm2
-; AVX512VL-NEXT: vmulps %xmm2, %xmm6, %xmm2
-; AVX512VL-NEXT: vaddps %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vmulss %xmm11, %xmm9, %xmm2
-; AVX512VL-NEXT: vmulss %xmm7, %xmm4, %xmm4
-; AVX512VL-NEXT: vaddss %xmm4, %xmm2, %xmm2
+; AVX512VL-NEXT: vbroadcastss %xmm1, %xmm4
+; AVX512VL-NEXT: vmulps %xmm4, %xmm6, %xmm4
+; AVX512VL-NEXT: vaddps %xmm4, %xmm0, %xmm0
+; AVX512VL-NEXT: vmulss %xmm11, %xmm9, %xmm4
+; AVX512VL-NEXT: vmulss %xmm7, %xmm3, %xmm3
+; AVX512VL-NEXT: vaddss %xmm3, %xmm4, %xmm3
; AVX512VL-NEXT: vmulss %xmm1, %xmm10, %xmm1
-; AVX512VL-NEXT: vaddss %xmm1, %xmm2, %xmm1
+; AVX512VL-NEXT: vaddss %xmm1, %xmm3, %xmm1
; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0,1],xmm1[0],xmm0[3]
-; AVX512VL-NEXT: vinsertf32x4 $1, %xmm5, %zmm3, %zmm2
+; AVX512VL-NEXT: vinsertf32x4 $1, %xmm5, %zmm2, %zmm2
; AVX512VL-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,4,5,6,16,17,18,0,0,0,0,0,0,0]
; AVX512VL-NEXT: vpermi2ps %zmm1, %zmm2, %zmm0
; AVX512VL-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
index 9b08d8baacee1..2ac691d547f31 100644
--- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll
+++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
@@ -1223,17 +1223,17 @@ define dso_local void @zext_v16i8_v16i64(<16 x i8> %x, ptr %y) nounwind "min-leg
; CHECK-LABEL: zext_v16i8_v16i64:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm1
; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; CHECK-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
-; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm1
-; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
-; CHECK-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
; CHECK-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; CHECK-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
; CHECK-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: vmovdqa %ymm0, (%rdi)
+; CHECK-NEXT: vmovdqa %ymm3, 32(%rdi)
; CHECK-NEXT: vmovdqa %ymm1, 64(%rdi)
-; CHECK-NEXT: vmovdqa %ymm3, 96(%rdi)
-; CHECK-NEXT: vmovdqa %ymm2, 32(%rdi)
+; CHECK-NEXT: vmovdqa %ymm2, 96(%rdi)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%a = zext <16 x i8> %x to <16 x i64>
diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll
index 8fd8e0e8120c1..4d0941711ff52 100644
--- a/llvm/test/CodeGen/X86/oddshuffles.ll
+++ b/llvm/test/CodeGen/X86/oddshuffles.ll
@@ -1464,8 +1464,8 @@ define void @interleave_24i32_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
; AVX1-NEXT: vmovups 32(%rdi), %ymm1
; AVX1-NEXT: vmovups (%rdi), %ymm2
; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7]
-; AVX1-NEXT: vmovups 16(%rdi), %xmm4
-; AVX1-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,1],ymm1[1,3],ymm4[6,5],ymm1[5,7]
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,0],mem[1,0]
; AVX1-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,3],ymm4[0,2],ymm3[4,7],ymm4[4,6]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3,0,1]
; AVX1-NEXT: vshufps {{.*#+}} ymm5 = ymm0[1,0],ymm4[2,0],ymm0[5,4],ymm4[6,4]
@@ -1568,8 +1568,8 @@ define void @interleave_24i32_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
; XOP-NEXT: vmovups 32(%rdi), %ymm1
; XOP-NEXT: vmovups (%rdi), %ymm2
; XOP-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7]
-; XOP-NEXT: vmovups 16(%rdi), %xmm4
-; XOP-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,1],ymm1[1,3],ymm4[6,5],ymm1[5,7]
+; XOP-NEXT: vextractf128 $1, %ymm3, %xmm4
+; XOP-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,0],mem[1,0]
; XOP-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,3],ymm4[0,2],ymm3[4,7],ymm4[4,6]
; XOP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3,0,1]
; XOP-NEXT: vshufps {{.*#+}} ymm5 = ymm0[1,0],ymm4[2,0],ymm0[5,4],ymm4[6,4]
diff --git a/llvm/test/CodeGen/X86/pmul.ll b/llvm/test/CodeGen/X86/pmul.ll
index c7cc2acaf2627..b8f5802421a7a 100644
--- a/llvm/test/CodeGen/X86/pmul.ll
+++ b/llvm/test/CodeGen/X86/pmul.ll
@@ -1123,15 +1123,26 @@ define <4 x i32> @mul_v4i64_zero_lower(<4 x i32> %val1, <4 x i64> %val2) {
; SSE41-NEXT: movaps %xmm3, %xmm0
; SSE41-NEXT: retq
;
-; AVX-LABEL: mul_v4i64_zero_lower:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX-NEXT: vpsrlq $32, %ymm1, %ymm1
-; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
-; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; AVX2-LABEL: mul_v4i64_zero_lower:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1
+; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,2,4,6]
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: mul_v4i64_zero_lower:
+; AVX512: # %bb.0: # %entry
+; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512-NEXT: vpsrlq $32, %ymm1, %ymm1
+; AVX512-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
entry:
%val1a = zext <4 x i32> %val1 to <4 x i64>
%val2a = and <4 x i64> %val2, <i64 -4294967296, i64 -4294967296, i64 -4294967296, i64 -4294967296>
@@ -1197,9 +1208,9 @@ define <8 x i32> @mul_v8i64_zero_upper(<8 x i32> %val1, <8 x i32> %val2) {
; AVX512-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
; AVX512-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
; AVX512-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
-; AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15]
+; AVX512-NEXT: vpermd %zmm0, %zmm1, %zmm0
+; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512-NEXT: retq
entry:
%val1a = zext <8 x i32> %val1 to <8 x i64>
diff --git a/llvm/test/CodeGen/X86/pr132844.ll b/llvm/test/CodeGen/X86/pr132844.ll
index ded100b2accce..8b744d245bf17 100644
--- a/llvm/test/CodeGen/X86/pr132844.ll
+++ b/llvm/test/CodeGen/X86/pr132844.ll
@@ -4,11 +4,11 @@
define { ptr, i8 } @PR132844(<4 x ptr> %0, <4 x ptr> %1) {
; CHECK-LABEL: PR132844:
; CHECK: # %bb.0:
-; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2
-; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; CHECK-NEXT: movb $10, %al
-; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: vinserti64x2 $1, 16, %ymm2, %ymm0 {%k1}
+; CHECK-NEXT: vpmovsxbq {{.*#+}} xmm2 = [0,7]
+; CHECK-NEXT: vmovdqa %ymm0, %ymm3
+; CHECK-NEXT: vpermt2q 0, %ymm2, %ymm3
+; CHECK-NEXT: vpermt2q %ymm0, %ymm2, %ymm1
+; CHECK-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm0
; CHECK-NEXT: vmovdqu %ymm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: xorl %edx, %edx
diff --git a/llvm/test/CodeGen/X86/pr29112.ll b/llvm/test/CodeGen/X86/pr29112.ll
index 2e5c6f047292c..31a1c5a77d88d 100644
--- a/llvm/test/CodeGen/X86/pr29112.ll
+++ b/llvm/test/CodeGen/X86/pr29112.ll
@@ -11,8 +11,6 @@ define <4 x float> @bar(ptr %a1p, ptr %a2p, <4 x float> %a3, <4 x float> %a4, <1
; CHECK-NEXT: subq $136, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 144
; CHECK-NEXT: vmovaps %xmm1, %xmm13
-; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,20,1,17]
-; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm5
; CHECK-NEXT: vunpcklps {{.*#+}} ymm0 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
; CHECK-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,1,2,3]
; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm0 = [4,21,1,17,4,21,5,21]
@@ -21,6 +19,8 @@ define <4 x float> @bar(ptr %a1p, ptr %a2p, <4 x float> %a3, <4 x float> %a4, <1
; CHECK-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,20,1,27]
; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm4
+; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,20,1,17]
+; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm5
; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm7 = [5,20,1,19,5,20,5,23]
; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm7
; CHECK-NEXT: vpmovsxbd {{.*#+}} ymm0 = [4,20,1,19,4,20,5,23]
diff --git a/llvm/test/CodeGen/X86/pr62286.ll b/llvm/test/CodeGen/X86/pr62286.ll
index 2d1b7fcbf0239..ed09d8adf1bc8 100644
--- a/llvm/test/CodeGen/X86/pr62286.ll
+++ b/llvm/test/CodeGen/X86/pr62286.ll
@@ -28,15 +28,16 @@ define i64 @PR62286(i32 %a) {
; AVX1-NEXT: vmovd %edi, %xmm0
; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3]
; AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1
+; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: PR62286:
diff --git a/llvm/test/CodeGen/X86/pr95278.ll b/llvm/test/CodeGen/X86/pr95278.ll
index 104fc04d68cdb..32b507d1a46a2 100644
--- a/llvm/test/CodeGen/X86/pr95278.ll
+++ b/llvm/test/CodeGen/X86/pr95278.ll
@@ -5,8 +5,8 @@ define void @PR95278(ptr %p0, ptr %p1) {
; CHECK-LABEL: PR95278:
; CHECK: # %bb.0:
; CHECK-NEXT: vcvtph2ps 2016(%rdi), %zmm0
-; CHECK-NEXT: vextractf32x4 $3, %zmm0, %xmm0
-; CHECK-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT: vpmovsxbq {{.*#+}} xmm1 = [7,6]
+; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; CHECK-NEXT: vcvtps2ph $4, %xmm0, %xmm0
; CHECK-NEXT: vpextrw $0, %xmm0, (%rsi)
; CHECK-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/pr97968.ll b/llvm/test/CodeGen/X86/pr97968.ll
index a539a33e9a281..ca5c63cdc1c2e 100644
--- a/llvm/test/CodeGen/X86/pr97968.ll
+++ b/llvm/test/CodeGen/X86/pr97968.ll
@@ -5,8 +5,8 @@ define <2 x i32> @PR97968(<16 x i32> %a0) {
; CHECK-LABEL: PR97968:
; CHECK: # %bb.0:
; CHECK-NEXT: vpmovsxbd {{.*#+}} xmm1 = [2,7,2,7]
-; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
%sub0 = shufflevector <16 x i32> %a0, <16 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
diff --git a/llvm/test/CodeGen/X86/shift-i512.ll b/llvm/test/CodeGen/X86/shift-i512.ll
index f7dd1dc0949f5..91dcba40fcf5a 100644
--- a/llvm/test/CodeGen/X86/shift-i512.ll
+++ b/llvm/test/CodeGen/X86/shift-i512.ll
@@ -8,19 +8,21 @@
define <8 x i64> @shl_i512_1(<8 x i64> %a) {
; AVX512VL-LABEL: shl_i512_1:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: valignq {{.*#+}} zmm1 = zmm0[3,4,5,6,7,0,1,2]
+; AVX512VL-NEXT: vextracti32x4 $2, %zmm0, %xmm1
; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512VL-NEXT: vpsllq $1, %xmm0, %xmm3
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
; AVX512VL-NEXT: vpsrlq $63, %xmm4, %xmm4
-; AVX512VL-NEXT: vpaddq %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpor %xmm4, %xmm2, %xmm2
-; AVX512VL-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm3
-; AVX512VL-NEXT: vpaddq %ymm3, %ymm3, %ymm3
+; AVX512VL-NEXT: vpaddq %xmm2, %xmm2, %xmm5
+; AVX512VL-NEXT: vpor %xmm4, %xmm5, %xmm4
+; AVX512VL-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512VL-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
; AVX512VL-NEXT: vpsrlq $63, %ymm1, %ymm1
-; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
+; AVX512VL-NEXT: vpaddq %ymm2, %ymm2, %ymm2
+; AVX512VL-NEXT: vpor %ymm1, %ymm2, %ymm1
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1
; AVX512VL-NEXT: vpsrlq $63, %zmm0, %zmm2
; AVX512VL-NEXT: vpshufd {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
; AVX512VL-NEXT: vpaddq %zmm0, %zmm0, %zmm0
diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll
index 26af46263c0e2..5c61622b4f922 100644
--- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll
+++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll
@@ -965,24 +965,21 @@ define <16 x i8> @evenelts_v32i16_trunc_v16i16_to_v16i8(<32 x i16> %n2) nounwind
;
; AVX512F-LABEL: evenelts_v32i16_trunc_v16i16_to_v16i8:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %rbp
; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm1
-; AVX512F-NEXT: vextracti32x4 $3, %zmm0, %xmm2
-; AVX512F-NEXT: vpextrw $6, %xmm2, %eax
-; AVX512F-NEXT: vpextrw $4, %xmm2, %ecx
-; AVX512F-NEXT: vpextrw $2, %xmm2, %edx
-; AVX512F-NEXT: vmovd %xmm2, %esi
-; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; AVX512F-NEXT: vpextrw $6, %xmm2, %edi
-; AVX512F-NEXT: vpextrw $4, %xmm2, %r8d
-; AVX512F-NEXT: vpextrw $2, %xmm2, %r9d
-; AVX512F-NEXT: vmovd %xmm2, %r10d
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512F-NEXT: vpextrw $6, %xmm0, %r11d
-; AVX512F-NEXT: vpextrw $4, %xmm0, %ebx
-; AVX512F-NEXT: vpextrw $2, %xmm0, %ebp
-; AVX512F-NEXT: vpinsrb $5, %ebp, %xmm1, %xmm0
+; AVX512F-NEXT: vextracti32x4 $3, %zmm0, %xmm1
+; AVX512F-NEXT: vpextrw $6, %xmm1, %eax
+; AVX512F-NEXT: vpextrw $4, %xmm1, %ecx
+; AVX512F-NEXT: vpextrw $2, %xmm1, %edx
+; AVX512F-NEXT: vmovd %xmm1, %esi
+; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm1
+; AVX512F-NEXT: vpextrw $6, %xmm1, %edi
+; AVX512F-NEXT: vpextrw $4, %xmm1, %r8d
+; AVX512F-NEXT: vpextrw $2, %xmm1, %r9d
+; AVX512F-NEXT: vmovd %xmm1, %r10d
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT: vpextrw $6, %xmm1, %r11d
+; AVX512F-NEXT: vpextrw $4, %xmm1, %ebx
+; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpinsrb $6, %ebx, %xmm0, %xmm0
; AVX512F-NEXT: vpinsrb $7, %r11d, %xmm0, %xmm0
; AVX512F-NEXT: vpinsrb $8, %r10d, %xmm0, %xmm0
@@ -994,7 +991,6 @@ define <16 x i8> @evenelts_v32i16_trunc_v16i16_to_v16i8(<32 x i16> %n2) nounwind
; AVX512F-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; AVX512F-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
; AVX512F-NEXT: popq %rbx
-; AVX512F-NEXT: popq %rbp
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
@@ -1039,24 +1035,21 @@ define <16 x i8> @evenelts_v32i16_trunc_v16i16_to_v16i8(<32 x i16> %n2) nounwind
;
; AVX512BW-LABEL: evenelts_v32i16_trunc_v16i16_to_v16i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: pushq %rbp
; AVX512BW-NEXT: pushq %rbx
-; AVX512BW-NEXT: vpmovdb %zmm0, %xmm1
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm2
-; AVX512BW-NEXT: vpextrw $6, %xmm2, %eax
-; AVX512BW-NEXT: vpextrw $4, %xmm2, %ecx
-; AVX512BW-NEXT: vpextrw $2, %xmm2, %edx
-; AVX512BW-NEXT: vmovd %xmm2, %esi
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; AVX512BW-NEXT: vpextrw $6, %xmm2, %edi
-; AVX512BW-NEXT: vpextrw $4, %xmm2, %r8d
-; AVX512BW-NEXT: vpextrw $2, %xmm2, %r9d
-; AVX512BW-NEXT: vmovd %xmm2, %r10d
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512BW-NEXT: vpextrw $6, %xmm0, %r11d
-; AVX512BW-NEXT: vpextrw $4, %xmm0, %ebx
-; AVX512BW-NEXT: vpextrw $2, %xmm0, %ebp
-; AVX512BW-NEXT: vpinsrb $5, %ebp, %xmm1, %xmm0
+; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
+; AVX512BW-NEXT: vpextrw $6, %xmm1, %eax
+; AVX512BW-NEXT: vpextrw $4, %xmm1, %ecx
+; AVX512BW-NEXT: vpextrw $2, %xmm1, %edx
+; AVX512BW-NEXT: vmovd %xmm1, %esi
+; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm1
+; AVX512BW-NEXT: vpextrw $6, %xmm1, %edi
+; AVX512BW-NEXT: vpextrw $4, %xmm1, %r8d
+; AVX512BW-NEXT: vpextrw $2, %xmm1, %r9d
+; AVX512BW-NEXT: vmovd %xmm1, %r10d
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT: vpextrw $6, %xmm1, %r11d
+; AVX512BW-NEXT: vpextrw $4, %xmm1, %ebx
+; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vpinsrb $6, %ebx, %xmm0, %xmm0
; AVX512BW-NEXT: vpinsrb $7, %r11d, %xmm0, %xmm0
; AVX512BW-NEXT: vpinsrb $8, %r10d, %xmm0, %xmm0
@@ -1068,74 +1061,47 @@ define <16 x i8> @evenelts_v32i16_trunc_v16i16_to_v16i8(<32 x i16> %n2) nounwind
; AVX512BW-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
; AVX512BW-NEXT: popq %rbx
-; AVX512BW-NEXT: popq %rbp
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
-; AVX512BWVL-ONLY-LABEL: evenelts_v32i16_trunc_v16i16_to_v16i8:
-; AVX512BWVL-ONLY: # %bb.0:
-; AVX512BWVL-ONLY-NEXT: pushq %rbp
-; AVX512BWVL-ONLY-NEXT: pushq %r14
-; AVX512BWVL-ONLY-NEXT: pushq %rbx
-; AVX512BWVL-ONLY-NEXT: vextracti32x4 $3, %zmm0, %xmm1
-; AVX512BWVL-ONLY-NEXT: vpextrw $6, %xmm1, %eax
-; AVX512BWVL-ONLY-NEXT: vpextrw $4, %xmm1, %ecx
-; AVX512BWVL-ONLY-NEXT: vpextrw $2, %xmm1, %edx
-; AVX512BWVL-ONLY-NEXT: vmovd %xmm1, %esi
-; AVX512BWVL-ONLY-NEXT: vextracti32x4 $2, %zmm0, %xmm1
-; AVX512BWVL-ONLY-NEXT: vpextrw $6, %xmm1, %edi
-; AVX512BWVL-ONLY-NEXT: vpextrw $4, %xmm1, %r8d
-; AVX512BWVL-ONLY-NEXT: vpextrw $2, %xmm1, %r9d
-; AVX512BWVL-ONLY-NEXT: vmovd %xmm1, %r10d
-; AVX512BWVL-ONLY-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BWVL-ONLY-NEXT: vpextrw $6, %xmm1, %r11d
-; AVX512BWVL-ONLY-NEXT: vpextrw $4, %xmm1, %ebx
-; AVX512BWVL-ONLY-NEXT: vpextrw $2, %xmm1, %ebp
-; AVX512BWVL-ONLY-NEXT: vmovd %xmm1, %r14d
-; AVX512BWVL-ONLY-NEXT: vpmovdb %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT: vpinsrb $5, %ebp, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT: vpinsrb $6, %ebx, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT: vpinsrb $7, %r11d, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT: vpinsrb $8, %r10d, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT: vpinsrb $9, %r9d, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT: vpinsrb $10, %r8d, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT: vpinsrb $11, %edi, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT: vpinsrb $12, %esi, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT: vpinsrb $13, %edx, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT: popq %rbx
-; AVX512BWVL-ONLY-NEXT: popq %r14
-; AVX512BWVL-ONLY-NEXT: popq %rbp
-; AVX512BWVL-ONLY-NEXT: vzeroupper
-; AVX512BWVL-ONLY-NEXT: retq
-;
-; AVX512VBMI-FAST-LABEL: evenelts_v32i16_trunc_v16i16_to_v16i8:
-; AVX512VBMI-FAST: # %bb.0:
-; AVX512VBMI-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,79]
-; AVX512VBMI-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VBMI-FAST-NEXT: vpermi2b %zmm2, %zmm0, %zmm1
-; AVX512VBMI-FAST-NEXT: vextracti32x4 $3, %zmm0, %xmm0
-; AVX512VBMI-FAST-NEXT: vpextrw $6, %xmm0, %eax
-; AVX512VBMI-FAST-NEXT: vpinsrb $15, %eax, %xmm1, %xmm0
-; AVX512VBMI-FAST-NEXT: vzeroupper
-; AVX512VBMI-FAST-NEXT: retq
-;
-; AVX512VBMI-SLOW-LABEL: evenelts_v32i16_trunc_v16i16_to_v16i8:
-; AVX512VBMI-SLOW: # %bb.0:
-; AVX512VBMI-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,4,8,12,16,20,24,28,32,36,40,44,48,77,78,79]
-; AVX512VBMI-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VBMI-SLOW-NEXT: vpermi2b %zmm2, %zmm0, %zmm1
-; AVX512VBMI-SLOW-NEXT: vextracti32x4 $3, %zmm0, %xmm0
-; AVX512VBMI-SLOW-NEXT: vpextrw $6, %xmm0, %eax
-; AVX512VBMI-SLOW-NEXT: vpextrw $4, %xmm0, %ecx
-; AVX512VBMI-SLOW-NEXT: vpextrw $2, %xmm0, %edx
-; AVX512VBMI-SLOW-NEXT: vpinsrb $13, %edx, %xmm1, %xmm0
-; AVX512VBMI-SLOW-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; AVX512VBMI-SLOW-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX512VBMI-SLOW-NEXT: vzeroupper
-; AVX512VBMI-SLOW-NEXT: retq
+; AVX512BWVL-LABEL: evenelts_v32i16_trunc_v16i16_to_v16i8:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: pushq %rbp
+; AVX512BWVL-NEXT: pushq %r14
+; AVX512BWVL-NEXT: pushq %rbx
+; AVX512BWVL-NEXT: vextracti32x4 $3, %zmm0, %xmm1
+; AVX512BWVL-NEXT: vpextrw $6, %xmm1, %eax
+; AVX512BWVL-NEXT: vpextrw $4, %xmm1, %ecx
+; AVX512BWVL-NEXT: vpextrw $2, %xmm1, %edx
+; AVX512BWVL-NEXT: vmovd %xmm1, %esi
+; AVX512BWVL-NEXT: vextracti32x4 $2, %zmm0, %xmm1
+; AVX512BWVL-NEXT: vpextrw $6, %xmm1, %edi
+; AVX512BWVL-NEXT: vpextrw $4, %xmm1, %r8d
+; AVX512BWVL-NEXT: vpextrw $2, %xmm1, %r9d
+; AVX512BWVL-NEXT: vmovd %xmm1, %r10d
+; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512BWVL-NEXT: vpextrw $6, %xmm1, %r11d
+; AVX512BWVL-NEXT: vpextrw $4, %xmm1, %ebx
+; AVX512BWVL-NEXT: vpextrw $2, %xmm1, %ebp
+; AVX512BWVL-NEXT: vmovd %xmm1, %r14d
+; AVX512BWVL-NEXT: vpmovdb %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpinsrb $5, %ebp, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpinsrb $6, %ebx, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpinsrb $7, %r11d, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpinsrb $8, %r10d, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpinsrb $9, %r9d, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpinsrb $10, %r8d, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpinsrb $11, %edi, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpinsrb $12, %esi, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpinsrb $13, %edx, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX512BWVL-NEXT: popq %rbx
+; AVX512BWVL-NEXT: popq %r14
+; AVX512BWVL-NEXT: popq %rbp
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
%n0 = shufflevector <32 x i16> %n2, <32 x i16> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
%n1 = trunc <16 x i16> %n0 to <16 x i8>
ret <16 x i8> %n1
@@ -1304,186 +1270,50 @@ define <16 x i8> @oddelts_v32i16_trunc_v16i16_to_v16i8(<32 x i16> %n2) nounwind
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; AVX512F-LABEL: oddelts_v32i16_trunc_v16i16_to_v16i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %rbp
-; AVX512F-NEXT: pushq %r14
-; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: vextracti32x4 $3, %zmm0, %xmm1
-; AVX512F-NEXT: vpextrw $7, %xmm1, %eax
-; AVX512F-NEXT: vpextrw $5, %xmm1, %ecx
-; AVX512F-NEXT: vpextrw $3, %xmm1, %edx
-; AVX512F-NEXT: vpextrw $1, %xmm1, %esi
-; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm1
-; AVX512F-NEXT: vpextrw $7, %xmm1, %edi
-; AVX512F-NEXT: vpextrw $5, %xmm1, %r8d
-; AVX512F-NEXT: vpextrw $3, %xmm1, %r9d
-; AVX512F-NEXT: vpextrw $1, %xmm1, %r10d
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512F-NEXT: vpextrw $7, %xmm1, %r11d
-; AVX512F-NEXT: vpextrw $5, %xmm1, %ebx
-; AVX512F-NEXT: vpextrw $3, %xmm1, %ebp
-; AVX512F-NEXT: vpextrw $1, %xmm1, %r14d
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512F-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; AVX512F-NEXT: vpinsrb $5, %ebp, %xmm0, %xmm0
-; AVX512F-NEXT: vpinsrb $6, %ebx, %xmm0, %xmm0
-; AVX512F-NEXT: vpinsrb $7, %r11d, %xmm0, %xmm0
-; AVX512F-NEXT: vpinsrb $8, %r10d, %xmm0, %xmm0
-; AVX512F-NEXT: vpinsrb $9, %r9d, %xmm0, %xmm0
-; AVX512F-NEXT: vpinsrb $10, %r8d, %xmm0, %xmm0
-; AVX512F-NEXT: vpinsrb $11, %edi, %xmm0, %xmm0
-; AVX512F-NEXT: vpinsrb $12, %esi, %xmm0, %xmm0
-; AVX512F-NEXT: vpinsrb $13, %edx, %xmm0, %xmm0
-; AVX512F-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; AVX512F-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX512F-NEXT: popq %rbx
-; AVX512F-NEXT: popq %r14
-; AVX512F-NEXT: popq %rbp
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: oddelts_v32i16_trunc_v16i16_to_v16i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: pushq %rbp
-; AVX512VL-NEXT: pushq %r14
-; AVX512VL-NEXT: pushq %rbx
-; AVX512VL-NEXT: vextracti32x4 $3, %zmm0, %xmm1
-; AVX512VL-NEXT: vpextrw $7, %xmm1, %eax
-; AVX512VL-NEXT: vpextrw $5, %xmm1, %ecx
-; AVX512VL-NEXT: vpextrw $3, %xmm1, %edx
-; AVX512VL-NEXT: vpextrw $1, %xmm1, %esi
-; AVX512VL-NEXT: vextracti32x4 $2, %zmm0, %xmm1
-; AVX512VL-NEXT: vpextrw $7, %xmm1, %edi
-; AVX512VL-NEXT: vpextrw $5, %xmm1, %r8d
-; AVX512VL-NEXT: vpextrw $3, %xmm1, %r9d
-; AVX512VL-NEXT: vpextrw $1, %xmm1, %r10d
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpextrw $7, %xmm1, %r11d
-; AVX512VL-NEXT: vpextrw $5, %xmm1, %ebx
-; AVX512VL-NEXT: vpextrw $3, %xmm1, %ebp
-; AVX512VL-NEXT: vpextrw $1, %xmm1, %r14d
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512VL-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; AVX512VL-NEXT: vpinsrb $5, %ebp, %xmm0, %xmm0
-; AVX512VL-NEXT: vpinsrb $6, %ebx, %xmm0, %xmm0
-; AVX512VL-NEXT: vpinsrb $7, %r11d, %xmm0, %xmm0
-; AVX512VL-NEXT: vpinsrb $8, %r10d, %xmm0, %xmm0
-; AVX512VL-NEXT: vpinsrb $9, %r9d, %xmm0, %xmm0
-; AVX512VL-NEXT: vpinsrb $10, %r8d, %xmm0, %xmm0
-; AVX512VL-NEXT: vpinsrb $11, %edi, %xmm0, %xmm0
-; AVX512VL-NEXT: vpinsrb $12, %esi, %xmm0, %xmm0
-; AVX512VL-NEXT: vpinsrb $13, %edx, %xmm0, %xmm0
-; AVX512VL-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; AVX512VL-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX512VL-NEXT: popq %rbx
-; AVX512VL-NEXT: popq %r14
-; AVX512VL-NEXT: popq %rbp
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: oddelts_v32i16_trunc_v16i16_to_v16i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: pushq %rbp
-; AVX512BW-NEXT: pushq %r14
-; AVX512BW-NEXT: pushq %rbx
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
-; AVX512BW-NEXT: vpextrw $7, %xmm1, %eax
-; AVX512BW-NEXT: vpextrw $5, %xmm1, %ecx
-; AVX512BW-NEXT: vpextrw $3, %xmm1, %edx
-; AVX512BW-NEXT: vpextrw $1, %xmm1, %esi
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm1
-; AVX512BW-NEXT: vpextrw $7, %xmm1, %edi
-; AVX512BW-NEXT: vpextrw $5, %xmm1, %r8d
-; AVX512BW-NEXT: vpextrw $3, %xmm1, %r9d
-; AVX512BW-NEXT: vpextrw $1, %xmm1, %r10d
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vpextrw $7, %xmm1, %r11d
-; AVX512BW-NEXT: vpextrw $5, %xmm1, %ebx
-; AVX512BW-NEXT: vpextrw $3, %xmm1, %ebp
-; AVX512BW-NEXT: vpextrw $1, %xmm1, %r14d
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; AVX512BW-NEXT: vpinsrb $5, %ebp, %xmm0, %xmm0
-; AVX512BW-NEXT: vpinsrb $6, %ebx, %xmm0, %xmm0
-; AVX512BW-NEXT: vpinsrb $7, %r11d, %xmm0, %xmm0
-; AVX512BW-NEXT: vpinsrb $8, %r10d, %xmm0, %xmm0
-; AVX512BW-NEXT: vpinsrb $9, %r9d, %xmm0, %xmm0
-; AVX512BW-NEXT: vpinsrb $10, %r8d, %xmm0, %xmm0
-; AVX512BW-NEXT: vpinsrb $11, %edi, %xmm0, %xmm0
-; AVX512BW-NEXT: vpinsrb $12, %esi, %xmm0, %xmm0
-; AVX512BW-NEXT: vpinsrb $13, %edx, %xmm0, %xmm0
-; AVX512BW-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; AVX512BW-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX512BW-NEXT: popq %rbx
-; AVX512BW-NEXT: popq %r14
-; AVX512BW-NEXT: popq %rbp
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-ONLY-LABEL: oddelts_v32i16_trunc_v16i16_to_v16i8:
-; AVX512BWVL-ONLY: # %bb.0:
-; AVX512BWVL-ONLY-NEXT: pushq %rbp
-; AVX512BWVL-ONLY-NEXT: pushq %r14
-; AVX512BWVL-ONLY-NEXT: pushq %rbx
-; AVX512BWVL-ONLY-NEXT: vextracti32x4 $3, %zmm0, %xmm1
-; AVX512BWVL-ONLY-NEXT: vpextrw $7, %xmm1, %eax
-; AVX512BWVL-ONLY-NEXT: vpextrw $5, %xmm1, %ecx
-; AVX512BWVL-ONLY-NEXT: vpextrw $3, %xmm1, %edx
-; AVX512BWVL-ONLY-NEXT: vpextrw $1, %xmm1, %esi
-; AVX512BWVL-ONLY-NEXT: vextracti32x4 $2, %zmm0, %xmm1
-; AVX512BWVL-ONLY-NEXT: vpextrw $7, %xmm1, %edi
-; AVX512BWVL-ONLY-NEXT: vpextrw $5, %xmm1, %r8d
-; AVX512BWVL-ONLY-NEXT: vpextrw $3, %xmm1, %r9d
-; AVX512BWVL-ONLY-NEXT: vpextrw $1, %xmm1, %r10d
-; AVX512BWVL-ONLY-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BWVL-ONLY-NEXT: vpextrw $7, %xmm1, %r11d
-; AVX512BWVL-ONLY-NEXT: vpextrw $5, %xmm1, %ebx
-; AVX512BWVL-ONLY-NEXT: vpextrw $3, %xmm1, %ebp
-; AVX512BWVL-ONLY-NEXT: vpextrw $1, %xmm1, %r14d
-; AVX512BWVL-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BWVL-ONLY-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT: vpinsrb $5, %ebp, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT: vpinsrb $6, %ebx, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT: vpinsrb $7, %r11d, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT: vpinsrb $8, %r10d, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT: vpinsrb $9, %r9d, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT: vpinsrb $10, %r8d, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT: vpinsrb $11, %edi, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT: vpinsrb $12, %esi, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT: vpinsrb $13, %edx, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX512BWVL-ONLY-NEXT: popq %rbx
-; AVX512BWVL-ONLY-NEXT: popq %r14
-; AVX512BWVL-ONLY-NEXT: popq %rbp
-; AVX512BWVL-ONLY-NEXT: vzeroupper
-; AVX512BWVL-ONLY-NEXT: retq
-;
-; AVX512VBMI-FAST-LABEL: oddelts_v32i16_trunc_v16i16_to_v16i8:
-; AVX512VBMI-FAST: # %bb.0:
-; AVX512VBMI-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62]
-; AVX512VBMI-FAST-NEXT: vpermb %zmm0, %zmm1, %zmm0
-; AVX512VBMI-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512VBMI-FAST-NEXT: vzeroupper
-; AVX512VBMI-FAST-NEXT: retq
-;
-; AVX512VBMI-SLOW-LABEL: oddelts_v32i16_trunc_v16i16_to_v16i8:
-; AVX512VBMI-SLOW: # %bb.0:
-; AVX512VBMI-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [2,6,10,14,18,22,26,30,34,38,42,46,50,u,u,u]
-; AVX512VBMI-SLOW-NEXT: vpermb %zmm0, %zmm1, %zmm1
-; AVX512VBMI-SLOW-NEXT: vextracti32x4 $3, %zmm0, %xmm0
-; AVX512VBMI-SLOW-NEXT: vpextrw $7, %xmm0, %eax
-; AVX512VBMI-SLOW-NEXT: vpextrw $5, %xmm0, %ecx
-; AVX512VBMI-SLOW-NEXT: vpextrw $3, %xmm0, %edx
-; AVX512VBMI-SLOW-NEXT: vpinsrb $13, %edx, %xmm1, %xmm0
-; AVX512VBMI-SLOW-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; AVX512VBMI-SLOW-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX512VBMI-SLOW-NEXT: vzeroupper
-; AVX512VBMI-SLOW-NEXT: retq
+; AVX512-LABEL: oddelts_v32i16_trunc_v16i16_to_v16i8:
+; AVX512: # %bb.0:
+; AVX512-NEXT: pushq %rbp
+; AVX512-NEXT: pushq %r14
+; AVX512-NEXT: pushq %rbx
+; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm1
+; AVX512-NEXT: vpextrw $7, %xmm1, %eax
+; AVX512-NEXT: vpextrw $5, %xmm1, %ecx
+; AVX512-NEXT: vpextrw $3, %xmm1, %edx
+; AVX512-NEXT: vpextrw $1, %xmm1, %esi
+; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm1
+; AVX512-NEXT: vpextrw $7, %xmm1, %edi
+; AVX512-NEXT: vpextrw $5, %xmm1, %r8d
+; AVX512-NEXT: vpextrw $3, %xmm1, %r9d
+; AVX512-NEXT: vpextrw $1, %xmm1, %r10d
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT: vpextrw $7, %xmm1, %r11d
+; AVX512-NEXT: vpextrw $5, %xmm1, %ebx
+; AVX512-NEXT: vpextrw $3, %xmm1, %ebp
+; AVX512-NEXT: vpextrw $1, %xmm1, %r14d
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpinsrb $4, %r14d, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrb $5, %ebp, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrb $6, %ebx, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrb $7, %r11d, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrb $8, %r10d, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrb $9, %r9d, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrb $10, %r8d, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrb $11, %edi, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrb $12, %esi, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrb $13, %edx, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; AVX512-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX512-NEXT: popq %rbx
+; AVX512-NEXT: popq %r14
+; AVX512-NEXT: popq %rbp
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%n0 = shufflevector <32 x i16> %n2, <32 x i16> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
%n1 = trunc <16 x i16> %n0 to <16 x i8>
ret <16 x i8> %n1
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; AVX512BWVL-ONLY: {{.*}}
; AVX512VBMI: {{.*}}
+; AVX512VBMI-FAST: {{.*}}
+; AVX512VBMI-SLOW: {{.*}}
diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
index ec442c185706c..78d3914caff0d 100644
--- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
+++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
@@ -406,26 +406,26 @@ define <4 x double> @PR34175(ptr %p) {
;
; AVX512BW-LABEL: PR34175:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0]
-; AVX512BW-NEXT: vpermw (%rdi), %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512BW-NEXT: vcvtdq2pd %xmm0, %ymm0
+; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [32,1,40,3,48,5,56,7]
+; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT: vpermt2w (%rdi), %zmm0, %zmm1
+; AVX512BW-NEXT: vcvtdq2pd %xmm1, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: PR34175:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0]
-; AVX512BWVL-NEXT: vpermw (%rdi), %zmm0, %zmm0
-; AVX512BWVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512BWVL-NEXT: vcvtdq2pd %xmm0, %ymm0
+; AVX512BWVL-NEXT: vpmovsxbw {{.*#+}} xmm0 = [32,1,40,3,48,5,56,7]
+; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512BWVL-NEXT: vpermt2w (%rdi), %zmm0, %zmm1
+; AVX512BWVL-NEXT: vcvtdq2pd %xmm1, %ymm0
; AVX512BWVL-NEXT: retq
;
; AVX512VBMI-LABEL: PR34175:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0]
-; AVX512VBMI-NEXT: vpermw (%rdi), %zmm0, %zmm0
-; AVX512VBMI-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512VBMI-NEXT: vcvtdq2pd %xmm0, %ymm0
+; AVX512VBMI-NEXT: vpmovsxbw {{.*#+}} xmm0 = [32,1,40,3,48,5,56,7]
+; AVX512VBMI-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VBMI-NEXT: vpermt2w (%rdi), %zmm0, %zmm1
+; AVX512VBMI-NEXT: vcvtdq2pd %xmm1, %ymm0
; AVX512VBMI-NEXT: retq
%v = load <32 x i16>, ptr %p, align 2
%shuf = shufflevector <32 x i16> %v, <32 x i16> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
diff --git a/llvm/test/CodeGen/X86/trunc-subvector.ll b/llvm/test/CodeGen/X86/trunc-subvector.ll
index 9db2f6ba1e810..9df844214975c 100644
--- a/llvm/test/CodeGen/X86/trunc-subvector.ll
+++ b/llvm/test/CodeGen/X86/trunc-subvector.ll
@@ -88,12 +88,9 @@ define <2 x i32> @test5(<8 x i32> %v) {
; AVX512-LABEL: test5:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovsxdq %ymm0, %zmm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpextrq $1, %xmm1, %rax
-; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm0
-; AVX512-NEXT: vmovq %xmm0, %rcx
-; AVX512-NEXT: vmovd %eax, %xmm0
-; AVX512-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
+; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm1 = [6,8,0,0]
+; AVX512-NEXT: vpermd %zmm0, %zmm1, %zmm0
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%x = sext <8 x i32> %v to <8 x i64>
@@ -185,13 +182,9 @@ define <2 x i32> @test10(<8 x i32> %v) {
;
; AVX512-LABEL: test10:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpextrq $1, %xmm1, %rax
-; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm0
-; AVX512-NEXT: vmovq %xmm0, %rcx
-; AVX512-NEXT: vmovd %eax, %xmm0
-; AVX512-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
+; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512-NEXT: valignd {{.*#+}} zmm0 = zmm0[3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2]
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%x = zext <8 x i32> %v to <8 x i64>
diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll
index 179e8ad69672b..995e574af44d5 100644
--- a/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll
@@ -101,9 +101,10 @@ define <4 x i64> @strict_vector_fptosi_v4f64_to_v4i64(<4 x double> %a) #0 {
; AVX512F-32-NEXT: subl $32, %esp
; AVX512F-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: vmovhps %xmm0, {{[0-9]+}}(%esp)
-; AVX512F-32-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX512F-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
-; AVX512F-32-NEXT: vmovhps %xmm0, (%esp)
+; AVX512F-32-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512F-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3]
+; AVX512F-32-NEXT: vmovlps %xmm0, (%esp)
; AVX512F-32-NEXT: fldl {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: fisttpll {{[0-9]+}}(%esp)
; AVX512F-32-NEXT: fldl {{[0-9]+}}(%esp)
@@ -131,11 +132,11 @@ define <4 x i64> @strict_vector_fptosi_v4f64_to_v4i64(<4 x double> %a) #0 {
; AVX512F-64: # %bb.0:
; AVX512F-64-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512F-64-NEXT: vcvttsd2si %xmm1, %rax
-; AVX512F-64-NEXT: vmovq %rax, %xmm2
-; AVX512F-64-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX512F-64-NEXT: vcvttsd2si %xmm1, %rax
; AVX512F-64-NEXT: vmovq %rax, %xmm1
-; AVX512F-64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512F-64-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[3,2,2,3]
+; AVX512F-64-NEXT: vcvttsd2si %xmm2, %rax
+; AVX512F-64-NEXT: vmovq %rax, %xmm2
+; AVX512F-64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX512F-64-NEXT: vcvttsd2si %xmm0, %rax
; AVX512F-64-NEXT: vmovq %rax, %xmm2
; AVX512F-64-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
@@ -156,9 +157,10 @@ define <4 x i64> @strict_vector_fptosi_v4f64_to_v4i64(<4 x double> %a) #0 {
; AVX512VL-32-NEXT: subl $32, %esp
; AVX512VL-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
; AVX512VL-32-NEXT: vmovhps %xmm0, {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX512VL-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT: vmovhps %xmm0, (%esp)
+; AVX512VL-32-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512VL-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3]
+; AVX512VL-32-NEXT: vmovlps %xmm0, (%esp)
; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp)
; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp)
; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp)
@@ -186,11 +188,11 @@ define <4 x i64> @strict_vector_fptosi_v4f64_to_v4i64(<4 x double> %a) #0 {
; AVX512VL-64: # %bb.0:
; AVX512VL-64-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512VL-64-NEXT: vcvttsd2si %xmm1, %rax
-; AVX512VL-64-NEXT: vmovq %rax, %xmm2
-; AVX512VL-64-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX512VL-64-NEXT: vcvttsd2si %xmm1, %rax
; AVX512VL-64-NEXT: vmovq %rax, %xmm1
-; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512VL-64-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[3,2,2,3]
+; AVX512VL-64-NEXT: vcvttsd2si %xmm2, %rax
+; AVX512VL-64-NEXT: vmovq %rax, %xmm2
+; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX512VL-64-NEXT: vcvttsd2si %xmm0, %rax
; AVX512VL-64-NEXT: vmovq %rax, %xmm2
; AVX512VL-64-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
@@ -377,16 +379,16 @@ define <4 x i64> @strict_vector_fptoui_v4f64_to_v4i64(<4 x double> %a) #0 {
; AVX512F-32-NEXT: andl $-8, %esp
; AVX512F-32-NEXT: subl $40, %esp
; AVX512F-32-NEXT: .cfi_offset %ebx, -12
-; AVX512F-32-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX512F-32-NEXT: vshufpd {{.*#+}} xmm3 = xmm2[1,0]
+; AVX512F-32-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[3,2,2,3]
; AVX512F-32-NEXT: vmovsd {{.*#+}} xmm1 = [9.2233720368547758E+18,0.0E+0]
; AVX512F-32-NEXT: xorl %eax, %eax
-; AVX512F-32-NEXT: vcomisd %xmm1, %xmm3
+; AVX512F-32-NEXT: vcomisd %xmm1, %xmm2
; AVX512F-32-NEXT: setae %al
; AVX512F-32-NEXT: kmovw %eax, %k1
-; AVX512F-32-NEXT: vmovsd %xmm1, %xmm1, %xmm4 {%k1} {z}
-; AVX512F-32-NEXT: vsubsd %xmm4, %xmm3, %xmm3
-; AVX512F-32-NEXT: vmovsd %xmm3, (%esp)
+; AVX512F-32-NEXT: vmovsd %xmm1, %xmm1, %xmm3 {%k1} {z}
+; AVX512F-32-NEXT: vsubsd %xmm3, %xmm2, %xmm2
+; AVX512F-32-NEXT: vmovsd %xmm2, (%esp)
+; AVX512F-32-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX512F-32-NEXT: xorl %edx, %edx
; AVX512F-32-NEXT: vcomisd %xmm1, %xmm2
; AVX512F-32-NEXT: setae %dl
@@ -445,11 +447,11 @@ define <4 x i64> @strict_vector_fptoui_v4f64_to_v4i64(<4 x double> %a) #0 {
; AVX512F-64: # %bb.0:
; AVX512F-64-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512F-64-NEXT: vcvttsd2usi %xmm1, %rax
-; AVX512F-64-NEXT: vmovq %rax, %xmm2
-; AVX512F-64-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX512F-64-NEXT: vcvttsd2usi %xmm1, %rax
; AVX512F-64-NEXT: vmovq %rax, %xmm1
-; AVX512F-64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512F-64-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[3,2,2,3]
+; AVX512F-64-NEXT: vcvttsd2usi %xmm2, %rax
+; AVX512F-64-NEXT: vmovq %rax, %xmm2
+; AVX512F-64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX512F-64-NEXT: vcvttsd2usi %xmm0, %rax
; AVX512F-64-NEXT: vmovq %rax, %xmm2
; AVX512F-64-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
@@ -470,16 +472,16 @@ define <4 x i64> @strict_vector_fptoui_v4f64_to_v4i64(<4 x double> %a) #0 {
; AVX512VL-32-NEXT: andl $-8, %esp
; AVX512VL-32-NEXT: subl $40, %esp
; AVX512VL-32-NEXT: .cfi_offset %ebx, -12
-; AVX512VL-32-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX512VL-32-NEXT: vshufpd {{.*#+}} xmm3 = xmm2[1,0]
+; AVX512VL-32-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[3,2,2,3]
; AVX512VL-32-NEXT: vmovsd {{.*#+}} xmm1 = [9.2233720368547758E+18,0.0E+0]
; AVX512VL-32-NEXT: xorl %eax, %eax
-; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm3
+; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm2
; AVX512VL-32-NEXT: setae %al
; AVX512VL-32-NEXT: kmovw %eax, %k1
-; AVX512VL-32-NEXT: vmovsd %xmm1, %xmm1, %xmm4 {%k1} {z}
-; AVX512VL-32-NEXT: vsubsd %xmm4, %xmm3, %xmm3
-; AVX512VL-32-NEXT: vmovsd %xmm3, (%esp)
+; AVX512VL-32-NEXT: vmovsd %xmm1, %xmm1, %xmm3 {%k1} {z}
+; AVX512VL-32-NEXT: vsubsd %xmm3, %xmm2, %xmm2
+; AVX512VL-32-NEXT: vmovsd %xmm2, (%esp)
+; AVX512VL-32-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX512VL-32-NEXT: xorl %edx, %edx
; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm2
; AVX512VL-32-NEXT: setae %dl
@@ -538,11 +540,11 @@ define <4 x i64> @strict_vector_fptoui_v4f64_to_v4i64(<4 x double> %a) #0 {
; AVX512VL-64: # %bb.0:
; AVX512VL-64-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512VL-64-NEXT: vcvttsd2usi %xmm1, %rax
-; AVX512VL-64-NEXT: vmovq %rax, %xmm2
-; AVX512VL-64-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX512VL-64-NEXT: vcvttsd2usi %xmm1, %rax
; AVX512VL-64-NEXT: vmovq %rax, %xmm1
-; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512VL-64-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[3,2,2,3]
+; AVX512VL-64-NEXT: vcvttsd2usi %xmm2, %rax
+; AVX512VL-64-NEXT: vmovq %rax, %xmm2
+; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX512VL-64-NEXT: vcvttsd2usi %xmm0, %rax
; AVX512VL-64-NEXT: vmovq %rax, %xmm2
; AVX512VL-64-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll
index ce5db5b246775..f4ed4405b1770 100644
--- a/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll
@@ -46,9 +46,10 @@ define <8 x i64> @strict_vector_fptosi_v8f64_to_v8i64(<8 x double> %a) #0 {
; AVX512VL-32-NEXT: vmovhps %xmm1, (%esp)
; AVX512VL-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
; AVX512VL-32-NEXT: vmovhps %xmm0, {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512VL-32-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512VL-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[3,2,2,3,7,6,6,7]
; AVX512VL-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT: vmovhps %xmm0, {{[0-9]+}}(%esp)
; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp)
; AVX512VL-32-NEXT: fisttpll {{[0-9]+}}(%esp)
; AVX512VL-32-NEXT: fldl {{[0-9]+}}(%esp)
@@ -107,12 +108,12 @@ define <8 x i64> @strict_vector_fptosi_v8f64_to_v8i64(<8 x double> %a) #0 {
; AVX512VL-64-NEXT: vmovq %rax, %xmm2
; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX512VL-64-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512VL-64-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX512VL-64-NEXT: vcvttsd2si %xmm2, %rax
-; AVX512VL-64-NEXT: vmovq %rax, %xmm3
-; AVX512VL-64-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0]
+; AVX512VL-64-NEXT: vpermpd {{.*#+}} zmm2 = zmm0[3,2,2,3,7,6,6,7]
; AVX512VL-64-NEXT: vcvttsd2si %xmm2, %rax
; AVX512VL-64-NEXT: vmovq %rax, %xmm2
+; AVX512VL-64-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX512VL-64-NEXT: vcvttsd2si %xmm3, %rax
+; AVX512VL-64-NEXT: vmovq %rax, %xmm3
; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX512VL-64-NEXT: vcvttsd2si %xmm0, %rax
; AVX512VL-64-NEXT: vmovq %rax, %xmm3
@@ -184,16 +185,16 @@ define <8 x i64> @strict_vector_fptoui_v8f64_to_v8i64(<8 x double> %a) #0 {
; AVX512VL-32-NEXT: vmovsd %xmm1, %xmm1, %xmm3 {%k1} {z}
; AVX512VL-32-NEXT: vsubsd %xmm3, %xmm2, %xmm2
; AVX512VL-32-NEXT: vmovsd %xmm2, {{[0-9]+}}(%esp)
-; AVX512VL-32-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX512VL-32-NEXT: vshufpd {{.*#+}} xmm3 = xmm2[1,0]
+; AVX512VL-32-NEXT: vpermpd {{.*#+}} zmm2 = zmm0[3,2,2,3,7,6,6,7]
; AVX512VL-32-NEXT: xorl %eax, %eax
-; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm3
+; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm2
; AVX512VL-32-NEXT: setae %al
; AVX512VL-32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; AVX512VL-32-NEXT: kmovw %eax, %k1
-; AVX512VL-32-NEXT: vmovsd %xmm1, %xmm1, %xmm4 {%k1} {z}
-; AVX512VL-32-NEXT: vsubsd %xmm4, %xmm3, %xmm3
-; AVX512VL-32-NEXT: vmovsd %xmm3, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: vmovsd %xmm1, %xmm1, %xmm3 {%k1} {z}
+; AVX512VL-32-NEXT: vsubsd %xmm3, %xmm2, %xmm2
+; AVX512VL-32-NEXT: vmovsd %xmm2, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX512VL-32-NEXT: xorl %ecx, %ecx
; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm2
; AVX512VL-32-NEXT: setae %cl
@@ -296,12 +297,12 @@ define <8 x i64> @strict_vector_fptoui_v8f64_to_v8i64(<8 x double> %a) #0 {
; AVX512VL-64-NEXT: vmovq %rax, %xmm2
; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX512VL-64-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512VL-64-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX512VL-64-NEXT: vcvttsd2usi %xmm2, %rax
-; AVX512VL-64-NEXT: vmovq %rax, %xmm3
-; AVX512VL-64-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0]
+; AVX512VL-64-NEXT: vpermpd {{.*#+}} zmm2 = zmm0[3,2,2,3,7,6,6,7]
; AVX512VL-64-NEXT: vcvttsd2usi %xmm2, %rax
; AVX512VL-64-NEXT: vmovq %rax, %xmm2
+; AVX512VL-64-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX512VL-64-NEXT: vcvttsd2usi %xmm3, %rax
+; AVX512VL-64-NEXT: vmovq %rax, %xmm3
; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX512VL-64-NEXT: vcvttsd2usi %xmm0, %rax
; AVX512VL-64-NEXT: vmovq %rax, %xmm3
diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll
index a336d0a01fa7b..84ade9e23ce03 100644
--- a/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll
@@ -631,40 +631,40 @@ define <4 x double> @uitofp_v4i32_v4f64(<4 x i32> %x) #0 {
}
define <4 x double> @sitofp_v4i64_v4f64(<4 x i64> %x) #0 {
-; AVX-32-LABEL: sitofp_v4i64_v4f64:
-; AVX-32: # %bb.0:
-; AVX-32-NEXT: pushl %ebp
-; AVX-32-NEXT: .cfi_def_cfa_offset 8
-; AVX-32-NEXT: .cfi_offset %ebp, -8
-; AVX-32-NEXT: movl %esp, %ebp
-; AVX-32-NEXT: .cfi_def_cfa_register %ebp
-; AVX-32-NEXT: andl $-8, %esp
-; AVX-32-NEXT: subl $64, %esp
-; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
-; AVX-32-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
-; AVX-32-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
-; AVX-32-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
-; AVX-32-NEXT: fildll {{[0-9]+}}(%esp)
-; AVX-32-NEXT: fstpl {{[0-9]+}}(%esp)
-; AVX-32-NEXT: fildll {{[0-9]+}}(%esp)
-; AVX-32-NEXT: fstpl {{[0-9]+}}(%esp)
-; AVX-32-NEXT: fildll {{[0-9]+}}(%esp)
-; AVX-32-NEXT: fstpl {{[0-9]+}}(%esp)
-; AVX-32-NEXT: fildll {{[0-9]+}}(%esp)
-; AVX-32-NEXT: fstpl (%esp)
-; AVX-32-NEXT: wait
-; AVX-32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-32-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
-; AVX-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-32-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
-; AVX-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX-32-NEXT: movl %ebp, %esp
-; AVX-32-NEXT: popl %ebp
-; AVX-32-NEXT: .cfi_def_cfa %esp, 4
-; AVX-32-NEXT: retl
+; AVX1-32-LABEL: sitofp_v4i64_v4f64:
+; AVX1-32: # %bb.0:
+; AVX1-32-NEXT: pushl %ebp
+; AVX1-32-NEXT: .cfi_def_cfa_offset 8
+; AVX1-32-NEXT: .cfi_offset %ebp, -8
+; AVX1-32-NEXT: movl %esp, %ebp
+; AVX1-32-NEXT: .cfi_def_cfa_register %ebp
+; AVX1-32-NEXT: andl $-8, %esp
+; AVX1-32-NEXT: subl $64, %esp
+; AVX1-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
+; AVX1-32-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX1-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX1-32-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
+; AVX1-32-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX1-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
+; AVX1-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX1-32-NEXT: fstpl {{[0-9]+}}(%esp)
+; AVX1-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX1-32-NEXT: fstpl {{[0-9]+}}(%esp)
+; AVX1-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX1-32-NEXT: fstpl {{[0-9]+}}(%esp)
+; AVX1-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX1-32-NEXT: fstpl (%esp)
+; AVX1-32-NEXT: wait
+; AVX1-32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX1-32-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; AVX1-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX1-32-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
+; AVX1-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-32-NEXT: movl %ebp, %esp
+; AVX1-32-NEXT: popl %ebp
+; AVX1-32-NEXT: .cfi_def_cfa %esp, 4
+; AVX1-32-NEXT: retl
;
; AVX1-64-LABEL: sitofp_v4i64_v4f64:
; AVX1-64: # %bb.0:
@@ -682,6 +682,41 @@ define <4 x double> @sitofp_v4i64_v4f64(<4 x i64> %x) #0 {
; AVX1-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-64-NEXT: retq
;
+; AVX2-32-LABEL: sitofp_v4i64_v4f64:
+; AVX2-32: # %bb.0:
+; AVX2-32-NEXT: pushl %ebp
+; AVX2-32-NEXT: .cfi_def_cfa_offset 8
+; AVX2-32-NEXT: .cfi_offset %ebp, -8
+; AVX2-32-NEXT: movl %esp, %ebp
+; AVX2-32-NEXT: .cfi_def_cfa_register %ebp
+; AVX2-32-NEXT: andl $-8, %esp
+; AVX2-32-NEXT: subl $64, %esp
+; AVX2-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
+; AVX2-32-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX2-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX2-32-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX2-32-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
+; AVX2-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
+; AVX2-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX2-32-NEXT: fstpl {{[0-9]+}}(%esp)
+; AVX2-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX2-32-NEXT: fstpl {{[0-9]+}}(%esp)
+; AVX2-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX2-32-NEXT: fstpl {{[0-9]+}}(%esp)
+; AVX2-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX2-32-NEXT: fstpl (%esp)
+; AVX2-32-NEXT: wait
+; AVX2-32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX2-32-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; AVX2-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX2-32-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
+; AVX2-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-32-NEXT: movl %ebp, %esp
+; AVX2-32-NEXT: popl %ebp
+; AVX2-32-NEXT: .cfi_def_cfa %esp, 4
+; AVX2-32-NEXT: retl
+;
; AVX2-64-LABEL: sitofp_v4i64_v4f64:
; AVX2-64: # %bb.0:
; AVX2-64-NEXT: vextracti128 $1, %ymm0, %xmm1
@@ -698,6 +733,41 @@ define <4 x double> @sitofp_v4i64_v4f64(<4 x i64> %x) #0 {
; AVX2-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-64-NEXT: retq
;
+; AVX512F-32-LABEL: sitofp_v4i64_v4f64:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: pushl %ebp
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 8
+; AVX512F-32-NEXT: .cfi_offset %ebp, -8
+; AVX512F-32-NEXT: movl %esp, %ebp
+; AVX512F-32-NEXT: .cfi_def_cfa_register %ebp
+; AVX512F-32-NEXT: andl $-8, %esp
+; AVX512F-32-NEXT: subl $64, %esp
+; AVX512F-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX512F-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512F-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
+; AVX512F-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: fstpl {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: fstpl {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: fstpl {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: fstpl (%esp)
+; AVX512F-32-NEXT: wait
+; AVX512F-32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX512F-32-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; AVX512F-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX512F-32-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
+; AVX512F-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-32-NEXT: movl %ebp, %esp
+; AVX512F-32-NEXT: popl %ebp
+; AVX512F-32-NEXT: .cfi_def_cfa %esp, 4
+; AVX512F-32-NEXT: retl
+;
; AVX512F-64-LABEL: sitofp_v4i64_v4f64:
; AVX512F-64: # %bb.0:
; AVX512F-64-NEXT: vextracti128 $1, %ymm0, %xmm1
@@ -714,6 +784,41 @@ define <4 x double> @sitofp_v4i64_v4f64(<4 x i64> %x) #0 {
; AVX512F-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-64-NEXT: retq
;
+; AVX512VL-32-LABEL: sitofp_v4i64_v4f64:
+; AVX512VL-32: # %bb.0:
+; AVX512VL-32-NEXT: pushl %ebp
+; AVX512VL-32-NEXT: .cfi_def_cfa_offset 8
+; AVX512VL-32-NEXT: .cfi_offset %ebp, -8
+; AVX512VL-32-NEXT: movl %esp, %ebp
+; AVX512VL-32-NEXT: .cfi_def_cfa_register %ebp
+; AVX512VL-32-NEXT: andl $-8, %esp
+; AVX512VL-32-NEXT: subl $64, %esp
+; AVX512VL-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX512VL-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512VL-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
+; AVX512VL-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: fstpl {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: fstpl {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: fstpl {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: fstpl (%esp)
+; AVX512VL-32-NEXT: wait
+; AVX512VL-32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX512VL-32-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; AVX512VL-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX512VL-32-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
+; AVX512VL-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-32-NEXT: movl %ebp, %esp
+; AVX512VL-32-NEXT: popl %ebp
+; AVX512VL-32-NEXT: .cfi_def_cfa %esp, 4
+; AVX512VL-32-NEXT: retl
+;
; AVX512VL-64-LABEL: sitofp_v4i64_v4f64:
; AVX512VL-64: # %bb.0:
; AVX512VL-64-NEXT: vextracti128 $1, %ymm0, %xmm1
@@ -748,55 +853,55 @@ define <4 x double> @sitofp_v4i64_v4f64(<4 x i64> %x) #0 {
}
define <4 x double> @uitofp_v4i64_v4f64(<4 x i64> %x) #0 {
-; AVX-32-LABEL: uitofp_v4i64_v4f64:
-; AVX-32: # %bb.0:
-; AVX-32-NEXT: pushl %ebp
-; AVX-32-NEXT: .cfi_def_cfa_offset 8
-; AVX-32-NEXT: .cfi_offset %ebp, -8
-; AVX-32-NEXT: movl %esp, %ebp
-; AVX-32-NEXT: .cfi_def_cfa_register %ebp
-; AVX-32-NEXT: andl $-8, %esp
-; AVX-32-NEXT: subl $64, %esp
-; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
-; AVX-32-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
-; AVX-32-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
-; AVX-32-NEXT: vshufps {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; AVX-32-NEXT: vmovlps %xmm2, {{[0-9]+}}(%esp)
-; AVX-32-NEXT: vextractps $1, %xmm0, %eax
-; AVX-32-NEXT: shrl $31, %eax
-; AVX-32-NEXT: fildll {{[0-9]+}}(%esp)
-; AVX-32-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
-; AVX-32-NEXT: fstpl (%esp)
-; AVX-32-NEXT: wait
-; AVX-32-NEXT: vextractps $3, %xmm0, %eax
-; AVX-32-NEXT: shrl $31, %eax
-; AVX-32-NEXT: fildll {{[0-9]+}}(%esp)
-; AVX-32-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
-; AVX-32-NEXT: fstpl {{[0-9]+}}(%esp)
-; AVX-32-NEXT: wait
-; AVX-32-NEXT: vextractps $1, %xmm1, %eax
-; AVX-32-NEXT: shrl $31, %eax
-; AVX-32-NEXT: fildll {{[0-9]+}}(%esp)
-; AVX-32-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
-; AVX-32-NEXT: fstpl {{[0-9]+}}(%esp)
-; AVX-32-NEXT: wait
-; AVX-32-NEXT: vextractps $3, %xmm1, %eax
-; AVX-32-NEXT: shrl $31, %eax
-; AVX-32-NEXT: fildll {{[0-9]+}}(%esp)
-; AVX-32-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
-; AVX-32-NEXT: fstpl {{[0-9]+}}(%esp)
-; AVX-32-NEXT: wait
-; AVX-32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-32-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
-; AVX-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-32-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
-; AVX-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX-32-NEXT: movl %ebp, %esp
-; AVX-32-NEXT: popl %ebp
-; AVX-32-NEXT: .cfi_def_cfa %esp, 4
-; AVX-32-NEXT: retl
+; AVX1-32-LABEL: uitofp_v4i64_v4f64:
+; AVX1-32: # %bb.0:
+; AVX1-32-NEXT: pushl %ebp
+; AVX1-32-NEXT: .cfi_def_cfa_offset 8
+; AVX1-32-NEXT: .cfi_offset %ebp, -8
+; AVX1-32-NEXT: movl %esp, %ebp
+; AVX1-32-NEXT: .cfi_def_cfa_register %ebp
+; AVX1-32-NEXT: andl $-8, %esp
+; AVX1-32-NEXT: subl $64, %esp
+; AVX1-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
+; AVX1-32-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX1-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX1-32-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX1-32-NEXT: vshufps {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; AVX1-32-NEXT: vmovlps %xmm2, {{[0-9]+}}(%esp)
+; AVX1-32-NEXT: vextractps $1, %xmm0, %eax
+; AVX1-32-NEXT: shrl $31, %eax
+; AVX1-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX1-32-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; AVX1-32-NEXT: fstpl (%esp)
+; AVX1-32-NEXT: wait
+; AVX1-32-NEXT: vextractps $3, %xmm0, %eax
+; AVX1-32-NEXT: shrl $31, %eax
+; AVX1-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX1-32-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; AVX1-32-NEXT: fstpl {{[0-9]+}}(%esp)
+; AVX1-32-NEXT: wait
+; AVX1-32-NEXT: vextractps $1, %xmm1, %eax
+; AVX1-32-NEXT: shrl $31, %eax
+; AVX1-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX1-32-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; AVX1-32-NEXT: fstpl {{[0-9]+}}(%esp)
+; AVX1-32-NEXT: wait
+; AVX1-32-NEXT: vextractps $3, %xmm1, %eax
+; AVX1-32-NEXT: shrl $31, %eax
+; AVX1-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX1-32-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; AVX1-32-NEXT: fstpl {{[0-9]+}}(%esp)
+; AVX1-32-NEXT: wait
+; AVX1-32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX1-32-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; AVX1-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX1-32-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
+; AVX1-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-32-NEXT: movl %ebp, %esp
+; AVX1-32-NEXT: popl %ebp
+; AVX1-32-NEXT: .cfi_def_cfa %esp, 4
+; AVX1-32-NEXT: retl
;
; AVX1-64-LABEL: uitofp_v4i64_v4f64:
; AVX1-64: # %bb.0:
@@ -828,6 +933,56 @@ define <4 x double> @uitofp_v4i64_v4f64(<4 x i64> %x) #0 {
; AVX1-64-NEXT: vaddpd %ymm2, %ymm0, %ymm0
; AVX1-64-NEXT: retq
;
+; AVX2-32-LABEL: uitofp_v4i64_v4f64:
+; AVX2-32: # %bb.0:
+; AVX2-32-NEXT: pushl %ebp
+; AVX2-32-NEXT: .cfi_def_cfa_offset 8
+; AVX2-32-NEXT: .cfi_offset %ebp, -8
+; AVX2-32-NEXT: movl %esp, %ebp
+; AVX2-32-NEXT: .cfi_def_cfa_register %ebp
+; AVX2-32-NEXT: andl $-8, %esp
+; AVX2-32-NEXT: subl $64, %esp
+; AVX2-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
+; AVX2-32-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX2-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX2-32-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX2-32-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[3,3,3,3]
+; AVX2-32-NEXT: vmovlps %xmm2, {{[0-9]+}}(%esp)
+; AVX2-32-NEXT: vextractps $1, %xmm0, %eax
+; AVX2-32-NEXT: shrl $31, %eax
+; AVX2-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX2-32-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; AVX2-32-NEXT: fstpl (%esp)
+; AVX2-32-NEXT: wait
+; AVX2-32-NEXT: vextractps $3, %xmm0, %eax
+; AVX2-32-NEXT: shrl $31, %eax
+; AVX2-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX2-32-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; AVX2-32-NEXT: fstpl {{[0-9]+}}(%esp)
+; AVX2-32-NEXT: wait
+; AVX2-32-NEXT: vextractps $1, %xmm1, %eax
+; AVX2-32-NEXT: shrl $31, %eax
+; AVX2-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX2-32-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; AVX2-32-NEXT: fstpl {{[0-9]+}}(%esp)
+; AVX2-32-NEXT: wait
+; AVX2-32-NEXT: vextractps $3, %xmm1, %eax
+; AVX2-32-NEXT: shrl $31, %eax
+; AVX2-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX2-32-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; AVX2-32-NEXT: fstpl {{[0-9]+}}(%esp)
+; AVX2-32-NEXT: wait
+; AVX2-32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX2-32-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; AVX2-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX2-32-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
+; AVX2-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-32-NEXT: movl %ebp, %esp
+; AVX2-32-NEXT: popl %ebp
+; AVX2-32-NEXT: .cfi_def_cfa %esp, 4
+; AVX2-32-NEXT: retl
+;
; AVX2-64-LABEL: uitofp_v4i64_v4f64:
; AVX2-64: # %bb.0:
; AVX2-64-NEXT: vextractf128 $1, %ymm0, %xmm1
@@ -859,6 +1014,56 @@ define <4 x double> @uitofp_v4i64_v4f64(<4 x i64> %x) #0 {
; AVX2-64-NEXT: vaddpd %ymm0, %ymm2, %ymm0
; AVX2-64-NEXT: retq
;
+; AVX512F-32-LABEL: uitofp_v4i64_v4f64:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: pushl %ebp
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 8
+; AVX512F-32-NEXT: .cfi_offset %ebp, -8
+; AVX512F-32-NEXT: movl %esp, %ebp
+; AVX512F-32-NEXT: .cfi_def_cfa_register %ebp
+; AVX512F-32-NEXT: andl $-8, %esp
+; AVX512F-32-NEXT: subl $64, %esp
+; AVX512F-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX512F-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512F-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[3,3,3,3]
+; AVX512F-32-NEXT: vmovlps %xmm2, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: vextractps $1, %xmm0, %eax
+; AVX512F-32-NEXT: shrl $31, %eax
+; AVX512F-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; AVX512F-32-NEXT: fstpl (%esp)
+; AVX512F-32-NEXT: wait
+; AVX512F-32-NEXT: vextractps $3, %xmm0, %eax
+; AVX512F-32-NEXT: shrl $31, %eax
+; AVX512F-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; AVX512F-32-NEXT: fstpl {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: wait
+; AVX512F-32-NEXT: vextractps $1, %xmm1, %eax
+; AVX512F-32-NEXT: shrl $31, %eax
+; AVX512F-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; AVX512F-32-NEXT: fstpl {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: wait
+; AVX512F-32-NEXT: vextractps $3, %xmm1, %eax
+; AVX512F-32-NEXT: shrl $31, %eax
+; AVX512F-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; AVX512F-32-NEXT: fstpl {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: wait
+; AVX512F-32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX512F-32-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; AVX512F-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX512F-32-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
+; AVX512F-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-32-NEXT: movl %ebp, %esp
+; AVX512F-32-NEXT: popl %ebp
+; AVX512F-32-NEXT: .cfi_def_cfa %esp, 4
+; AVX512F-32-NEXT: retl
+;
; AVX512F-64-LABEL: uitofp_v4i64_v4f64:
; AVX512F-64: # %bb.0:
; AVX512F-64-NEXT: vextracti128 $1, %ymm0, %xmm1
@@ -875,6 +1080,56 @@ define <4 x double> @uitofp_v4i64_v4f64(<4 x i64> %x) #0 {
; AVX512F-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX512F-64-NEXT: retq
;
+; AVX512VL-32-LABEL: uitofp_v4i64_v4f64:
+; AVX512VL-32: # %bb.0:
+; AVX512VL-32-NEXT: pushl %ebp
+; AVX512VL-32-NEXT: .cfi_def_cfa_offset 8
+; AVX512VL-32-NEXT: .cfi_offset %ebp, -8
+; AVX512VL-32-NEXT: movl %esp, %ebp
+; AVX512VL-32-NEXT: .cfi_def_cfa_register %ebp
+; AVX512VL-32-NEXT: andl $-8, %esp
+; AVX512VL-32-NEXT: subl $64, %esp
+; AVX512VL-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX512VL-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512VL-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[3,3,3,3]
+; AVX512VL-32-NEXT: vmovlps %xmm2, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: vextractps $1, %xmm0, %eax
+; AVX512VL-32-NEXT: shrl $31, %eax
+; AVX512VL-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; AVX512VL-32-NEXT: fstpl (%esp)
+; AVX512VL-32-NEXT: wait
+; AVX512VL-32-NEXT: vextractps $3, %xmm0, %eax
+; AVX512VL-32-NEXT: shrl $31, %eax
+; AVX512VL-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; AVX512VL-32-NEXT: fstpl {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: wait
+; AVX512VL-32-NEXT: vextractps $1, %xmm1, %eax
+; AVX512VL-32-NEXT: shrl $31, %eax
+; AVX512VL-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; AVX512VL-32-NEXT: fstpl {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: wait
+; AVX512VL-32-NEXT: vextractps $3, %xmm1, %eax
+; AVX512VL-32-NEXT: shrl $31, %eax
+; AVX512VL-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; AVX512VL-32-NEXT: fstpl {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: wait
+; AVX512VL-32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX512VL-32-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
+; AVX512VL-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX512VL-32-NEXT: vmovhps {{.*#+}} xmm1 = xmm1[0,1],mem[0,1]
+; AVX512VL-32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-32-NEXT: movl %ebp, %esp
+; AVX512VL-32-NEXT: popl %ebp
+; AVX512VL-32-NEXT: .cfi_def_cfa %esp, 4
+; AVX512VL-32-NEXT: retl
+;
; AVX512VL-64-LABEL: uitofp_v4i64_v4f64:
; AVX512VL-64: # %bb.0:
; AVX512VL-64-NEXT: vextracti128 $1, %ymm0, %xmm1
@@ -909,40 +1164,40 @@ define <4 x double> @uitofp_v4i64_v4f64(<4 x i64> %x) #0 {
}
define <4 x float> @sitofp_v4i64_v4f32(<4 x i64> %x) #0 {
-; AVX-32-LABEL: sitofp_v4i64_v4f32:
-; AVX-32: # %bb.0:
-; AVX-32-NEXT: pushl %ebp
-; AVX-32-NEXT: .cfi_def_cfa_offset 8
-; AVX-32-NEXT: .cfi_offset %ebp, -8
-; AVX-32-NEXT: movl %esp, %ebp
-; AVX-32-NEXT: .cfi_def_cfa_register %ebp
-; AVX-32-NEXT: andl $-8, %esp
-; AVX-32-NEXT: subl $48, %esp
-; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
-; AVX-32-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
-; AVX-32-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
-; AVX-32-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
-; AVX-32-NEXT: fildll {{[0-9]+}}(%esp)
-; AVX-32-NEXT: fstps {{[0-9]+}}(%esp)
-; AVX-32-NEXT: fildll {{[0-9]+}}(%esp)
-; AVX-32-NEXT: fstps {{[0-9]+}}(%esp)
-; AVX-32-NEXT: fildll {{[0-9]+}}(%esp)
-; AVX-32-NEXT: fstps {{[0-9]+}}(%esp)
-; AVX-32-NEXT: fildll {{[0-9]+}}(%esp)
-; AVX-32-NEXT: fstps (%esp)
-; AVX-32-NEXT: wait
-; AVX-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
-; AVX-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; AVX-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
-; AVX-32-NEXT: movl %ebp, %esp
-; AVX-32-NEXT: popl %ebp
-; AVX-32-NEXT: .cfi_def_cfa %esp, 4
-; AVX-32-NEXT: vzeroupper
-; AVX-32-NEXT: retl
+; AVX1-32-LABEL: sitofp_v4i64_v4f32:
+; AVX1-32: # %bb.0:
+; AVX1-32-NEXT: pushl %ebp
+; AVX1-32-NEXT: .cfi_def_cfa_offset 8
+; AVX1-32-NEXT: .cfi_offset %ebp, -8
+; AVX1-32-NEXT: movl %esp, %ebp
+; AVX1-32-NEXT: .cfi_def_cfa_register %ebp
+; AVX1-32-NEXT: andl $-8, %esp
+; AVX1-32-NEXT: subl $48, %esp
+; AVX1-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
+; AVX1-32-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX1-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX1-32-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
+; AVX1-32-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX1-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
+; AVX1-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX1-32-NEXT: fstps {{[0-9]+}}(%esp)
+; AVX1-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX1-32-NEXT: fstps {{[0-9]+}}(%esp)
+; AVX1-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX1-32-NEXT: fstps {{[0-9]+}}(%esp)
+; AVX1-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX1-32-NEXT: fstps (%esp)
+; AVX1-32-NEXT: wait
+; AVX1-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX1-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; AVX1-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; AVX1-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; AVX1-32-NEXT: movl %ebp, %esp
+; AVX1-32-NEXT: popl %ebp
+; AVX1-32-NEXT: .cfi_def_cfa %esp, 4
+; AVX1-32-NEXT: vzeroupper
+; AVX1-32-NEXT: retl
;
; AVX1-64-LABEL: sitofp_v4i64_v4f32:
; AVX1-64: # %bb.0:
@@ -961,6 +1216,41 @@ define <4 x float> @sitofp_v4i64_v4f32(<4 x i64> %x) #0 {
; AVX1-64-NEXT: vzeroupper
; AVX1-64-NEXT: retq
;
+; AVX2-32-LABEL: sitofp_v4i64_v4f32:
+; AVX2-32: # %bb.0:
+; AVX2-32-NEXT: pushl %ebp
+; AVX2-32-NEXT: .cfi_def_cfa_offset 8
+; AVX2-32-NEXT: .cfi_offset %ebp, -8
+; AVX2-32-NEXT: movl %esp, %ebp
+; AVX2-32-NEXT: .cfi_def_cfa_register %ebp
+; AVX2-32-NEXT: andl $-8, %esp
+; AVX2-32-NEXT: subl $48, %esp
+; AVX2-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
+; AVX2-32-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX2-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX2-32-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX2-32-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
+; AVX2-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
+; AVX2-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX2-32-NEXT: fstps {{[0-9]+}}(%esp)
+; AVX2-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX2-32-NEXT: fstps {{[0-9]+}}(%esp)
+; AVX2-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX2-32-NEXT: fstps {{[0-9]+}}(%esp)
+; AVX2-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX2-32-NEXT: fstps (%esp)
+; AVX2-32-NEXT: wait
+; AVX2-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX2-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; AVX2-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; AVX2-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; AVX2-32-NEXT: movl %ebp, %esp
+; AVX2-32-NEXT: popl %ebp
+; AVX2-32-NEXT: .cfi_def_cfa %esp, 4
+; AVX2-32-NEXT: vzeroupper
+; AVX2-32-NEXT: retl
+;
; AVX2-64-LABEL: sitofp_v4i64_v4f32:
; AVX2-64: # %bb.0:
; AVX2-64-NEXT: vpextrq $1, %xmm0, %rax
@@ -978,6 +1268,41 @@ define <4 x float> @sitofp_v4i64_v4f32(<4 x i64> %x) #0 {
; AVX2-64-NEXT: vzeroupper
; AVX2-64-NEXT: retq
;
+; AVX512F-32-LABEL: sitofp_v4i64_v4f32:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: pushl %ebp
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 8
+; AVX512F-32-NEXT: .cfi_offset %ebp, -8
+; AVX512F-32-NEXT: movl %esp, %ebp
+; AVX512F-32-NEXT: .cfi_def_cfa_register %ebp
+; AVX512F-32-NEXT: andl $-8, %esp
+; AVX512F-32-NEXT: subl $48, %esp
+; AVX512F-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX512F-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512F-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
+; AVX512F-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: fstps {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: fstps {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: fstps {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: fstps (%esp)
+; AVX512F-32-NEXT: wait
+; AVX512F-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX512F-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; AVX512F-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; AVX512F-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; AVX512F-32-NEXT: movl %ebp, %esp
+; AVX512F-32-NEXT: popl %ebp
+; AVX512F-32-NEXT: .cfi_def_cfa %esp, 4
+; AVX512F-32-NEXT: vzeroupper
+; AVX512F-32-NEXT: retl
+;
; AVX512F-64-LABEL: sitofp_v4i64_v4f32:
; AVX512F-64: # %bb.0:
; AVX512F-64-NEXT: vpextrq $1, %xmm0, %rax
@@ -995,6 +1320,41 @@ define <4 x float> @sitofp_v4i64_v4f32(<4 x i64> %x) #0 {
; AVX512F-64-NEXT: vzeroupper
; AVX512F-64-NEXT: retq
;
+; AVX512VL-32-LABEL: sitofp_v4i64_v4f32:
+; AVX512VL-32: # %bb.0:
+; AVX512VL-32-NEXT: pushl %ebp
+; AVX512VL-32-NEXT: .cfi_def_cfa_offset 8
+; AVX512VL-32-NEXT: .cfi_offset %ebp, -8
+; AVX512VL-32-NEXT: movl %esp, %ebp
+; AVX512VL-32-NEXT: .cfi_def_cfa_register %ebp
+; AVX512VL-32-NEXT: andl $-8, %esp
+; AVX512VL-32-NEXT: subl $48, %esp
+; AVX512VL-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX512VL-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512VL-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
+; AVX512VL-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: fstps {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: fstps {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: fstps {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: fstps (%esp)
+; AVX512VL-32-NEXT: wait
+; AVX512VL-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX512VL-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; AVX512VL-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; AVX512VL-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; AVX512VL-32-NEXT: movl %ebp, %esp
+; AVX512VL-32-NEXT: popl %ebp
+; AVX512VL-32-NEXT: .cfi_def_cfa %esp, 4
+; AVX512VL-32-NEXT: vzeroupper
+; AVX512VL-32-NEXT: retl
+;
; AVX512VL-64-LABEL: sitofp_v4i64_v4f32:
; AVX512VL-64: # %bb.0:
; AVX512VL-64-NEXT: vpextrq $1, %xmm0, %rax
@@ -1032,55 +1392,55 @@ define <4 x float> @sitofp_v4i64_v4f32(<4 x i64> %x) #0 {
}
define <4 x float> @uitofp_v4i64_v4f32(<4 x i64> %x) #0 {
-; AVX-32-LABEL: uitofp_v4i64_v4f32:
-; AVX-32: # %bb.0:
-; AVX-32-NEXT: pushl %ebp
-; AVX-32-NEXT: .cfi_def_cfa_offset 8
-; AVX-32-NEXT: .cfi_offset %ebp, -8
-; AVX-32-NEXT: movl %esp, %ebp
-; AVX-32-NEXT: .cfi_def_cfa_register %ebp
-; AVX-32-NEXT: andl $-8, %esp
-; AVX-32-NEXT: subl $48, %esp
-; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
-; AVX-32-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
-; AVX-32-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
-; AVX-32-NEXT: vshufps {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; AVX-32-NEXT: vmovlps %xmm2, {{[0-9]+}}(%esp)
-; AVX-32-NEXT: vextractps $1, %xmm0, %eax
-; AVX-32-NEXT: shrl $31, %eax
-; AVX-32-NEXT: fildll {{[0-9]+}}(%esp)
-; AVX-32-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
-; AVX-32-NEXT: fstps (%esp)
-; AVX-32-NEXT: wait
-; AVX-32-NEXT: vextractps $3, %xmm0, %eax
-; AVX-32-NEXT: shrl $31, %eax
-; AVX-32-NEXT: fildll {{[0-9]+}}(%esp)
-; AVX-32-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
-; AVX-32-NEXT: fstps {{[0-9]+}}(%esp)
-; AVX-32-NEXT: wait
-; AVX-32-NEXT: vextractps $1, %xmm1, %eax
-; AVX-32-NEXT: shrl $31, %eax
-; AVX-32-NEXT: fildll {{[0-9]+}}(%esp)
-; AVX-32-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
-; AVX-32-NEXT: fstps {{[0-9]+}}(%esp)
-; AVX-32-NEXT: wait
-; AVX-32-NEXT: vextractps $3, %xmm1, %eax
-; AVX-32-NEXT: shrl $31, %eax
-; AVX-32-NEXT: fildll {{[0-9]+}}(%esp)
-; AVX-32-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
-; AVX-32-NEXT: fstps {{[0-9]+}}(%esp)
-; AVX-32-NEXT: wait
-; AVX-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
-; AVX-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; AVX-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
-; AVX-32-NEXT: movl %ebp, %esp
-; AVX-32-NEXT: popl %ebp
-; AVX-32-NEXT: .cfi_def_cfa %esp, 4
-; AVX-32-NEXT: vzeroupper
-; AVX-32-NEXT: retl
+; AVX1-32-LABEL: uitofp_v4i64_v4f32:
+; AVX1-32: # %bb.0:
+; AVX1-32-NEXT: pushl %ebp
+; AVX1-32-NEXT: .cfi_def_cfa_offset 8
+; AVX1-32-NEXT: .cfi_offset %ebp, -8
+; AVX1-32-NEXT: movl %esp, %ebp
+; AVX1-32-NEXT: .cfi_def_cfa_register %ebp
+; AVX1-32-NEXT: andl $-8, %esp
+; AVX1-32-NEXT: subl $48, %esp
+; AVX1-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
+; AVX1-32-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX1-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX1-32-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX1-32-NEXT: vshufps {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; AVX1-32-NEXT: vmovlps %xmm2, {{[0-9]+}}(%esp)
+; AVX1-32-NEXT: vextractps $1, %xmm0, %eax
+; AVX1-32-NEXT: shrl $31, %eax
+; AVX1-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX1-32-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; AVX1-32-NEXT: fstps (%esp)
+; AVX1-32-NEXT: wait
+; AVX1-32-NEXT: vextractps $3, %xmm0, %eax
+; AVX1-32-NEXT: shrl $31, %eax
+; AVX1-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX1-32-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; AVX1-32-NEXT: fstps {{[0-9]+}}(%esp)
+; AVX1-32-NEXT: wait
+; AVX1-32-NEXT: vextractps $1, %xmm1, %eax
+; AVX1-32-NEXT: shrl $31, %eax
+; AVX1-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX1-32-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; AVX1-32-NEXT: fstps {{[0-9]+}}(%esp)
+; AVX1-32-NEXT: wait
+; AVX1-32-NEXT: vextractps $3, %xmm1, %eax
+; AVX1-32-NEXT: shrl $31, %eax
+; AVX1-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX1-32-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; AVX1-32-NEXT: fstps {{[0-9]+}}(%esp)
+; AVX1-32-NEXT: wait
+; AVX1-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX1-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; AVX1-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; AVX1-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; AVX1-32-NEXT: movl %ebp, %esp
+; AVX1-32-NEXT: popl %ebp
+; AVX1-32-NEXT: .cfi_def_cfa %esp, 4
+; AVX1-32-NEXT: vzeroupper
+; AVX1-32-NEXT: retl
;
; AVX1-64-LABEL: uitofp_v4i64_v4f32:
; AVX1-64: # %bb.0:
@@ -1109,6 +1469,56 @@ define <4 x float> @uitofp_v4i64_v4f32(<4 x i64> %x) #0 {
; AVX1-64-NEXT: vzeroupper
; AVX1-64-NEXT: retq
;
+; AVX2-32-LABEL: uitofp_v4i64_v4f32:
+; AVX2-32: # %bb.0:
+; AVX2-32-NEXT: pushl %ebp
+; AVX2-32-NEXT: .cfi_def_cfa_offset 8
+; AVX2-32-NEXT: .cfi_offset %ebp, -8
+; AVX2-32-NEXT: movl %esp, %ebp
+; AVX2-32-NEXT: .cfi_def_cfa_register %ebp
+; AVX2-32-NEXT: andl $-8, %esp
+; AVX2-32-NEXT: subl $48, %esp
+; AVX2-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
+; AVX2-32-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX2-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX2-32-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX2-32-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[3,3,3,3]
+; AVX2-32-NEXT: vmovlps %xmm2, {{[0-9]+}}(%esp)
+; AVX2-32-NEXT: vextractps $1, %xmm0, %eax
+; AVX2-32-NEXT: shrl $31, %eax
+; AVX2-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX2-32-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; AVX2-32-NEXT: fstps (%esp)
+; AVX2-32-NEXT: wait
+; AVX2-32-NEXT: vextractps $3, %xmm0, %eax
+; AVX2-32-NEXT: shrl $31, %eax
+; AVX2-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX2-32-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; AVX2-32-NEXT: fstps {{[0-9]+}}(%esp)
+; AVX2-32-NEXT: wait
+; AVX2-32-NEXT: vextractps $1, %xmm1, %eax
+; AVX2-32-NEXT: shrl $31, %eax
+; AVX2-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX2-32-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; AVX2-32-NEXT: fstps {{[0-9]+}}(%esp)
+; AVX2-32-NEXT: wait
+; AVX2-32-NEXT: vextractps $3, %xmm1, %eax
+; AVX2-32-NEXT: shrl $31, %eax
+; AVX2-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX2-32-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; AVX2-32-NEXT: fstps {{[0-9]+}}(%esp)
+; AVX2-32-NEXT: wait
+; AVX2-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX2-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; AVX2-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; AVX2-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; AVX2-32-NEXT: movl %ebp, %esp
+; AVX2-32-NEXT: popl %ebp
+; AVX2-32-NEXT: .cfi_def_cfa %esp, 4
+; AVX2-32-NEXT: vzeroupper
+; AVX2-32-NEXT: retl
+;
; AVX2-64-LABEL: uitofp_v4i64_v4f32:
; AVX2-64: # %bb.0:
; AVX2-64-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1]
@@ -1135,6 +1545,56 @@ define <4 x float> @uitofp_v4i64_v4f32(<4 x i64> %x) #0 {
; AVX2-64-NEXT: vzeroupper
; AVX2-64-NEXT: retq
;
+; AVX512F-32-LABEL: uitofp_v4i64_v4f32:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: pushl %ebp
+; AVX512F-32-NEXT: .cfi_def_cfa_offset 8
+; AVX512F-32-NEXT: .cfi_offset %ebp, -8
+; AVX512F-32-NEXT: movl %esp, %ebp
+; AVX512F-32-NEXT: .cfi_def_cfa_register %ebp
+; AVX512F-32-NEXT: andl $-8, %esp
+; AVX512F-32-NEXT: subl $48, %esp
+; AVX512F-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX512F-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512F-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[3,3,3,3]
+; AVX512F-32-NEXT: vmovlps %xmm2, {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: vextractps $1, %xmm0, %eax
+; AVX512F-32-NEXT: shrl $31, %eax
+; AVX512F-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; AVX512F-32-NEXT: fstps (%esp)
+; AVX512F-32-NEXT: wait
+; AVX512F-32-NEXT: vextractps $3, %xmm0, %eax
+; AVX512F-32-NEXT: shrl $31, %eax
+; AVX512F-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; AVX512F-32-NEXT: fstps {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: wait
+; AVX512F-32-NEXT: vextractps $1, %xmm1, %eax
+; AVX512F-32-NEXT: shrl $31, %eax
+; AVX512F-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; AVX512F-32-NEXT: fstps {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: wait
+; AVX512F-32-NEXT: vextractps $3, %xmm1, %eax
+; AVX512F-32-NEXT: shrl $31, %eax
+; AVX512F-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; AVX512F-32-NEXT: fstps {{[0-9]+}}(%esp)
+; AVX512F-32-NEXT: wait
+; AVX512F-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX512F-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; AVX512F-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; AVX512F-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; AVX512F-32-NEXT: movl %ebp, %esp
+; AVX512F-32-NEXT: popl %ebp
+; AVX512F-32-NEXT: .cfi_def_cfa %esp, 4
+; AVX512F-32-NEXT: vzeroupper
+; AVX512F-32-NEXT: retl
+;
; AVX512F-64-LABEL: uitofp_v4i64_v4f32:
; AVX512F-64: # %bb.0:
; AVX512F-64-NEXT: vpextrq $1, %xmm0, %rax
@@ -1152,6 +1612,56 @@ define <4 x float> @uitofp_v4i64_v4f32(<4 x i64> %x) #0 {
; AVX512F-64-NEXT: vzeroupper
; AVX512F-64-NEXT: retq
;
+; AVX512VL-32-LABEL: uitofp_v4i64_v4f32:
+; AVX512VL-32: # %bb.0:
+; AVX512VL-32-NEXT: pushl %ebp
+; AVX512VL-32-NEXT: .cfi_def_cfa_offset 8
+; AVX512VL-32-NEXT: .cfi_offset %ebp, -8
+; AVX512VL-32-NEXT: movl %esp, %ebp
+; AVX512VL-32-NEXT: .cfi_def_cfa_register %ebp
+; AVX512VL-32-NEXT: andl $-8, %esp
+; AVX512VL-32-NEXT: subl $48, %esp
+; AVX512VL-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX512VL-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512VL-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[3,3,3,3]
+; AVX512VL-32-NEXT: vmovlps %xmm2, {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: vextractps $1, %xmm0, %eax
+; AVX512VL-32-NEXT: shrl $31, %eax
+; AVX512VL-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; AVX512VL-32-NEXT: fstps (%esp)
+; AVX512VL-32-NEXT: wait
+; AVX512VL-32-NEXT: vextractps $3, %xmm0, %eax
+; AVX512VL-32-NEXT: shrl $31, %eax
+; AVX512VL-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; AVX512VL-32-NEXT: fstps {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: wait
+; AVX512VL-32-NEXT: vextractps $1, %xmm1, %eax
+; AVX512VL-32-NEXT: shrl $31, %eax
+; AVX512VL-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; AVX512VL-32-NEXT: fstps {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: wait
+; AVX512VL-32-NEXT: vextractps $3, %xmm1, %eax
+; AVX512VL-32-NEXT: shrl $31, %eax
+; AVX512VL-32-NEXT: fildll {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
+; AVX512VL-32-NEXT: fstps {{[0-9]+}}(%esp)
+; AVX512VL-32-NEXT: wait
+; AVX512VL-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX512VL-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; AVX512VL-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; AVX512VL-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; AVX512VL-32-NEXT: movl %ebp, %esp
+; AVX512VL-32-NEXT: popl %ebp
+; AVX512VL-32-NEXT: .cfi_def_cfa %esp, 4
+; AVX512VL-32-NEXT: vzeroupper
+; AVX512VL-32-NEXT: retl
+;
; AVX512VL-64-LABEL: uitofp_v4i64_v4f32:
; AVX512VL-64: # %bb.0:
; AVX512VL-64-NEXT: vpextrq $1, %xmm0, %rax
@@ -1189,3 +1699,5 @@ define <4 x float> @uitofp_v4i64_v4f32(<4 x i64> %x) #0 {
}
attributes #0 = { strictfp }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; AVX-32: {{.*}}
diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll
index 0cf945202a2d4..0cdd2821f92b3 100644
--- a/llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll
@@ -271,21 +271,21 @@ define <8 x double> @sitofp_v8i64_v8f64(<8 x i64> %x) #0 {
; NODQ-32-NEXT: .cfi_def_cfa_register %ebp
; NODQ-32-NEXT: andl $-8, %esp
; NODQ-32-NEXT: subl $128, %esp
-; NODQ-32-NEXT: vextractf32x4 $2, %zmm0, %xmm1
-; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT: vextractf32x4 $3, %zmm0, %xmm1
-; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT: vextractf128 $1, %ymm0, %xmm0
-; NODQ-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; NODQ-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT: vextracti32x4 $2, %zmm0, %xmm1
+; NODQ-32-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT: valignq {{.*#+}} zmm1 = zmm0[5,6,7,0,1,2,3,4]
+; NODQ-32-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT: vextracti32x4 $3, %zmm0, %xmm1
+; NODQ-32-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT: valignq {{.*#+}} zmm1 = zmm0[7,0,1,2,3,4,5,6]
+; NODQ-32-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT: vmovq %xmm0, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; NODQ-32-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NODQ-32-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT: vpermq {{.*#+}} zmm0 = zmm0[3,3,3,3,7,7,7,7]
+; NODQ-32-NEXT: vmovq %xmm0, {{[0-9]+}}(%esp)
; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp)
; NODQ-32-NEXT: fstpl {{[0-9]+}}(%esp)
; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp)
@@ -369,64 +369,64 @@ define <8 x double> @uitofp_v8i64_v8f64(<8 x i64> %x) #0 {
; NODQ-32-NEXT: .cfi_def_cfa_register %ebp
; NODQ-32-NEXT: andl $-8, %esp
; NODQ-32-NEXT: subl $128, %esp
-; NODQ-32-NEXT: vextractf32x4 $2, %zmm0, %xmm3
-; NODQ-32-NEXT: vmovlps %xmm3, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,3,2,3]
-; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT: vextractf32x4 $3, %zmm0, %xmm2
-; NODQ-32-NEXT: vmovlps %xmm2, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,3,2,3]
-; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT: vextractf128 $1, %ymm0, %xmm1
-; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT: vshufps {{.*#+}} xmm4 = xmm1[2,3,2,3]
-; NODQ-32-NEXT: vmovlps %xmm4, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT: vextractps $1, %xmm3, %eax
+; NODQ-32-NEXT: vextracti32x4 $2, %zmm0, %xmm3
+; NODQ-32-NEXT: vmovq %xmm3, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT: valignq {{.*#+}} zmm1 = zmm0[5,6,7,0,1,2,3,4]
+; NODQ-32-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT: vextracti32x4 $3, %zmm0, %xmm2
+; NODQ-32-NEXT: vmovq %xmm2, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT: valignq {{.*#+}} zmm1 = zmm0[7,0,1,2,3,4,5,6]
+; NODQ-32-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT: vmovq %xmm0, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; NODQ-32-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NODQ-32-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT: vpermq {{.*#+}} zmm4 = zmm0[3,3,3,3,7,7,7,7]
+; NODQ-32-NEXT: vmovq %xmm4, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT: vpextrd $1, %xmm3, %eax
; NODQ-32-NEXT: shrl $31, %eax
; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp)
; NODQ-32-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
; NODQ-32-NEXT: fstpl {{[0-9]+}}(%esp)
; NODQ-32-NEXT: wait
-; NODQ-32-NEXT: vextractps $3, %xmm3, %eax
+; NODQ-32-NEXT: vpextrd $3, %xmm3, %eax
; NODQ-32-NEXT: shrl $31, %eax
; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp)
; NODQ-32-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
; NODQ-32-NEXT: fstpl {{[0-9]+}}(%esp)
; NODQ-32-NEXT: wait
-; NODQ-32-NEXT: vextractps $1, %xmm2, %eax
+; NODQ-32-NEXT: vpextrd $1, %xmm2, %eax
; NODQ-32-NEXT: shrl $31, %eax
; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp)
; NODQ-32-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
; NODQ-32-NEXT: fstpl {{[0-9]+}}(%esp)
; NODQ-32-NEXT: wait
-; NODQ-32-NEXT: vextractps $3, %xmm2, %eax
+; NODQ-32-NEXT: vpextrd $3, %xmm2, %eax
; NODQ-32-NEXT: shrl $31, %eax
; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp)
; NODQ-32-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
; NODQ-32-NEXT: fstpl {{[0-9]+}}(%esp)
; NODQ-32-NEXT: wait
-; NODQ-32-NEXT: vextractps $1, %xmm0, %eax
+; NODQ-32-NEXT: vpextrd $1, %xmm0, %eax
; NODQ-32-NEXT: shrl $31, %eax
; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp)
; NODQ-32-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
; NODQ-32-NEXT: fstpl (%esp)
; NODQ-32-NEXT: wait
-; NODQ-32-NEXT: vextractps $3, %xmm0, %eax
+; NODQ-32-NEXT: vpextrd $3, %xmm0, %eax
; NODQ-32-NEXT: shrl $31, %eax
; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp)
; NODQ-32-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
; NODQ-32-NEXT: fstpl {{[0-9]+}}(%esp)
; NODQ-32-NEXT: wait
-; NODQ-32-NEXT: vextractps $1, %xmm1, %eax
+; NODQ-32-NEXT: vpextrd $1, %xmm1, %eax
; NODQ-32-NEXT: shrl $31, %eax
; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp)
; NODQ-32-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
; NODQ-32-NEXT: fstpl {{[0-9]+}}(%esp)
; NODQ-32-NEXT: wait
-; NODQ-32-NEXT: vextractps $3, %xmm1, %eax
+; NODQ-32-NEXT: vpextrd $3, %xmm1, %eax
; NODQ-32-NEXT: shrl $31, %eax
; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp)
; NODQ-32-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
@@ -498,21 +498,21 @@ define <8 x float> @sitofp_v8i64_v8f32(<8 x i64> %x) #0 {
; NODQ-32-NEXT: .cfi_def_cfa_register %ebp
; NODQ-32-NEXT: andl $-8, %esp
; NODQ-32-NEXT: subl $96, %esp
-; NODQ-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT: vextractf128 $1, %ymm0, %xmm1
-; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT: vextractf32x4 $2, %zmm0, %xmm1
-; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT: vextractf32x4 $3, %zmm0, %xmm0
-; NODQ-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; NODQ-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT: vmovq %xmm0, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; NODQ-32-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT: vextracti128 $1, %ymm0, %xmm1
+; NODQ-32-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT: vpermq {{.*#+}} zmm1 = zmm0[3,3,3,3,7,7,7,7]
+; NODQ-32-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT: vextracti32x4 $2, %zmm0, %xmm1
+; NODQ-32-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT: valignq {{.*#+}} zmm1 = zmm0[5,6,7,0,1,2,3,4]
+; NODQ-32-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT: vextracti32x4 $3, %zmm0, %xmm1
+; NODQ-32-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT: valignq {{.*#+}} zmm0 = zmm0[7,0,1,2,3,4,5,6]
+; NODQ-32-NEXT: vmovq %xmm0, {{[0-9]+}}(%esp)
; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp)
; NODQ-32-NEXT: fstps {{[0-9]+}}(%esp)
; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp)
@@ -594,64 +594,64 @@ define <8 x float> @uitofp_v8i64_v8f32(<8 x i64> %x) #0 {
; NODQ-32-NEXT: .cfi_def_cfa_register %ebp
; NODQ-32-NEXT: andl $-8, %esp
; NODQ-32-NEXT: subl $96, %esp
-; NODQ-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT: vextractf128 $1, %ymm0, %xmm3
-; NODQ-32-NEXT: vmovlps %xmm3, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,3,2,3]
-; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT: vextractf32x4 $2, %zmm0, %xmm2
-; NODQ-32-NEXT: vmovlps %xmm2, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,3,2,3]
-; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT: vextractf32x4 $3, %zmm0, %xmm1
-; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT: vshufps {{.*#+}} xmm4 = xmm1[2,3,2,3]
-; NODQ-32-NEXT: vmovlps %xmm4, {{[0-9]+}}(%esp)
-; NODQ-32-NEXT: vextractps $1, %xmm0, %eax
+; NODQ-32-NEXT: vmovq %xmm0, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; NODQ-32-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT: vextracti128 $1, %ymm0, %xmm3
+; NODQ-32-NEXT: vmovq %xmm3, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT: vpermq {{.*#+}} zmm1 = zmm0[3,3,3,3,7,7,7,7]
+; NODQ-32-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT: vextracti32x4 $2, %zmm0, %xmm2
+; NODQ-32-NEXT: vmovq %xmm2, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT: valignq {{.*#+}} zmm1 = zmm0[5,6,7,0,1,2,3,4]
+; NODQ-32-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT: vextracti32x4 $3, %zmm0, %xmm1
+; NODQ-32-NEXT: vmovq %xmm1, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT: valignq {{.*#+}} zmm4 = zmm0[7,0,1,2,3,4,5,6]
+; NODQ-32-NEXT: vmovq %xmm4, {{[0-9]+}}(%esp)
+; NODQ-32-NEXT: vpextrd $1, %xmm0, %eax
; NODQ-32-NEXT: shrl $31, %eax
; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp)
; NODQ-32-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
; NODQ-32-NEXT: fstps (%esp)
; NODQ-32-NEXT: wait
-; NODQ-32-NEXT: vextractps $3, %xmm0, %eax
+; NODQ-32-NEXT: vpextrd $3, %xmm0, %eax
; NODQ-32-NEXT: shrl $31, %eax
; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp)
; NODQ-32-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
; NODQ-32-NEXT: fstps {{[0-9]+}}(%esp)
; NODQ-32-NEXT: wait
-; NODQ-32-NEXT: vextractps $1, %xmm3, %eax
+; NODQ-32-NEXT: vpextrd $1, %xmm3, %eax
; NODQ-32-NEXT: shrl $31, %eax
; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp)
; NODQ-32-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
; NODQ-32-NEXT: fstps {{[0-9]+}}(%esp)
; NODQ-32-NEXT: wait
-; NODQ-32-NEXT: vextractps $3, %xmm3, %eax
+; NODQ-32-NEXT: vpextrd $3, %xmm3, %eax
; NODQ-32-NEXT: shrl $31, %eax
; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp)
; NODQ-32-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
; NODQ-32-NEXT: fstps {{[0-9]+}}(%esp)
; NODQ-32-NEXT: wait
-; NODQ-32-NEXT: vextractps $1, %xmm2, %eax
+; NODQ-32-NEXT: vpextrd $1, %xmm2, %eax
; NODQ-32-NEXT: shrl $31, %eax
; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp)
; NODQ-32-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
; NODQ-32-NEXT: fstps {{[0-9]+}}(%esp)
; NODQ-32-NEXT: wait
-; NODQ-32-NEXT: vextractps $3, %xmm2, %eax
+; NODQ-32-NEXT: vpextrd $3, %xmm2, %eax
; NODQ-32-NEXT: shrl $31, %eax
; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp)
; NODQ-32-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
; NODQ-32-NEXT: fstps {{[0-9]+}}(%esp)
; NODQ-32-NEXT: wait
-; NODQ-32-NEXT: vextractps $1, %xmm1, %eax
+; NODQ-32-NEXT: vpextrd $1, %xmm1, %eax
; NODQ-32-NEXT: shrl $31, %eax
; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp)
; NODQ-32-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
; NODQ-32-NEXT: fstps {{[0-9]+}}(%esp)
; NODQ-32-NEXT: wait
-; NODQ-32-NEXT: vextractps $3, %xmm1, %eax
+; NODQ-32-NEXT: vpextrd $3, %xmm1, %eax
; NODQ-32-NEXT: shrl $31, %eax
; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp)
; NODQ-32-NEXT: fadds {{\.?LCPI[0-9]+_[0-9]+}}(,%eax,4)
diff --git a/llvm/test/CodeGen/X86/vec_fp_to_int.ll b/llvm/test/CodeGen/X86/vec_fp_to_int.ll
index df2dc77dc1259..64acfd4b1b39f 100644
--- a/llvm/test/CodeGen/X86/vec_fp_to_int.ll
+++ b/llvm/test/CodeGen/X86/vec_fp_to_int.ll
@@ -161,11 +161,11 @@ define <4 x i64> @fptosi_4f64_to_4i64(<4 x double> %a) {
; AVX2: # %bb.0:
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vcvttsd2si %xmm1, %rax
-; AVX2-NEXT: vmovq %rax, %xmm2
-; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX2-NEXT: vcvttsd2si %xmm1, %rax
; AVX2-NEXT: vmovq %rax, %xmm1
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[3,2,2,3]
+; AVX2-NEXT: vcvttsd2si %xmm2, %rax
+; AVX2-NEXT: vmovq %rax, %xmm2
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT: vcvttsd2si %xmm0, %rax
; AVX2-NEXT: vmovq %rax, %xmm2
; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
@@ -179,11 +179,11 @@ define <4 x i64> @fptosi_4f64_to_4i64(<4 x double> %a) {
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vcvttsd2si %xmm1, %rax
-; AVX512F-NEXT: vmovq %rax, %xmm2
-; AVX512F-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX512F-NEXT: vcvttsd2si %xmm1, %rax
; AVX512F-NEXT: vmovq %rax, %xmm1
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512F-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[3,2,2,3]
+; AVX512F-NEXT: vcvttsd2si %xmm2, %rax
+; AVX512F-NEXT: vmovq %rax, %xmm2
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX512F-NEXT: vcvttsd2si %xmm0, %rax
; AVX512F-NEXT: vmovq %rax, %xmm2
; AVX512F-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
@@ -197,11 +197,11 @@ define <4 x i64> @fptosi_4f64_to_4i64(<4 x double> %a) {
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vcvttsd2si %xmm1, %rax
-; AVX512VL-NEXT: vmovq %rax, %xmm2
-; AVX512VL-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX512VL-NEXT: vcvttsd2si %xmm1, %rax
; AVX512VL-NEXT: vmovq %rax, %xmm1
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512VL-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[3,2,2,3]
+; AVX512VL-NEXT: vcvttsd2si %xmm2, %rax
+; AVX512VL-NEXT: vmovq %rax, %xmm2
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX512VL-NEXT: vcvttsd2si %xmm0, %rax
; AVX512VL-NEXT: vmovq %rax, %xmm2
; AVX512VL-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
@@ -606,17 +606,17 @@ define <4 x i64> @fptoui_4f64_to_4i64(<4 x double> %a) {
; AVX2-NEXT: sarq $63, %rdx
; AVX2-NEXT: andq %rax, %rdx
; AVX2-NEXT: orq %rcx, %rdx
-; AVX2-NEXT: vmovq %rdx, %xmm3
-; AVX2-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0]
-; AVX2-NEXT: vsubsd %xmm1, %xmm2, %xmm4
+; AVX2-NEXT: vmovq %rdx, %xmm2
+; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[3,2,2,3]
+; AVX2-NEXT: vsubsd %xmm1, %xmm3, %xmm4
; AVX2-NEXT: vcvttsd2si %xmm4, %rax
-; AVX2-NEXT: vcvttsd2si %xmm2, %rcx
+; AVX2-NEXT: vcvttsd2si %xmm3, %rcx
; AVX2-NEXT: movq %rcx, %rdx
; AVX2-NEXT: sarq $63, %rdx
; AVX2-NEXT: andq %rax, %rdx
; AVX2-NEXT: orq %rcx, %rdx
-; AVX2-NEXT: vmovq %rdx, %xmm2
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX2-NEXT: vmovq %rdx, %xmm3
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX2-NEXT: vsubsd %xmm1, %xmm0, %xmm3
; AVX2-NEXT: vcvttsd2si %xmm3, %rax
; AVX2-NEXT: vcvttsd2si %xmm0, %rcx
@@ -642,11 +642,11 @@ define <4 x i64> @fptoui_4f64_to_4i64(<4 x double> %a) {
; AVX512F: # %bb.0:
; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512F-NEXT: vcvttsd2usi %xmm1, %rax
-; AVX512F-NEXT: vmovq %rax, %xmm2
-; AVX512F-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX512F-NEXT: vcvttsd2usi %xmm1, %rax
; AVX512F-NEXT: vmovq %rax, %xmm1
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512F-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[3,2,2,3]
+; AVX512F-NEXT: vcvttsd2usi %xmm2, %rax
+; AVX512F-NEXT: vmovq %rax, %xmm2
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX512F-NEXT: vcvttsd2usi %xmm0, %rax
; AVX512F-NEXT: vmovq %rax, %xmm2
; AVX512F-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
@@ -660,11 +660,11 @@ define <4 x i64> @fptoui_4f64_to_4i64(<4 x double> %a) {
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512VL-NEXT: vcvttsd2usi %xmm1, %rax
-; AVX512VL-NEXT: vmovq %rax, %xmm2
-; AVX512VL-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX512VL-NEXT: vcvttsd2usi %xmm1, %rax
; AVX512VL-NEXT: vmovq %rax, %xmm1
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512VL-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[3,2,2,3]
+; AVX512VL-NEXT: vcvttsd2usi %xmm2, %rax
+; AVX512VL-NEXT: vmovq %rax, %xmm2
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX512VL-NEXT: vcvttsd2usi %xmm0, %rax
; AVX512VL-NEXT: vmovq %rax, %xmm2
; AVX512VL-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
diff --git a/llvm/test/CodeGen/X86/vector-compress.ll b/llvm/test/CodeGen/X86/vector-compress.ll
index e88387a8b7c69..3161ef4318c49 100644
--- a/llvm/test/CodeGen/X86/vector-compress.ll
+++ b/llvm/test/CodeGen/X86/vector-compress.ll
@@ -531,34 +531,35 @@ define <4 x double> @test_compress_v4f64(<4 x double> %vec, <4 x i1> %mask, <4 x
; AVX2-NEXT: addl %eax, %ecx
; AVX2-NEXT: andl $3, %ecx
; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX2-NEXT: vmovlpd %xmm0, (%rsp)
+; AVX2-NEXT: vmovlps %xmm0, (%rsp)
; AVX2-NEXT: vmovq %xmm3, %rax
; AVX2-NEXT: andl $1, %eax
; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: vmovhpd %xmm0, (%rsp,%rcx,8)
+; AVX2-NEXT: vmovhps %xmm0, (%rsp,%rcx,8)
; AVX2-NEXT: vpextrq $1, %xmm3, %rcx
; AVX2-NEXT: subq %rcx, %rax
; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vmovlpd %xmm0, (%rsp,%rcx,8)
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vmovlps %xmm2, (%rsp,%rcx,8)
; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm2
; AVX2-NEXT: vmovq %xmm2, %rcx
; AVX2-NEXT: subq %rcx, %rax
; AVX2-NEXT: movl %eax, %ecx
; AVX2-NEXT: andl $3, %ecx
-; AVX2-NEXT: vmovhpd %xmm0, (%rsp,%rcx,8)
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3]
+; AVX2-NEXT: vmovlps %xmm0, (%rsp,%rcx,8)
; AVX2-NEXT: vpextrq $1, %xmm2, %rcx
; AVX2-NEXT: subq %rcx, %rax
; AVX2-NEXT: cmpq $4, %rax
-; AVX2-NEXT: jb .LBB7_2
+; AVX2-NEXT: jae .LBB7_2
; AVX2-NEXT: # %bb.1:
-; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX2-NEXT: vmovaps %xmm1, %xmm0
; AVX2-NEXT: .LBB7_2:
; AVX2-NEXT: cmpq $3, %rax
; AVX2-NEXT: movl $3, %ecx
; AVX2-NEXT: cmovbq %rax, %rcx
; AVX2-NEXT: movl %ecx, %eax
-; AVX2-NEXT: vmovsd %xmm1, (%rsp,%rax,8)
+; AVX2-NEXT: vmovsd %xmm0, (%rsp,%rax,8)
; AVX2-NEXT: vmovaps (%rsp), %ymm0
; AVX2-NEXT: movq %rbp, %rsp
; AVX2-NEXT: popq %rbp
@@ -1012,12 +1013,13 @@ define <8 x double> @test_compress_v8f64(<8 x double> %vec, <8 x i1> %mask, <8 x
; AVX2-NEXT: vpextrw $1, %xmm2, %ecx
; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: addq %rax, %rcx
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vmovlps %xmm0, (%rsp,%rcx,8)
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX2-NEXT: vmovlps %xmm4, (%rsp,%rcx,8)
; AVX2-NEXT: vpextrw $2, %xmm2, %eax
; AVX2-NEXT: andl $1, %eax
; AVX2-NEXT: addq %rcx, %rax
-; AVX2-NEXT: vmovhps %xmm0, (%rsp,%rax,8)
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3]
+; AVX2-NEXT: vmovlps %xmm0, (%rsp,%rax,8)
; AVX2-NEXT: vpextrw $3, %xmm2, %ecx
; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: addq %rax, %rcx
@@ -1026,36 +1028,37 @@ define <8 x double> @test_compress_v8f64(<8 x double> %vec, <8 x i1> %mask, <8 x
; AVX2-NEXT: addq %rcx, %rax
; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
; AVX2-NEXT: andl $7, %ecx
-; AVX2-NEXT: vmovlpd %xmm1, (%rsp,%rcx,8)
+; AVX2-NEXT: vmovlps %xmm1, (%rsp,%rcx,8)
; AVX2-NEXT: vpextrw $5, %xmm2, %ecx
; AVX2-NEXT: andl $1, %ecx
; AVX2-NEXT: addq %rax, %rcx
; AVX2-NEXT: # kill: def $eax killed $eax killed $rax def $rax
; AVX2-NEXT: andl $7, %eax
-; AVX2-NEXT: vmovhpd %xmm1, (%rsp,%rax,8)
+; AVX2-NEXT: vmovhps %xmm1, (%rsp,%rax,8)
; AVX2-NEXT: vpextrw $6, %xmm2, %edx
; AVX2-NEXT: andl $1, %edx
; AVX2-NEXT: addq %rcx, %rdx
; AVX2-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
; AVX2-NEXT: andl $7, %ecx
; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm0
-; AVX2-NEXT: vmovlpd %xmm0, (%rsp,%rcx,8)
+; AVX2-NEXT: vmovlps %xmm0, (%rsp,%rcx,8)
; AVX2-NEXT: vpextrw $7, %xmm2, %eax
; AVX2-NEXT: andl $1, %eax
; AVX2-NEXT: addq %rdx, %rax
; AVX2-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx
; AVX2-NEXT: andl $7, %edx
-; AVX2-NEXT: vmovhpd %xmm0, (%rsp,%rdx,8)
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm1[3,2,2,3]
+; AVX2-NEXT: vmovlps %xmm0, (%rsp,%rdx,8)
; AVX2-NEXT: cmpq $8, %rax
-; AVX2-NEXT: jb .LBB11_2
+; AVX2-NEXT: jae .LBB11_2
; AVX2-NEXT: # %bb.1:
-; AVX2-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0]
+; AVX2-NEXT: vmovaps %xmm3, %xmm0
; AVX2-NEXT: .LBB11_2:
; AVX2-NEXT: cmpq $7, %rax
; AVX2-NEXT: movl $7, %ecx
; AVX2-NEXT: cmovbq %rax, %rcx
; AVX2-NEXT: movl %ecx, %eax
-; AVX2-NEXT: vmovsd %xmm3, (%rsp,%rax,8)
+; AVX2-NEXT: vmovsd %xmm0, (%rsp,%rax,8)
; AVX2-NEXT: vmovaps (%rsp), %ymm0
; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1
; AVX2-NEXT: movq %rbp, %rsp
diff --git a/llvm/test/CodeGen/X86/vector-half-conversions.ll b/llvm/test/CodeGen/X86/vector-half-conversions.ll
index bdbe3c09e5782..c2142931f02b7 100644
--- a/llvm/test/CodeGen/X86/vector-half-conversions.ll
+++ b/llvm/test/CodeGen/X86/vector-half-conversions.ll
@@ -3182,26 +3182,27 @@ define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) nounwind {
; AVX2-LABEL: cvt_4f64_to_4i16:
; AVX2: # %bb.0:
; AVX2-NEXT: subq $88, %rsp
-; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
+; AVX2-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2 at PLT
; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2 at PLT
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
-; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT: vpermpd $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm0 = mem[3,2,2,3]
+; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2 at PLT
-; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
+; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2 at PLT
-; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
@@ -3245,27 +3246,28 @@ define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) nounwind {
; AVX512-NEXT: subq $72, %rsp
; AVX512-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2 at PLT
-; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = mem[1,0]
+; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vpermpd $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; AVX512-NEXT: # ymm0 = mem[3,2,2,3]
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2 at PLT
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2 at PLT
-; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX512-NEXT: # xmm0 = mem[1,0]
; AVX512-NEXT: callq __truncdfhf2 at PLT
-; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: callq __truncdfhf2 at PLT
@@ -3314,26 +3316,27 @@ define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind {
; AVX2-LABEL: cvt_4f64_to_8i16_undef:
; AVX2: # %bb.0:
; AVX2-NEXT: subq $88, %rsp
-; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
+; AVX2-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2 at PLT
; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2 at PLT
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
-; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT: vpermpd $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm0 = mem[3,2,2,3]
+; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2 at PLT
-; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
+; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2 at PLT
-; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
@@ -3377,27 +3380,28 @@ define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind {
; AVX512-NEXT: subq $72, %rsp
; AVX512-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2 at PLT
-; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = mem[1,0]
+; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vpermpd $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; AVX512-NEXT: # ymm0 = mem[3,2,2,3]
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2 at PLT
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2 at PLT
-; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX512-NEXT: # xmm0 = mem[1,0]
; AVX512-NEXT: callq __truncdfhf2 at PLT
-; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: callq __truncdfhf2 at PLT
@@ -3447,26 +3451,27 @@ define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind {
; AVX2-LABEL: cvt_4f64_to_8i16_zero:
; AVX2: # %bb.0:
; AVX2-NEXT: subq $88, %rsp
-; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
+; AVX2-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2 at PLT
; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2 at PLT
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
-; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT: vpermpd $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm0 = mem[3,2,2,3]
+; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2 at PLT
-; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
+; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2 at PLT
-; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
@@ -3510,27 +3515,28 @@ define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind {
; AVX512-NEXT: subq $72, %rsp
; AVX512-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2 at PLT
-; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = mem[1,0]
+; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vpermpd $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; AVX512-NEXT: # ymm0 = mem[3,2,2,3]
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2 at PLT
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2 at PLT
-; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX512-NEXT: # xmm0 = mem[1,0]
; AVX512-NEXT: callq __truncdfhf2 at PLT
-; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-NEXT: vinsertps $28, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: vinsertps $28, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],zero,zero
; AVX512-NEXT: addq $72, %rsp
; AVX512-NEXT: retq
@@ -3541,63 +3547,123 @@ define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind {
}
define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) nounwind {
-; AVX-LABEL: cvt_8f64_to_8i16:
-; AVX: # %bb.0:
-; AVX-NEXT: subq $104, %rsp
-; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vextractf128 $1, %ymm1, %xmm0
-; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: callq __truncdfhf2 at PLT
-; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = mem[1,0]
-; AVX-NEXT: callq __truncdfhf2 at PLT
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: callq __truncdfhf2 at PLT
-; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = mem[1,0]
-; AVX-NEXT: callq __truncdfhf2 at PLT
-; AVX-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: callq __truncdfhf2 at PLT
-; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = mem[1,0]
-; AVX-NEXT: callq __truncdfhf2 at PLT
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: callq __truncdfhf2 at PLT
-; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = mem[1,0]
-; AVX-NEXT: callq __truncdfhf2 at PLT
-; AVX-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
-; AVX-NEXT: addq $104, %rsp
-; AVX-NEXT: retq
+; AVX1-LABEL: cvt_8f64_to_8i16:
+; AVX1: # %bb.0:
+; AVX1-NEXT: subq $104, %rsp
+; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2 at PLT
+; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm0 = mem[1,0]
+; AVX1-NEXT: callq __truncdfhf2 at PLT
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2 at PLT
+; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm0 = mem[1,0]
+; AVX1-NEXT: callq __truncdfhf2 at PLT
+; AVX1-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX1-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2 at PLT
+; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm0 = mem[1,0]
+; AVX1-NEXT: callq __truncdfhf2 at PLT
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2 at PLT
+; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm0 = mem[1,0]
+; AVX1-NEXT: callq __truncdfhf2 at PLT
+; AVX1-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX1-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX1-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX1-NEXT: addq $104, %rsp
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: cvt_8f64_to_8i16:
+; AVX2: # %bb.0:
+; AVX2-NEXT: subq $104, %rsp
+; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2 at PLT
+; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX2-NEXT: vpermpd $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm0 = mem[3,2,2,3]
+; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2 at PLT
+; AVX2-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX2-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2 at PLT
+; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT: # xmm0 = mem[1,0]
+; AVX2-NEXT: callq __truncdfhf2 at PLT
+; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX2-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2 at PLT
+; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX2-NEXT: vpermpd $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm0 = mem[3,2,2,3]
+; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2 at PLT
+; AVX2-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX2-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2 at PLT
+; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT: # xmm0 = mem[1,0]
+; AVX2-NEXT: callq __truncdfhf2 at PLT
+; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX2-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX2-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX2-NEXT: addq $104, %rsp
+; AVX2-NEXT: retq
;
; F16C-LABEL: cvt_8f64_to_8i16:
; F16C: # %bb.0:
@@ -3686,20 +3752,21 @@ define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) nounwind {
; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vpermpd $235, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
+; AVX512-NEXT: # zmm0 = mem[3,2,2,3,7,6,6,7]
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2 at PLT
; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = mem[1,0]
+; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2 at PLT
-; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2 at PLT
; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -3708,8 +3775,8 @@ define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) nounwind {
; AVX512-NEXT: callq __truncdfhf2 at PLT
; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX512-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX512-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX512-NEXT: addq $120, %rsp
@@ -3865,24 +3932,25 @@ define void @store_cvt_4f64_to_4i16(<4 x double> %a0, ptr %a1) nounwind {
; AVX2-NEXT: pushq %rbx
; AVX2-NEXT: subq $80, %rsp
; AVX2-NEXT: movq %rdi, %rbx
-; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
+; AVX2-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2 at PLT
; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT: vpermpd $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm0 = mem[3,2,2,3]
+; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2 at PLT
-; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2 at PLT
; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2 at PLT
; AVX2-NEXT: vpextrw $0, %xmm0, 4(%rbx)
; AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
@@ -3936,27 +4004,28 @@ define void @store_cvt_4f64_to_4i16(<4 x double> %a0, ptr %a1) nounwind {
; AVX512-NEXT: movq %rdi, %rbx
; AVX512-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2 at PLT
-; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = mem[1,0]
+; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vpermpd $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; AVX512-NEXT: # ymm0 = mem[3,2,2,3]
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2 at PLT
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2 at PLT
-; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX512-NEXT: # xmm0 = mem[1,0]
; AVX512-NEXT: callq __truncdfhf2 at PLT
-; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX512-NEXT: vmovq %xmm0, (%rbx)
; AVX512-NEXT: addq $64, %rsp
@@ -4009,26 +4078,27 @@ define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, ptr %a1) nounwind {
; AVX2-NEXT: pushq %rbx
; AVX2-NEXT: subq $80, %rsp
; AVX2-NEXT: movq %rdi, %rbx
-; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
+; AVX2-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2 at PLT
; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2 at PLT
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
-; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT: vpermpd $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm0 = mem[3,2,2,3]
+; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2 at PLT
-; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
+; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2 at PLT
-; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
@@ -4080,27 +4150,28 @@ define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, ptr %a1) nounwind {
; AVX512-NEXT: movq %rdi, %rbx
; AVX512-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2 at PLT
-; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = mem[1,0]
+; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vpermpd $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; AVX512-NEXT: # ymm0 = mem[3,2,2,3]
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2 at PLT
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2 at PLT
-; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX512-NEXT: # xmm0 = mem[1,0]
; AVX512-NEXT: callq __truncdfhf2 at PLT
-; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: callq __truncdfhf2 at PLT
@@ -4159,26 +4230,27 @@ define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, ptr %a1) nounwind {
; AVX2-NEXT: pushq %rbx
; AVX2-NEXT: subq $80, %rsp
; AVX2-NEXT: movq %rdi, %rbx
-; AVX2-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
+; AVX2-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2 at PLT
; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2 at PLT
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill
-; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT: vpermpd $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm0 = mem[3,2,2,3]
+; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2 at PLT
-; AVX2-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
+; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vzeroupper
; AVX2-NEXT: callq __truncdfhf2 at PLT
-; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
@@ -4230,27 +4302,28 @@ define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, ptr %a1) nounwind {
; AVX512-NEXT: movq %rdi, %rbx
; AVX512-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2 at PLT
-; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = mem[1,0]
+; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vpermpd $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; AVX512-NEXT: # ymm0 = mem[3,2,2,3]
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2 at PLT
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2 at PLT
-; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX512-NEXT: # xmm0 = mem[1,0]
; AVX512-NEXT: callq __truncdfhf2 at PLT
-; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-NEXT: vinsertps $28, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: vinsertps $28, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],zero,zero
; AVX512-NEXT: vmovaps %xmm0, (%rbx)
; AVX512-NEXT: addq $64, %rsp
@@ -4264,67 +4337,131 @@ define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, ptr %a1) nounwind {
}
define void @store_cvt_8f64_to_8i16(<8 x double> %a0, ptr %a1) nounwind {
-; AVX-LABEL: store_cvt_8f64_to_8i16:
-; AVX: # %bb.0:
-; AVX-NEXT: pushq %rbx
-; AVX-NEXT: subq $96, %rsp
-; AVX-NEXT: movq %rdi, %rbx
-; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vextractf128 $1, %ymm1, %xmm0
-; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: callq __truncdfhf2 at PLT
-; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = mem[1,0]
-; AVX-NEXT: callq __truncdfhf2 at PLT
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: callq __truncdfhf2 at PLT
-; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = mem[1,0]
-; AVX-NEXT: callq __truncdfhf2 at PLT
-; AVX-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: callq __truncdfhf2 at PLT
-; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = mem[1,0]
-; AVX-NEXT: callq __truncdfhf2 at PLT
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: callq __truncdfhf2 at PLT
-; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = mem[1,0]
-; AVX-NEXT: callq __truncdfhf2 at PLT
-; AVX-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
-; AVX-NEXT: vmovdqa %xmm0, (%rbx)
-; AVX-NEXT: addq $96, %rsp
-; AVX-NEXT: popq %rbx
-; AVX-NEXT: retq
+; AVX1-LABEL: store_cvt_8f64_to_8i16:
+; AVX1: # %bb.0:
+; AVX1-NEXT: pushq %rbx
+; AVX1-NEXT: subq $96, %rsp
+; AVX1-NEXT: movq %rdi, %rbx
+; AVX1-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2 at PLT
+; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm0 = mem[1,0]
+; AVX1-NEXT: callq __truncdfhf2 at PLT
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2 at PLT
+; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm0 = mem[1,0]
+; AVX1-NEXT: callq __truncdfhf2 at PLT
+; AVX1-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX1-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2 at PLT
+; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpermilpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm0 = mem[1,0]
+; AVX1-NEXT: callq __truncdfhf2 at PLT
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: callq __truncdfhf2 at PLT
+; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm0 = mem[1,0]
+; AVX1-NEXT: callq __truncdfhf2 at PLT
+; AVX1-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX1-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX1-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX1-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX1-NEXT: vmovdqa %xmm0, (%rbx)
+; AVX1-NEXT: addq $96, %rsp
+; AVX1-NEXT: popq %rbx
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: store_cvt_8f64_to_8i16:
+; AVX2: # %bb.0:
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: subq $96, %rsp
+; AVX2-NEXT: movq %rdi, %rbx
+; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2 at PLT
+; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX2-NEXT: vpermpd $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm0 = mem[3,2,2,3]
+; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2 at PLT
+; AVX2-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX2-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2 at PLT
+; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT: # xmm0 = mem[1,0]
+; AVX2-NEXT: callq __truncdfhf2 at PLT
+; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX2-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2 at PLT
+; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX2-NEXT: vpermpd $235, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm0 = mem[3,2,2,3]
+; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2 at PLT
+; AVX2-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX2-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: callq __truncdfhf2 at PLT
+; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT: # xmm0 = mem[1,0]
+; AVX2-NEXT: callq __truncdfhf2 at PLT
+; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX2-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX2-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX2-NEXT: vmovdqa %xmm0, (%rbx)
+; AVX2-NEXT: addq $96, %rsp
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: retq
;
; F16C-LABEL: store_cvt_8f64_to_8i16:
; F16C: # %bb.0:
@@ -4419,20 +4556,21 @@ define void @store_cvt_8f64_to_8i16(<8 x double> %a0, ptr %a1) nounwind {
; AVX512-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vpermpd $235, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Folded Reload
+; AVX512-NEXT: # zmm0 = mem[3,2,2,3,7,6,6,7]
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2 at PLT
; AVX512-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX512-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm0 = mem[1,0]
+; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2 at PLT
-; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: callq __truncdfhf2 at PLT
; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -4441,8 +4579,8 @@ define void @store_cvt_8f64_to_8i16(<8 x double> %a0, ptr %a1) nounwind {
; AVX512-NEXT: callq __truncdfhf2 at PLT
; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX512-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX512-NEXT: # xmm0 = xmm0[0],mem[0]
; AVX512-NEXT: vmovdqa %xmm0, (%rbx)
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
index a39bc6b668669..9704bf59fdaa2 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
@@ -845,43 +845,43 @@ define void @load_i16_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
;
; AVX2-LABEL: load_i16_stride3_vf16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
-; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2
-; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7],ymm2[8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14],ymm3[15]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
-; AVX2-NEXT: vmovdqa 80(%rdi), %xmm3
-; AVX2-NEXT: vmovdqa 64(%rdi), %xmm4
-; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11]
-; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0,1,2],ymm5[3,4,5,6,7],ymm2[8,9,10],ymm5[11,12,13,14,15]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-NEXT: vmovdqa (%rdi), %ymm2
+; AVX2-NEXT: vmovdqa 32(%rdi), %ymm3
+; AVX2-NEXT: vmovdqa 64(%rdi), %xmm0
+; AVX2-NEXT: vmovdqa 80(%rdi), %xmm1
+; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11]
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
-; AVX2-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm5
-; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,4,5,10,11,0,1,6,7,12,13]
-; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,6,7,4]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
+; AVX2-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm5
+; AVX2-NEXT: vpshufb {{.*#+}} ymm6 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,28,29,u,u,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3,4,5,6,7],ymm6[8,9,10],ymm4[11,12,13,14,15]
+; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3],xmm6[4],xmm5[5,6],xmm6[7]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,4,5,10,11,0,1,6,7,12,13]
+; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
+; AVX2-NEXT: vpblendvb %ymm6, %ymm3, %ymm2, %ymm6
+; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25,30,31,u,u,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3,4,5,6,7],ymm7[8,9,10],ymm5[11,12,13,14,15]
+; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7
+; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2],xmm6[3,4],xmm7[5],xmm6[6,7]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0]
-; AVX2-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7]
-; AVX2-NEXT: vpshufb %xmm1, %xmm3, %xmm1
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-NEXT: vmovdqa %ymm2, (%rsi)
+; AVX2-NEXT: vpblendvb %ymm6, %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7],ymm3[8],ymm2[9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
+; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7]
+; AVX2-NEXT: vmovdqa %ymm4, (%rsi)
; AVX2-NEXT: vmovdqa %ymm5, (%rdx)
; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
; AVX2-NEXT: vzeroupper
@@ -889,43 +889,43 @@ define void @load_i16_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
;
; AVX2-FP-LABEL: load_i16_stride3_vf16:
; AVX2-FP: # %bb.0:
-; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
-; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7],ymm2[8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14],ymm3[15]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
-; AVX2-FP-NEXT: vmovdqa 80(%rdi), %xmm3
-; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm4
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11]
-; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0,1,2],ymm5[3,4,5,6,7],ymm2[8,9,10],ymm5[11,12,13,14,15]
-; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm2
+; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm3
+; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm0
+; AVX2-FP-NEXT: vmovdqa 80(%rdi), %xmm1
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11]
+; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
-; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm5
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,4,5,10,11,0,1,6,7,12,13]
-; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15]
-; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,6,7,4]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
+; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm5
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm6 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,28,29,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3,4,5,6,7],ymm6[8,9,10],ymm4[11,12,13,14,15]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3],xmm6[4],xmm5[5,6],xmm6[7]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,4,5,10,11,0,1,6,7,12,13]
+; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
+; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm3, %ymm2, %ymm6
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25,30,31,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3,4,5,6,7],ymm7[8,9,10],ymm5[11,12,13,14,15]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm7
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2],xmm6[3,4],xmm7[5],xmm6[6,7]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0]
-; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm0
-; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpshufb %ymm1, %ymm0, %ymm0
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7]
-; AVX2-FP-NEXT: vpshufb %xmm1, %xmm3, %xmm1
-; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-FP-NEXT: vmovdqa %ymm2, (%rsi)
+; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm3, %ymm2, %ymm2
+; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7],ymm3[8],ymm2[9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
+; AVX2-FP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7]
+; AVX2-FP-NEXT: vmovdqa %ymm4, (%rsi)
; AVX2-FP-NEXT: vmovdqa %ymm5, (%rdx)
; AVX2-FP-NEXT: vmovdqa %ymm0, (%rcx)
; AVX2-FP-NEXT: vzeroupper
@@ -933,43 +933,43 @@ define void @load_i16_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
;
; AVX2-FCP-LABEL: load_i16_stride3_vf16:
; AVX2-FCP: # %bb.0:
-; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
-; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7],ymm2[8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14],ymm3[15]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
-; AVX2-FCP-NEXT: vmovdqa 80(%rdi), %xmm3
-; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm4
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11]
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0,1,2],ymm5[3,4,5,6,7],ymm2[8,9,10],ymm5[11,12,13,14,15]
-; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm2
+; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm3
+; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm0
+; AVX2-FCP-NEXT: vmovdqa 80(%rdi), %xmm1
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11]
+; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
-; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm5
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,4,5,10,11,0,1,6,7,12,13]
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15]
-; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,6,7,4]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
+; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm5
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,28,29,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3,4,5,6,7],ymm6[8,9,10],ymm4[11,12,13,14,15]
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3],xmm6[4],xmm5[5,6],xmm6[7]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,4,5,10,11,0,1,6,7,12,13]
+; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
+; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm3, %ymm2, %ymm6
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25,30,31,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3,4,5,6,7],ymm7[8,9,10],ymm5[11,12,13,14,15]
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2],xmm6[3,4],xmm7[5],xmm6[6,7]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0]
-; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm0
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7]
-; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm3, %xmm1
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-FCP-NEXT: vmovdqa %ymm2, (%rsi)
+; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm3, %ymm2, %ymm2
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7],ymm3[8],ymm2[9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15]
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
+; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7]
+; AVX2-FCP-NEXT: vmovdqa %ymm4, (%rsi)
; AVX2-FCP-NEXT: vmovdqa %ymm5, (%rdx)
; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rcx)
; AVX2-FCP-NEXT: vzeroupper
@@ -977,176 +977,172 @@ define void @load_i16_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
;
; AVX512-LABEL: load_i16_stride3_vf16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512-NEXT: vmovdqa (%rdi), %ymm2
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
-; AVX512-NEXT: vmovdqa %ymm0, %ymm3
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm1 ^ (ymm3 & (ymm2 ^ ymm1))
-; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14],ymm4[15]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
-; AVX512-NEXT: vmovdqa 80(%rdi), %xmm4
-; AVX512-NEXT: vmovdqa 64(%rdi), %xmm5
-; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11]
-; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2],ymm6[3,4,5,6,7],ymm3[8,9,10],ymm6[11,12,13,14,15]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm2 ^ (ymm6 & (ymm1 ^ ymm2))
-; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,4,5,10,11,0,1,6,7,12,13]
+; AVX512-NEXT: vmovdqa 32(%rdi), %ymm0
+; AVX512-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
+; AVX512-NEXT: vmovdqa %ymm2, %ymm3
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm0 ^ (ymm3 & (ymm1 ^ ymm0))
+; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,28,29,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vmovdqa 80(%rdi), %xmm5
+; AVX512-NEXT: vmovdqa 64(%rdi), %xmm6
+; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0,1],xmm5[2],xmm6[3,4],xmm5[5],xmm6[6,7]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11]
; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1,2],ymm7[3,4,5,6,7],ymm6[8,9,10],ymm7[11,12,13,14,15]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,6,7,4]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm0 & (ymm1 ^ ymm2))
-; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm7[3,4,5,6,7],ymm4[8,9,10],ymm7[11,12,13,14,15]
+; AVX512-NEXT: vmovdqa 16(%rdi), %xmm7
+; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm7[1],xmm3[2,3],xmm7[4],xmm3[5,6],xmm7[7]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm6[2],xmm5[3,4],xmm6[5],xmm5[6,7]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,4,5,10,11,0,1,6,7,12,13]
+; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm1 ^ (ymm8 & (ymm0 ^ ymm1))
+; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25,30,31,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm9[0,1,2],ymm4[3,4,5,6,7],ymm9[8,9,10],ymm4[11,12,13,14,15]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2],xmm8[3,4],xmm7[5],xmm8[6,7]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7]
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm1 ^ (ymm2 & (ymm0 ^ ymm1))
+; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm2[2,3,0,1]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7],ymm0[8],ymm2[9,10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15]
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm4[1],xmm5[2,3],xmm4[4],xmm5[5,6],xmm4[7]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6],xmm5[7]
; AVX512-NEXT: vpshufb %xmm1, %xmm2, %xmm1
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX512-NEXT: vmovdqa %ymm3, (%rsi)
-; AVX512-NEXT: vmovdqa %ymm6, (%rdx)
+; AVX512-NEXT: vmovdqa %ymm4, (%rdx)
; AVX512-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: load_i16_stride3_vf16:
; AVX512-FCP: # %bb.0:
-; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm2
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
-; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm3
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm1 ^ (ymm3 & (ymm2 ^ ymm1))
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14],ymm4[15]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
-; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm4
-; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm5
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11]
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2],ymm6[3,4,5,6,7],ymm3[8,9,10],ymm6[11,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm2 ^ (ymm6 & (ymm1 ^ ymm2))
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,4,5,10,11,0,1,6,7,12,13]
+; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm0
+; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
+; AVX512-FCP-NEXT: vmovdqa %ymm2, %ymm3
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm0 ^ (ymm3 & (ymm1 ^ ymm0))
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,28,29,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm5
+; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm6
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0,1],xmm5[2],xmm6[3,4],xmm5[5],xmm6[6,7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11]
; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1,2],ymm7[3,4,5,6,7],ymm6[8,9,10],ymm7[11,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,6,7,4]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm0 & (ymm1 ^ ymm2))
-; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm7[3,4,5,6,7],ymm4[8,9,10],ymm7[11,12,13,14,15]
+; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm7
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm7[1],xmm3[2,3],xmm7[4],xmm3[5,6],xmm7[7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm6[2],xmm5[3,4],xmm6[5],xmm5[6,7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,4,5,10,11,0,1,6,7,12,13]
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm1 ^ (ymm8 & (ymm0 ^ ymm1))
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25,30,31,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm9[0,1,2],ymm4[3,4,5,6,7],ymm9[8,9,10],ymm4[11,12,13,14,15]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2],xmm8[3,4],xmm7[5],xmm8[6,7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm1 ^ (ymm2 & (ymm0 ^ ymm1))
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm2[2,3,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7],ymm0[8],ymm2[9,10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15]
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm4[1],xmm5[2,3],xmm4[4],xmm5[5,6],xmm4[7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6],xmm5[7]
; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm1
; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX512-FCP-NEXT: vmovdqa %ymm3, (%rsi)
-; AVX512-FCP-NEXT: vmovdqa %ymm6, (%rdx)
+; AVX512-FCP-NEXT: vmovdqa %ymm4, (%rdx)
; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: load_i16_stride3_vf16:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm2
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
-; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm3
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm1 ^ (ymm3 & (ymm2 ^ ymm1))
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14],ymm4[15]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
-; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm4
-; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm5
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2],ymm6[3,4,5,6,7],ymm3[8,9,10],ymm6[11,12,13,14,15]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm2 ^ (ymm6 & (ymm1 ^ ymm2))
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,4,5,10,11,0,1,6,7,12,13]
+; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm0
+; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
+; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm3
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm0 ^ (ymm3 & (ymm1 ^ ymm0))
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,28,29,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm5
+; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm6
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0,1],xmm5[2],xmm6[3,4],xmm5[5],xmm6[6,7]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11]
; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1,2],ymm7[3,4,5,6,7],ymm6[8,9,10],ymm7[11,12,13,14,15]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,6,7,4]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm0 & (ymm1 ^ ymm2))
-; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm7[3,4,5,6,7],ymm4[8,9,10],ymm7[11,12,13,14,15]
+; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm7
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm7[1],xmm3[2,3],xmm7[4],xmm3[5,6],xmm7[7]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm6[2],xmm5[3,4],xmm6[5],xmm5[6,7]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,4,5,10,11,0,1,6,7,12,13]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm1 ^ (ymm8 & (ymm0 ^ ymm1))
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25,30,31,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm9[0,1,2],ymm4[3,4,5,6,7],ymm9[8,9,10],ymm4[11,12,13,14,15]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2],xmm8[3,4],xmm7[5],xmm8[6,7]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm1 ^ (ymm2 & (ymm0 ^ ymm1))
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm2[2,3,0,1]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7],ymm0[8],ymm2[9,10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15]
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpshufb %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm4[1],xmm5[2,3],xmm4[4],xmm5[5,6],xmm4[7]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6],xmm5[7]
; AVX512DQ-NEXT: vpshufb %xmm1, %xmm2, %xmm1
; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX512DQ-NEXT: vmovdqa %ymm3, (%rsi)
-; AVX512DQ-NEXT: vmovdqa %ymm6, (%rdx)
+; AVX512DQ-NEXT: vmovdqa %ymm4, (%rdx)
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: load_i16_stride3_vf16:
; AVX512DQ-FCP: # %bb.0:
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm3
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm1 ^ (ymm3 & (ymm2 ^ ymm1))
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14],ymm4[15]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
-; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm4
-; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm5
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11]
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2],ymm6[3,4,5,6,7],ymm3[8,9,10],ymm6[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm2 ^ (ymm6 & (ymm1 ^ ymm2))
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,4,5,10,11,0,1,6,7,12,13]
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, %ymm3
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm0 ^ (ymm3 & (ymm1 ^ ymm0))
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,28,29,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm5
+; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm6
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0,1],xmm5[2],xmm6[3,4],xmm5[5],xmm6[6,7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11]
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0,1,2],ymm7[3,4,5,6,7],ymm6[8,9,10],ymm7[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,6,7,4]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm0 & (ymm1 ^ ymm2))
-; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm7[3,4,5,6,7],ymm4[8,9,10],ymm7[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm7
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm7[1],xmm3[2,3],xmm7[4],xmm3[5,6],xmm7[7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm6[2],xmm5[3,4],xmm6[5],xmm5[6,7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,4,5,10,11,0,1,6,7,12,13]
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm1 ^ (ymm8 & (ymm0 ^ ymm1))
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25,30,31,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm9[0,1,2],ymm4[3,4,5,6,7],ymm9[8,9,10],ymm4[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2],xmm8[3,4],xmm7[5],xmm8[6,7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm1 ^ (ymm2 & (ymm0 ^ ymm1))
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm2[2,3,0,1]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7],ymm0[8],ymm2[9,10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15]
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm4[1],xmm5[2,3],xmm4[4],xmm5[5,6],xmm4[7]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6],xmm5[7]
; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm1
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, (%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, (%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
@@ -1501,22 +1497,25 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm2[0,1],xmm5[2],xmm2[3,4],xmm5[5],xmm2[6,7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm6[1],xmm14[2,3],xmm6[4],xmm14[5,6],xmm6[7]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm15 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
-; AVX-NEXT: vpshufb %xmm15, %xmm14, %xmm14
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm3[2],xmm1[3,4],xmm3[5],xmm1[6,7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1,2],xmm4[3],xmm0[4,5],xmm4[6],xmm0[7]
-; AVX-NEXT: vpshufb %xmm15, %xmm0, %xmm0
-; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm8[2],xmm7[3,4],xmm8[5],xmm7[6,7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm9[1],xmm0[2,3],xmm9[4],xmm0[5,6],xmm9[7]
-; AVX-NEXT: vpshufb %xmm15, %xmm0, %xmm0
-; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm11[0,1],xmm10[2],xmm11[3,4],xmm10[5],xmm11[6,7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm12[0],xmm13[1,2],xmm12[3],xmm13[4,5],xmm12[6],xmm13[7]
-; AVX-NEXT: vpshufb %xmm15, %xmm13, %xmm13
-; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm13, %ymm0
+; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
+; AVX-NEXT: vmovq {{.*#+}} xmm15 = [2,3,8,9,14,15,0,0,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpshufb %xmm15, %xmm6, %xmm13
+; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3,4,5,6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm1[0,1],xmm3[2],xmm1[3,4],xmm3[5],xmm1[6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm4[0],xmm14[1,2],xmm4[3],xmm14[4,5],xmm4[6],xmm14[7]
+; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
+; AVX-NEXT: vpshufb %xmm0, %xmm14, %xmm14
+; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm14
+; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm7[0,1],xmm8[2],xmm7[3,4],xmm8[5],xmm7[6,7]
+; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
+; AVX-NEXT: vpshufb %xmm15, %xmm9, %xmm15
+; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3,4,5,6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm15 = xmm11[0,1],xmm10[2],xmm11[3,4],xmm10[5],xmm11[6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm15 = xmm12[0],xmm15[1,2],xmm12[3],xmm15[4,5],xmm12[6],xmm15[7]
+; AVX-NEXT: vpshufb %xmm0, %xmm15, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm0
; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3],xmm2[4],xmm5[5,6],xmm2[7]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,u,u,0,1,6,7,12,13,2,3,8,9,14,15]
+; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
; AVX-NEXT: vpshufb %xmm5, %xmm2, %xmm2
; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[2,1,2,3,4,5,6,7]
@@ -1548,252 +1547,258 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
;
; AVX2-LABEL: load_i16_stride3_vf32:
; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa 96(%rdi), %ymm4
+; AVX2-NEXT: vmovdqa 128(%rdi), %ymm5
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX2-NEXT: vmovdqa 96(%rdi), %ymm2
-; AVX2-NEXT: vmovdqa 128(%rdi), %ymm4
; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
-; AVX2-NEXT: vpblendvb %ymm7, %ymm2, %ymm4, %ymm3
-; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm3[2,3,0,1]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6],ymm5[7],ymm3[8],ymm5[9],ymm3[10,11],ymm5[12],ymm3[13,14],ymm5[15]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
-; AVX2-NEXT: vpshufb %ymm8, %ymm3, %ymm3
-; AVX2-NEXT: vmovdqa 176(%rdi), %xmm5
-; AVX2-NEXT: vmovdqa 160(%rdi), %xmm6
-; AVX2-NEXT: vpblendw {{.*#+}} xmm9 = xmm6[0,1],xmm5[2],xmm6[3,4],xmm5[5],xmm6[6,7]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm10 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
-; AVX2-NEXT: vpshufb %xmm10, %xmm9, %xmm9
-; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm3[0,1,2],ymm9[3,4,5,6,7],ymm3[8,9,10],ymm9[11,12,13,14,15]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm7
-; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm7[2,3,0,1]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5,6],ymm9[7],ymm7[8],ymm9[9],ymm7[10,11],ymm9[12],ymm7[13,14],ymm9[15]
-; AVX2-NEXT: vpshufb %ymm8, %ymm7, %ymm9
-; AVX2-NEXT: vmovdqa 80(%rdi), %xmm7
-; AVX2-NEXT: vmovdqa 64(%rdi), %xmm8
-; AVX2-NEXT: vpblendw {{.*#+}} xmm11 = xmm8[0,1],xmm7[2],xmm8[3,4],xmm7[5],xmm8[6,7]
-; AVX2-NEXT: vpshufb %xmm10, %xmm11, %xmm10
+; AVX2-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm6
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm8 = [16,17,22,23,28,29,0,0,16,17,22,23,28,29,0,0,16,17,22,23,28,29,0,0,16,17,22,23,28,29,0,0]
+; AVX2-NEXT: vpshufb %ymm8, %ymm6, %ymm9
+; AVX2-NEXT: vmovdqa 64(%rdi), %xmm2
+; AVX2-NEXT: vmovdqa 80(%rdi), %xmm3
+; AVX2-NEXT: vpblendw {{.*#+}} xmm10 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6,7]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
+; AVX2-NEXT: vpshufb %xmm11, %xmm10, %xmm10
; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm9[0,1,2],ymm10[3,4,5,6,7],ymm9[8,9,10],ymm10[11,12,13,14,15]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,4,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7],ymm9[8,9,10],ymm10[11,12,13,14,15]
+; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm10
+; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm10[1],xmm6[2,3],xmm10[4],xmm6[5,6],xmm10[7]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11]
+; AVX2-NEXT: vpshufb %xmm10, %xmm6, %xmm6
+; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7]
+; AVX2-NEXT: vpblendvb %ymm7, %ymm4, %ymm5, %ymm7
+; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm9
+; AVX2-NEXT: vpblendw {{.*#+}} xmm9 = xmm7[0],xmm9[1],xmm7[2,3],xmm9[4],xmm7[5,6],xmm9[7]
+; AVX2-NEXT: vpshufb %xmm10, %xmm9, %xmm10
+; AVX2-NEXT: vpshufb %ymm8, %ymm7, %ymm7
+; AVX2-NEXT: vmovdqa 160(%rdi), %xmm8
+; AVX2-NEXT: vmovdqa 176(%rdi), %xmm9
+; AVX2-NEXT: vpblendw {{.*#+}} xmm12 = xmm8[0,1],xmm9[2],xmm8[3,4],xmm9[5],xmm8[6,7]
+; AVX2-NEXT: vpshufb %xmm11, %xmm12, %xmm11
+; AVX2-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3,4,5,6,7],ymm7[8,9,10],ymm11[11,12,13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3],ymm7[4,5,6,7]
; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
-; AVX2-NEXT: vpblendvb %ymm11, %ymm4, %ymm2, %ymm10
-; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7,8,9],ymm12[10],ymm10[11,12],ymm12[13],ymm10[14,15]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
-; AVX2-NEXT: vpshufb %ymm12, %ymm10, %ymm10
-; AVX2-NEXT: vpblendw {{.*#+}} xmm13 = xmm5[0,1],xmm6[2],xmm5[3,4],xmm6[5],xmm5[6,7]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm14 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
-; AVX2-NEXT: vpshufb %xmm14, %xmm13, %xmm13
-; AVX2-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX2-NEXT: vpblendw {{.*#+}} ymm13 = ymm10[0,1,2],ymm13[3,4,5,6,7],ymm10[8,9,10],ymm13[11,12,13,14,15]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,6,7,4]
+; AVX2-NEXT: vpblendvb %ymm11, %ymm1, %ymm0, %ymm10
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm12 = [18,19,24,25,30,31,0,0,18,19,24,25,30,31,0,0,18,19,24,25,30,31,0,0,18,19,24,25,30,31,0,0]
+; AVX2-NEXT: vpshufb %ymm12, %ymm10, %ymm13
+; AVX2-NEXT: vpblendw {{.*#+}} xmm14 = xmm3[0,1],xmm2[2],xmm3[3,4],xmm2[5],xmm3[6,7]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm15 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
+; AVX2-NEXT: vpshufb %xmm15, %xmm14, %xmm14
+; AVX2-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX2-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7],ymm13[8,9,10],ymm14[11,12,13,14,15]
+; AVX2-NEXT: vextracti128 $1, %ymm10, %xmm14
+; AVX2-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm14[2],xmm10[3,4],xmm14[5],xmm10[6,7]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm14 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
+; AVX2-NEXT: vpshufb %xmm14, %xmm10, %xmm10
; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7]
-; AVX2-NEXT: vpblendvb %ymm11, %ymm1, %ymm0, %ymm11
-; AVX2-NEXT: vpermq {{.*#+}} ymm13 = ymm11[2,3,0,1]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm13[2],ymm11[3,4],ymm13[5],ymm11[6,7,8,9],ymm13[10],ymm11[11,12],ymm13[13],ymm11[14,15]
+; AVX2-NEXT: vpblendvb %ymm11, %ymm5, %ymm4, %ymm11
+; AVX2-NEXT: vextracti128 $1, %ymm11, %xmm13
+; AVX2-NEXT: vpblendw {{.*#+}} xmm13 = xmm11[0,1],xmm13[2],xmm11[3,4],xmm13[5],xmm11[6,7]
+; AVX2-NEXT: vpshufb %xmm14, %xmm13, %xmm13
; AVX2-NEXT: vpshufb %ymm12, %ymm11, %ymm11
-; AVX2-NEXT: vpblendw {{.*#+}} xmm12 = xmm7[0,1],xmm8[2],xmm7[3,4],xmm8[5],xmm7[6,7]
-; AVX2-NEXT: vpshufb %xmm14, %xmm12, %xmm12
+; AVX2-NEXT: vpblendw {{.*#+}} xmm12 = xmm9[0,1],xmm8[2],xmm9[3,4],xmm8[5],xmm9[6,7]
+; AVX2-NEXT: vpshufb %xmm15, %xmm12, %xmm12
; AVX2-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX2-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0,1,2],ymm12[3,4,5,6,7],ymm11[8,9,10],ymm12[11,12,13,14,15]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,6,7,4]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3,4,5,6,7],ymm11[8,9,10],ymm12[11,12,13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7]
; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0]
-; AVX2-NEXT: vpblendvb %ymm12, %ymm4, %ymm2, %ymm2
-; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1,2],ymm4[3],ymm2[4,5],ymm4[6],ymm2[7],ymm4[8],ymm2[9,10],ymm4[11],ymm2[12,13],ymm4[14],ymm2[15]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
-; AVX2-NEXT: vpshufb %ymm4, %ymm2, %ymm2
-; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6],xmm5[7]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
-; AVX2-NEXT: vpshufb %xmm6, %xmm5, %xmm5
-; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7]
+; AVX2-NEXT: vpblendvb %ymm12, %ymm5, %ymm4, %ymm4
+; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7],ymm5[8],ymm4[9,10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
+; AVX2-NEXT: vpshufb %ymm5, %ymm4, %ymm4
+; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3],xmm9[4],xmm8[5,6],xmm9[7]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
+; AVX2-NEXT: vpshufb %xmm9, %xmm8, %xmm8
+; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm8[5,6,7]
; AVX2-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
-; AVX2-NEXT: vpshufb %ymm4, %ymm0, %ymm0
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0],xmm7[1],xmm8[2,3],xmm7[4],xmm8[5,6],xmm7[7]
-; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %ymm5, %ymm0, %ymm0
+; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6],xmm3[7]
+; AVX2-NEXT: vpshufb %xmm9, %xmm1, %xmm1
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-NEXT: vmovdqa %ymm3, 32(%rsi)
-; AVX2-NEXT: vmovdqa %ymm9, (%rsi)
-; AVX2-NEXT: vmovdqa %ymm10, 32(%rdx)
-; AVX2-NEXT: vmovdqa %ymm11, (%rdx)
-; AVX2-NEXT: vmovdqa %ymm2, 32(%rcx)
+; AVX2-NEXT: vmovdqa %ymm7, 32(%rsi)
+; AVX2-NEXT: vmovdqa %ymm6, (%rsi)
+; AVX2-NEXT: vmovdqa %ymm11, 32(%rdx)
+; AVX2-NEXT: vmovdqa %ymm10, (%rdx)
+; AVX2-NEXT: vmovdqa %ymm4, 32(%rcx)
; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX2-FP-LABEL: load_i16_stride3_vf32:
; AVX2-FP: # %bb.0:
+; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm4
+; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm5
; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm2
-; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm4
; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
-; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm2, %ymm4, %ymm3
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm3[2,3,0,1]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6],ymm5[7],ymm3[8],ymm5[9],ymm3[10,11],ymm5[12],ymm3[13,14],ymm5[15]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
-; AVX2-FP-NEXT: vpshufb %ymm8, %ymm3, %ymm3
-; AVX2-FP-NEXT: vmovdqa 176(%rdi), %xmm5
-; AVX2-FP-NEXT: vmovdqa 160(%rdi), %xmm6
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm9 = xmm6[0,1],xmm5[2],xmm6[3,4],xmm5[5],xmm6[6,7]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm10 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
-; AVX2-FP-NEXT: vpshufb %xmm10, %xmm9, %xmm9
-; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm3[0,1,2],ymm9[3,4,5,6,7],ymm3[8,9,10],ymm9[11,12,13,14,15]
-; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm7
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm7[2,3,0,1]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5,6],ymm9[7],ymm7[8],ymm9[9],ymm7[10,11],ymm9[12],ymm7[13,14],ymm9[15]
-; AVX2-FP-NEXT: vpshufb %ymm8, %ymm7, %ymm9
-; AVX2-FP-NEXT: vmovdqa 80(%rdi), %xmm7
-; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm8
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm11 = xmm8[0,1],xmm7[2],xmm8[3,4],xmm7[5],xmm8[6,7]
-; AVX2-FP-NEXT: vpshufb %xmm10, %xmm11, %xmm10
+; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm6
+; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm8 = [16,17,22,23,28,29,0,0,16,17,22,23,28,29,0,0,16,17,22,23,28,29,0,0,16,17,22,23,28,29,0,0]
+; AVX2-FP-NEXT: vpshufb %ymm8, %ymm6, %ymm9
+; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm2
+; AVX2-FP-NEXT: vmovdqa 80(%rdi), %xmm3
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm10 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6,7]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
+; AVX2-FP-NEXT: vpshufb %xmm11, %xmm10, %xmm10
; AVX2-FP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm10 = ymm9[0,1,2],ymm10[3,4,5,6,7],ymm9[8,9,10],ymm10[11,12,13,14,15]
-; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,4,7]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7],ymm9[8,9,10],ymm10[11,12,13,14,15]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm10
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm10[1],xmm6[2,3],xmm10[4],xmm6[5,6],xmm10[7]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11]
+; AVX2-FP-NEXT: vpshufb %xmm10, %xmm6, %xmm6
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7]
+; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm4, %ymm5, %ymm7
+; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm9
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm9 = xmm7[0],xmm9[1],xmm7[2,3],xmm9[4],xmm7[5,6],xmm9[7]
+; AVX2-FP-NEXT: vpshufb %xmm10, %xmm9, %xmm10
+; AVX2-FP-NEXT: vpshufb %ymm8, %ymm7, %ymm7
+; AVX2-FP-NEXT: vmovdqa 160(%rdi), %xmm8
+; AVX2-FP-NEXT: vmovdqa 176(%rdi), %xmm9
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm12 = xmm8[0,1],xmm9[2],xmm8[3,4],xmm9[5],xmm8[6,7]
+; AVX2-FP-NEXT: vpshufb %xmm11, %xmm12, %xmm11
+; AVX2-FP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3,4,5,6,7],ymm7[8,9,10],ymm11[11,12,13,14,15]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3],ymm7[4,5,6,7]
; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
-; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm4, %ymm2, %ymm10
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7,8,9],ymm12[10],ymm10[11,12],ymm12[13],ymm10[14,15]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
-; AVX2-FP-NEXT: vpshufb %ymm12, %ymm10, %ymm10
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm13 = xmm5[0,1],xmm6[2],xmm5[3,4],xmm6[5],xmm5[6,7]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm14 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
-; AVX2-FP-NEXT: vpshufb %xmm14, %xmm13, %xmm13
-; AVX2-FP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm13 = ymm10[0,1,2],ymm13[3,4,5,6,7],ymm10[8,9,10],ymm13[11,12,13,14,15]
-; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,6,7,4]
+; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm1, %ymm0, %ymm10
+; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm12 = [18,19,24,25,30,31,0,0,18,19,24,25,30,31,0,0,18,19,24,25,30,31,0,0,18,19,24,25,30,31,0,0]
+; AVX2-FP-NEXT: vpshufb %ymm12, %ymm10, %ymm13
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm14 = xmm3[0,1],xmm2[2],xmm3[3,4],xmm2[5],xmm3[6,7]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm15 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
+; AVX2-FP-NEXT: vpshufb %xmm15, %xmm14, %xmm14
+; AVX2-FP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7],ymm13[8,9,10],ymm14[11,12,13,14,15]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm10, %xmm14
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm14[2],xmm10[3,4],xmm14[5],xmm10[6,7]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm14 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
+; AVX2-FP-NEXT: vpshufb %xmm14, %xmm10, %xmm10
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7]
-; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm1, %ymm0, %ymm11
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm11[2,3,0,1]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm13[2],ymm11[3,4],ymm13[5],ymm11[6,7,8,9],ymm13[10],ymm11[11,12],ymm13[13],ymm11[14,15]
+; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm5, %ymm4, %ymm11
+; AVX2-FP-NEXT: vextracti128 $1, %ymm11, %xmm13
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm13 = xmm11[0,1],xmm13[2],xmm11[3,4],xmm13[5],xmm11[6,7]
+; AVX2-FP-NEXT: vpshufb %xmm14, %xmm13, %xmm13
; AVX2-FP-NEXT: vpshufb %ymm12, %ymm11, %ymm11
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm12 = xmm7[0,1],xmm8[2],xmm7[3,4],xmm8[5],xmm7[6,7]
-; AVX2-FP-NEXT: vpshufb %xmm14, %xmm12, %xmm12
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm12 = xmm9[0,1],xmm8[2],xmm9[3,4],xmm8[5],xmm9[6,7]
+; AVX2-FP-NEXT: vpshufb %xmm15, %xmm12, %xmm12
; AVX2-FP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0,1,2],ymm12[3,4,5,6,7],ymm11[8,9,10],ymm12[11,12,13,14,15]
-; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,6,7,4]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3,4,5,6,7],ymm11[8,9,10],ymm12[11,12,13,14,15]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7]
; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0]
-; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm4, %ymm2, %ymm2
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1,2],ymm4[3],ymm2[4,5],ymm4[6],ymm2[7],ymm4[8],ymm2[9,10],ymm4[11],ymm2[12,13],ymm4[14],ymm2[15]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
-; AVX2-FP-NEXT: vpshufb %ymm4, %ymm2, %ymm2
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6],xmm5[7]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
-; AVX2-FP-NEXT: vpshufb %xmm6, %xmm5, %xmm5
-; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7]
+; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm5, %ymm4, %ymm4
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7],ymm5[8],ymm4[9,10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
+; AVX2-FP-NEXT: vpshufb %ymm5, %ymm4, %ymm4
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3],xmm9[4],xmm8[5,6],xmm9[7]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
+; AVX2-FP-NEXT: vpshufb %xmm9, %xmm8, %xmm8
+; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm8[5,6,7]
; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm0
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
-; AVX2-FP-NEXT: vpshufb %ymm4, %ymm0, %ymm0
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0],xmm7[1],xmm8[2,3],xmm7[4],xmm8[5,6],xmm7[7]
-; AVX2-FP-NEXT: vpshufb %xmm6, %xmm1, %xmm1
+; AVX2-FP-NEXT: vpshufb %ymm5, %ymm0, %ymm0
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6],xmm3[7]
+; AVX2-FP-NEXT: vpshufb %xmm9, %xmm1, %xmm1
; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-FP-NEXT: vmovdqa %ymm3, 32(%rsi)
-; AVX2-FP-NEXT: vmovdqa %ymm9, (%rsi)
-; AVX2-FP-NEXT: vmovdqa %ymm10, 32(%rdx)
-; AVX2-FP-NEXT: vmovdqa %ymm11, (%rdx)
-; AVX2-FP-NEXT: vmovdqa %ymm2, 32(%rcx)
+; AVX2-FP-NEXT: vmovdqa %ymm7, 32(%rsi)
+; AVX2-FP-NEXT: vmovdqa %ymm6, (%rsi)
+; AVX2-FP-NEXT: vmovdqa %ymm11, 32(%rdx)
+; AVX2-FP-NEXT: vmovdqa %ymm10, (%rdx)
+; AVX2-FP-NEXT: vmovdqa %ymm4, 32(%rcx)
; AVX2-FP-NEXT: vmovdqa %ymm0, (%rcx)
; AVX2-FP-NEXT: vzeroupper
; AVX2-FP-NEXT: retq
;
; AVX2-FCP-LABEL: load_i16_stride3_vf32:
; AVX2-FCP: # %bb.0:
+; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm4
+; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm5
; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm2
-; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm4
; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
-; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm2, %ymm4, %ymm3
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm3[2,3,0,1]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6],ymm5[7],ymm3[8],ymm5[9],ymm3[10,11],ymm5[12],ymm3[13,14],ymm5[15]
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
-; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm3
-; AVX2-FCP-NEXT: vmovdqa 176(%rdi), %xmm5
-; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %xmm6
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm6[0,1],xmm5[2],xmm6[3,4],xmm5[5],xmm6[6,7]
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
-; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm9, %xmm9
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm3[0,1,2],ymm9[3,4,5,6,7],ymm3[8,9,10],ymm9[11,12,13,14,15]
-; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm7
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm7[2,3,0,1]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5,6],ymm9[7],ymm7[8],ymm9[9],ymm7[10,11],ymm9[12],ymm7[13,14],ymm9[15]
-; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm9
-; AVX2-FCP-NEXT: vmovdqa 80(%rdi), %xmm7
-; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm8
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm8[0,1],xmm7[2],xmm8[3,4],xmm7[5],xmm8[6,7]
-; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm11, %xmm10
+; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm6
+; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm8 = [16,17,22,23,28,29,0,0,16,17,22,23,28,29,0,0,16,17,22,23,28,29,0,0,16,17,22,23,28,29,0,0]
+; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm6, %ymm9
+; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm2
+; AVX2-FCP-NEXT: vmovdqa 80(%rdi), %xmm3
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6,7]
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
+; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm10, %xmm10
; AVX2-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm9[0,1,2],ymm10[3,4,5,6,7],ymm9[8,9,10],ymm10[11,12,13,14,15]
-; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,4,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7],ymm9[8,9,10],ymm10[11,12,13,14,15]
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm10
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm10[1],xmm6[2,3],xmm10[4],xmm6[5,6],xmm10[7]
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11]
+; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm6, %xmm6
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7]
+; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm4, %ymm5, %ymm7
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm9
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm7[0],xmm9[1],xmm7[2,3],xmm9[4],xmm7[5,6],xmm9[7]
+; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm9, %xmm10
+; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm7
+; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %xmm8
+; AVX2-FCP-NEXT: vmovdqa 176(%rdi), %xmm9
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm8[0,1],xmm9[2],xmm8[3,4],xmm9[5],xmm8[6,7]
+; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm12, %xmm11
+; AVX2-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3,4,5,6,7],ymm7[8,9,10],ymm11[11,12,13,14,15]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3],ymm7[4,5,6,7]
; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
-; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm4, %ymm2, %ymm10
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7,8,9],ymm12[10],ymm10[11,12],ymm12[13],ymm10[14,15]
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
-; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm10, %ymm10
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm5[0,1],xmm6[2],xmm5[3,4],xmm6[5],xmm5[6,7]
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
-; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm13, %xmm13
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm10[0,1,2],ymm13[3,4,5,6,7],ymm10[8,9,10],ymm13[11,12,13,14,15]
-; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,6,7,4]
+; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm1, %ymm0, %ymm10
+; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm12 = [18,19,24,25,30,31,0,0,18,19,24,25,30,31,0,0,18,19,24,25,30,31,0,0,18,19,24,25,30,31,0,0]
+; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm10, %ymm13
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm3[0,1],xmm2[2],xmm3[3,4],xmm2[5],xmm3[6,7]
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
+; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm14, %xmm14
+; AVX2-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7],ymm13[8,9,10],ymm14[11,12,13,14,15]
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm10, %xmm14
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm14[2],xmm10[3,4],xmm14[5],xmm10[6,7]
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
+; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm10, %xmm10
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7]
-; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm1, %ymm0, %ymm11
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm11[2,3,0,1]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm13[2],ymm11[3,4],ymm13[5],ymm11[6,7,8,9],ymm13[10],ymm11[11,12],ymm13[13],ymm11[14,15]
+; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm5, %ymm4, %ymm11
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm11, %xmm13
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm11[0,1],xmm13[2],xmm11[3,4],xmm13[5],xmm11[6,7]
+; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm13, %xmm13
; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm11, %ymm11
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm7[0,1],xmm8[2],xmm7[3,4],xmm8[5],xmm7[6,7]
-; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm12
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm9[0,1],xmm8[2],xmm9[3,4],xmm8[5],xmm9[6,7]
+; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm12, %xmm12
; AVX2-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0,1,2],ymm12[3,4,5,6,7],ymm11[8,9,10],ymm12[11,12,13,14,15]
-; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,6,7,4]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3,4,5,6,7],ymm11[8,9,10],ymm12[11,12,13,14,15]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7]
; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0]
-; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm4, %ymm2, %ymm2
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1,2],ymm4[3],ymm2[4,5],ymm4[6],ymm2[7],ymm4[8],ymm2[9,10],ymm4[11],ymm2[12,13],ymm4[14],ymm2[15]
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
-; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6],xmm5[7]
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
-; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm5
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7]
+; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm5, %ymm4, %ymm4
+; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7],ymm5[8],ymm4[9,10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15]
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
+; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm4
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3],xmm9[4],xmm8[5,6],xmm9[7]
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
+; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm8, %xmm8
+; AVX2-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm8[5,6,7]
; AVX2-FCP-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm0
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
-; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0],xmm7[1],xmm8[2,3],xmm7[4],xmm8[5,6],xmm7[7]
-; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1
+; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm0
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6],xmm3[7]
+; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm1
; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-FCP-NEXT: vmovdqa %ymm3, 32(%rsi)
-; AVX2-FCP-NEXT: vmovdqa %ymm9, (%rsi)
-; AVX2-FCP-NEXT: vmovdqa %ymm10, 32(%rdx)
-; AVX2-FCP-NEXT: vmovdqa %ymm11, (%rdx)
-; AVX2-FCP-NEXT: vmovdqa %ymm2, 32(%rcx)
+; AVX2-FCP-NEXT: vmovdqa %ymm7, 32(%rsi)
+; AVX2-FCP-NEXT: vmovdqa %ymm6, (%rsi)
+; AVX2-FCP-NEXT: vmovdqa %ymm11, 32(%rdx)
+; AVX2-FCP-NEXT: vmovdqa %ymm10, (%rdx)
+; AVX2-FCP-NEXT: vmovdqa %ymm4, 32(%rcx)
; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rcx)
; AVX2-FCP-NEXT: vzeroupper
; AVX2-FCP-NEXT: retq
@@ -1813,55 +1818,54 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7]
; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u]
; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1,2],ymm3[3,4,5,6,7]
-; AVX512-NEXT: vmovdqa (%rdi), %ymm8
-; AVX512-NEXT: vmovdqa 32(%rdi), %ymm9
-; AVX512-NEXT: vmovdqa %ymm0, %ymm3
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm9 ^ (ymm3 & (ymm8 ^ ymm9))
-; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14],ymm4[15]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = ymm3[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
+; AVX512-NEXT: vmovdqa 32(%rdi), %ymm8
+; AVX512-NEXT: vmovdqa (%rdi), %ymm9
+; AVX512-NEXT: vmovdqa %ymm0, %ymm10
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = ymm8 ^ (ymm10 & (ymm9 ^ ymm8))
+; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,28,29,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vmovdqa 80(%rdi), %xmm3
; AVX512-NEXT: vmovdqa 64(%rdi), %xmm4
-; AVX512-NEXT: vpblendw {{.*#+}} xmm11 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11]
-; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm10[0,1,2],ymm11[3,4,5,6,7],ymm10[8,9,10],ymm11[11,12,13,14,15]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,5,4,7]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm12 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11]
+; AVX512-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3,4,5,6,7],ymm11[8,9,10],ymm12[11,12,13,14,15]
+; AVX512-NEXT: vmovdqa 16(%rdi), %xmm12
+; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm12[1],xmm10[2,3],xmm12[4],xmm10[5,6],xmm12[7]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11]
; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
-; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm10, %zmm7
+; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm10, %zmm16
; AVX512-NEXT: vmovdqa %ymm0, %ymm10
; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = ymm6 ^ (ymm10 & (ymm5 ^ ymm6))
; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1]
; AVX512-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7,8,9],ymm11[10],ymm10[11,12],ymm11[13],ymm10[14,15]
; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
; AVX512-NEXT: vpshufb %ymm11, %ymm10, %ymm10
-; AVX512-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm10[5,6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512-NEXT: vmovdqa %ymm12, %ymm13
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm13 = ymm8 ^ (ymm13 & (ymm9 ^ ymm8))
-; AVX512-NEXT: vpermq {{.*#+}} ymm14 = ymm13[2,3,0,1]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7,8,9],ymm14[10],ymm13[11,12],ymm14[13],ymm13[14,15]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm14 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7]
-; AVX512-NEXT: vpshufb %xmm11, %xmm14, %xmm11
-; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0,1,2],ymm11[3,4,5,6,7],ymm13[8,9,10],ymm11[11,12,13,14,15]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,5,6,7,4]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7]
-; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = ymm5 ^ (ymm12 & (ymm6 ^ ymm5))
-; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm12[2,3,0,1]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1,2],ymm5[3],ymm12[4,5],ymm5[6],ymm12[7],ymm5[8],ymm12[9,10],ymm5[11],ymm12[12,13],ymm5[14],ymm12[15]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm13 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4],xmm10[5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3],ymm10[4,5,6,7]
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512-NEXT: vmovdqa %ymm13, %ymm14
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm14 = ymm9 ^ (ymm14 & (ymm8 ^ ymm9))
+; AVX512-NEXT: vpshufb {{.*#+}} ymm15 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25,30,31,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7]
+; AVX512-NEXT: vpshufb %xmm11, %xmm7, %xmm7
+; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm15[0,1,2],ymm7[3,4,5,6,7],ymm15[8,9,10],ymm7[11,12,13,14,15]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm11 = xmm14[0,1],xmm12[2],xmm14[3,4],xmm12[5],xmm14[6,7]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7]
+; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm7, %zmm7
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm13 = ymm5 ^ (ymm13 & (ymm6 ^ ymm5))
+; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm13[2,3,0,1]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm13[1,2],ymm5[3],ymm13[4,5],ymm5[6],ymm13[7],ymm5[8],ymm13[9,10],ymm5[11],ymm13[12,13],ymm5[14],ymm13[15]
; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
; AVX512-NEXT: vpshufb %ymm6, %ymm5, %ymm5
; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7]
; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm8 ^ (ymm0 & (ymm9 ^ ymm8))
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm9 ^ (ymm0 & (ymm8 ^ ymm9))
; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7],ymm2[8],ymm0[9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15]
; AVX512-NEXT: vpshufb %ymm6, %ymm0, %ymm0
@@ -1870,8 +1874,8 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-NEXT: vmovdqa64 %zmm7, (%rsi)
-; AVX512-NEXT: vmovdqa64 %zmm10, (%rdx)
+; AVX512-NEXT: vmovdqa64 %zmm16, (%rsi)
+; AVX512-NEXT: vmovdqa64 %zmm7, (%rdx)
; AVX512-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -1879,77 +1883,78 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-FCP-LABEL: load_i16_stride3_vf32:
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
-; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm5
-; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm6
-; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm1
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm5 ^ (ymm1 & (ymm6 ^ ymm5))
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27]
-; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm1
-; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm2
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1,2],ymm3[3,4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm8
-; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm9
-; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm3
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm9 ^ (ymm3 & (ymm8 ^ ymm9))
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14],ymm4[15]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm3[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
+; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm2
+; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm6
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm1 ^ (ymm6 & (ymm2 ^ ymm1))
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,28,29,u,u,u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm3
; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm4
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11]
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm10[0,1,2],ymm11[3,4,5,6,7],ymm10[8,9,10],ymm11[11,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,5,4,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm10, %zmm7
-; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm10
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm6 ^ (ymm10 & (ymm5 ^ ymm6))
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7,8,9],ymm11[10],ymm10[11,12],ymm11[13],ymm10[14,15]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
-; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm10
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm10[5,6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm13
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm8 ^ (ymm13 & (ymm9 ^ ymm8))
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm13[2,3,0,1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7,8,9],ymm14[10],ymm13[11,12],ymm14[13],ymm13[14,15]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7]
-; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm14, %xmm11
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0,1,2],ymm11[3,4,5,6,7],ymm13[8,9,10],ymm11[11,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,5,6,7,4]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11]
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0,1,2],ymm7[3,4,5,6,7],ymm5[8,9,10],ymm7[11,12,13,14,15]
+; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm5
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6],xmm5[7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3],ymm7[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm9
+; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm10
+; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm6
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm9 ^ (ymm6 & (ymm10 ^ ymm9))
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5,6],ymm7[7],ymm6[8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13,14],ymm7[15]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [24,25,26,3,4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm6
+; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm7
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm7[0],xmm6[1],xmm7[2,3],xmm6[4],xmm7[5,6],xmm6[7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u]
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm13, %zmm0, %zmm13
+; AVX512-FCP-NEXT: vpermi2d %zmm13, %zmm11, %zmm12
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm8, %zmm16
+; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm11
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm10 ^ (ymm11 & (ymm9 ^ ymm10))
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7,8,9],ymm12[10],ymm11[11,12],ymm12[13],ymm11[14,15]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
+; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm11, %ymm11
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm7[0,1],xmm6[2],xmm7[3,4],xmm6[5],xmm7[6,7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4],xmm11[5,6,7]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm5 ^ (ymm12 & (ymm6 ^ ymm5))
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm12[2,3,0,1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1,2],ymm5[3],ymm12[4,5],ymm5[6],ymm12[7],ymm5[8],ymm12[9,10],ymm5[11],ymm12[12,13],ymm5[14],ymm12[15]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
-; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm5
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm8 ^ (ymm0 & (ymm9 ^ ymm8))
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7],ymm2[8],ymm0[9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15]
-; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7]
-; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm2
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-FCP-NEXT: vmovdqa64 %zmm7, (%rsi)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm10, (%rdx)
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512-FCP-NEXT: vmovdqa %ymm13, %ymm14
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = ymm2 ^ (ymm14 & (ymm1 ^ ymm2))
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25,30,31,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7]
+; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm8, %xmm8
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm15[0,1,2],ymm8[3,4,5,6,7],ymm15[8,9,10],ymm8[11,12,13,14,15]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm14[0,1],xmm5[2],xmm14[3,4],xmm5[5],xmm14[6,7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm5, %zmm5
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm9 ^ (ymm13 & (ymm10 ^ ymm9))
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm13[2,3,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm13[1,2],ymm8[3],ymm13[4,5],ymm8[6],ymm13[7],ymm8[8],ymm13[9,10],ymm8[11],ymm13[12,13],ymm8[14],ymm13[15]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
+; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm8, %ymm8
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2],xmm6[3,4],xmm7[5],xmm6[6,7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm8[5,6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm0 & (ymm1 ^ ymm2))
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
+; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7]
+; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm1
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0
+; AVX512-FCP-NEXT: vmovdqa64 %zmm16, (%rsi)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rdx)
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
@@ -1969,55 +1974,54 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1,2],ymm3[3,4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm8
-; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm9
-; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm3
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm9 ^ (ymm3 & (ymm8 ^ ymm9))
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14],ymm4[15]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = ymm3[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
+; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm8
+; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm9
+; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm10
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = ymm8 ^ (ymm10 & (ymm9 ^ ymm8))
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,28,29,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm3
; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm4
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm10[0,1,2],ymm11[3,4,5,6,7],ymm10[8,9,10],ymm11[11,12,13,14,15]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,5,4,7]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3,4,5,6,7],ymm11[8,9,10],ymm12[11,12,13,14,15]
+; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm12
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm12[1],xmm10[2,3],xmm12[4],xmm10[5,6],xmm12[7]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm10, %zmm7
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm10, %zmm16
; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm10
; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = ymm6 ^ (ymm10 & (ymm5 ^ ymm6))
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1]
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7,8,9],ymm11[10],ymm10[11,12],ymm11[13],ymm10[14,15]
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
; AVX512DQ-NEXT: vpshufb %ymm11, %ymm10, %ymm10
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm10[5,6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512DQ-NEXT: vmovdqa %ymm12, %ymm13
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm13 = ymm8 ^ (ymm13 & (ymm9 ^ ymm8))
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm14 = ymm13[2,3,0,1]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7,8,9],ymm14[10],ymm13[11,12],ymm14[13],ymm13[14,15]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm14 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7]
-; AVX512DQ-NEXT: vpshufb %xmm11, %xmm14, %xmm11
-; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0,1,2],ymm11[3,4,5,6,7],ymm13[8,9,10],ymm11[11,12,13,14,15]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,5,6,7,4]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = ymm5 ^ (ymm12 & (ymm6 ^ ymm5))
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm12[2,3,0,1]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1,2],ymm5[3],ymm12[4,5],ymm5[6],ymm12[7],ymm5[8],ymm12[9,10],ymm5[11],ymm12[12,13],ymm5[14],ymm12[15]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm13 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4],xmm10[5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512DQ-NEXT: vmovdqa %ymm13, %ymm14
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm14 = ymm9 ^ (ymm14 & (ymm8 ^ ymm9))
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm15 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25,30,31,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7]
+; AVX512DQ-NEXT: vpshufb %xmm11, %xmm7, %xmm7
+; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm15[0,1,2],ymm7[3,4,5,6,7],ymm15[8,9,10],ymm7[11,12,13,14,15]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm14[0,1],xmm12[2],xmm14[3,4],xmm12[5],xmm14[6,7]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm7, %zmm7
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm13 = ymm5 ^ (ymm13 & (ymm6 ^ ymm5))
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm13[2,3,0,1]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm13[1,2],ymm5[3],ymm13[4,5],ymm5[6],ymm13[7],ymm5[8],ymm13[9,10],ymm5[11],ymm13[12,13],ymm5[14],ymm13[15]
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
; AVX512DQ-NEXT: vpshufb %ymm6, %ymm5, %ymm5
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm8 ^ (ymm0 & (ymm9 ^ ymm8))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm9 ^ (ymm0 & (ymm8 ^ ymm9))
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7],ymm2[8],ymm0[9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15]
; AVX512DQ-NEXT: vpshufb %ymm6, %ymm0, %ymm0
@@ -2026,8 +2030,8 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vmovdqa64 %zmm7, (%rsi)
-; AVX512DQ-NEXT: vmovdqa64 %zmm10, (%rdx)
+; AVX512DQ-NEXT: vmovdqa64 %zmm16, (%rsi)
+; AVX512DQ-NEXT: vmovdqa64 %zmm7, (%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
@@ -2035,77 +2039,78 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FCP-LABEL: load_i16_stride3_vf32:
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
-; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm5
-; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm6
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm5 ^ (ymm1 & (ymm6 ^ ymm5))
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27]
-; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm1
-; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm2
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1,2],ymm3[3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm8
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm9
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm3
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm9 ^ (ymm3 & (ymm8 ^ ymm9))
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14],ymm4[15]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm3[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm2
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm6
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm1 ^ (ymm6 & (ymm2 ^ ymm1))
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,28,29,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm3
; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm4
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11]
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm10[0,1,2],ymm11[3,4,5,6,7],ymm10[8,9,10],ymm11[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,5,4,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm10, %zmm7
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm10
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm6 ^ (ymm10 & (ymm5 ^ ymm6))
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7,8,9],ymm11[10],ymm10[11,12],ymm11[13],ymm10[14,15]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm10
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm10[5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm13
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm8 ^ (ymm13 & (ymm9 ^ ymm8))
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm13[2,3,0,1]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7,8,9],ymm14[10],ymm13[11,12],ymm14[13],ymm13[14,15]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm14, %xmm11
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0,1,2],ymm11[3,4,5,6,7],ymm13[8,9,10],ymm11[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,5,6,7,4]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,2,3,8,9,14,15,4,5,10,11]
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0,1,2],ymm7[3,4,5,6,7],ymm5[8,9,10],ymm7[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm5
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm5[1],xmm6[2,3],xmm5[4],xmm6[5,6],xmm5[7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm9
+; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm10
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm6
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm9 ^ (ymm6 & (ymm10 ^ ymm9))
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5,6],ymm7[7],ymm6[8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13,14],ymm7[15]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [24,25,26,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm6
+; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm7
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm7[0],xmm6[1],xmm7[2,3],xmm6[4],xmm7[5,6],xmm6[7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm13, %zmm0, %zmm13
+; AVX512DQ-FCP-NEXT: vpermi2d %zmm13, %zmm11, %zmm12
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm8, %zmm16
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm11
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm10 ^ (ymm11 & (ymm9 ^ ymm10))
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7,8,9],ymm12[10],ymm11[11,12],ymm12[13],ymm11[14,15]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm11, %ymm11
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm7[0,1],xmm6[2],xmm7[3,4],xmm6[5],xmm7[6,7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4],xmm11[5,6,7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm5 ^ (ymm12 & (ymm6 ^ ymm5))
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm12[2,3,0,1]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1,2],ymm5[3],ymm12[4,5],ymm5[6],ymm12[7],ymm5[8],ymm12[9,10],ymm5[11],ymm12[12,13],ymm5[14],ymm12[15]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm5
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm8 ^ (ymm0 & (ymm9 ^ ymm8))
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7],ymm2[8],ymm0[9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, (%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm13, %ymm14
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = ymm2 ^ (ymm14 & (ymm1 ^ ymm2))
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25,30,31,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm8, %xmm8
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm15[0,1,2],ymm8[3,4,5,6,7],ymm15[8,9,10],ymm8[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm14[0,1],xmm5[2],xmm14[3,4],xmm5[5],xmm14[6,7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm5, %zmm5
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm9 ^ (ymm13 & (ymm10 ^ ymm9))
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm13[2,3,0,1]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm13[1,2],ymm8[3],ymm13[4,5],ymm8[6],ymm13[7],ymm8[8],ymm13[9,10],ymm8[11],ymm13[12,13],ymm8[14],ymm13[15]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm8, %ymm8
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2],xmm6[3,4],xmm7[5],xmm6[6,7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm8[5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm0 & (ymm1 ^ ymm2))
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, (%rsi)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
@@ -2693,896 +2698,939 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
;
; AVX-LABEL: load_i16_stride3_vf64:
; AVX: # %bb.0:
-; AVX-NEXT: subq $408, %rsp # imm = 0x198
-; AVX-NEXT: vmovdqa 176(%rdi), %xmm6
-; AVX-NEXT: vmovdqa 160(%rdi), %xmm5
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1],xmm6[2],xmm5[3,4],xmm6[5],xmm5[6,7]
+; AVX-NEXT: subq $488, %rsp # imm = 0x1E8
+; AVX-NEXT: vmovdqa 176(%rdi), %xmm5
+; AVX-NEXT: vmovdqa 160(%rdi), %xmm13
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1],xmm5[2],xmm13[3,4],xmm5[5],xmm13[6,7]
+; AVX-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovdqa 144(%rdi), %xmm2
-; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
+; AVX-NEXT: vmovdqa 144(%rdi), %xmm7
+; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[0,3,2,3,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4,5,6,7]
-; AVX-NEXT: vmovdqa 112(%rdi), %xmm10
-; AVX-NEXT: vmovdqa 96(%rdi), %xmm7
-; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0],xmm10[1],xmm7[2,3],xmm10[4],xmm7[5,6],xmm10[7]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u]
-; AVX-NEXT: vpshufb %xmm8, %xmm3, %xmm3
-; AVX-NEXT: vmovdqa 128(%rdi), %xmm0
+; AVX-NEXT: vmovdqa 112(%rdi), %xmm0
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,2,1]
+; AVX-NEXT: vmovdqa 96(%rdi), %xmm14
+; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm14[0],xmm0[1],xmm14[2,3],xmm0[4],xmm14[5,6],xmm0[7]
+; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u]
+; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm3
+; AVX-NEXT: vmovdqa 128(%rdi), %xmm4
+; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5]
; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm4[6,7]
-; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovdqa 368(%rdi), %xmm14
-; AVX-NEXT: vmovdqa 352(%rdi), %xmm2
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovdqa 368(%rdi), %xmm2
; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm14[2],xmm2[3,4],xmm14[5],xmm2[6,7]
-; AVX-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovdqa 352(%rdi), %xmm15
+; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm15[0,1],xmm2[2],xmm15[3,4],xmm2[5],xmm15[6,7]
+; AVX-NEXT: vmovdqa %xmm15, (%rsp) # 16-byte Spill
; AVX-NEXT: vpshufb %xmm1, %xmm2, %xmm2
-; AVX-NEXT: vmovdqa 336(%rdi), %xmm9
-; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[0,3,2,3,4,5,6,7]
+; AVX-NEXT: vmovdqa 336(%rdi), %xmm3
+; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3]
; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5,6,7]
-; AVX-NEXT: vmovdqa 304(%rdi), %xmm0
-; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovdqa 304(%rdi), %xmm4
+; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovdqa 288(%rdi), %xmm3
; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1],xmm3[2,3],xmm0[4],xmm3[5,6],xmm0[7]
-; AVX-NEXT: vpshufb %xmm8, %xmm3, %xmm3
-; AVX-NEXT: vmovdqa 320(%rdi), %xmm0
-; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,2,1]
+; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6],xmm4[7]
+; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm3
+; AVX-NEXT: vmovdqa 320(%rdi), %xmm4
+; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5]
; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm4[6,7]
-; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovdqa 272(%rdi), %xmm0
-; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovdqa 272(%rdi), %xmm3
+; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovdqa 256(%rdi), %xmm2
; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6,7]
; AVX-NEXT: vpshufb %xmm1, %xmm2, %xmm2
-; AVX-NEXT: vmovdqa 240(%rdi), %xmm0
-; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,3,2,3,4,5,6,7]
+; AVX-NEXT: vmovdqa 240(%rdi), %xmm3
+; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3]
; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5,6,7]
-; AVX-NEXT: vmovdqa 208(%rdi), %xmm0
-; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovdqa 192(%rdi), %xmm3
+; AVX-NEXT: vmovdqa 208(%rdi), %xmm3
; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1],xmm3[2,3],xmm0[4],xmm3[5,6],xmm0[7]
-; AVX-NEXT: vpshufb %xmm8, %xmm3, %xmm3
-; AVX-NEXT: vmovdqa 224(%rdi), %xmm0
-; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,2,1]
+; AVX-NEXT: vmovdqa 192(%rdi), %xmm4
+; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7]
+; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm3
+; AVX-NEXT: vmovdqa 224(%rdi), %xmm4
+; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5]
; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm4[6,7]
-; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovdqa 80(%rdi), %xmm13
-; AVX-NEXT: vmovdqa 64(%rdi), %xmm11
-; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm11[0,1],xmm13[2],xmm11[3,4],xmm13[5],xmm11[6,7]
-; AVX-NEXT: vmovdqa %xmm11, (%rsp) # 16-byte Spill
-; AVX-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovdqa 80(%rdi), %xmm6
+; AVX-NEXT: vmovdqa 64(%rdi), %xmm10
+; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0,1],xmm6[2],xmm10[3,4],xmm6[5],xmm10[6,7]
+; AVX-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpshufb %xmm1, %xmm2, %xmm1
-; AVX-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,3,2,3,4,5,6,7]
+; AVX-NEXT: vmovdqa 48(%rdi), %xmm4
+; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,3,2,3,4,5,6,7]
+; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
-; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7]
-; AVX-NEXT: vmovdqa (%rdi), %xmm15
-; AVX-NEXT: vmovdqa 16(%rdi), %xmm4
-; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0],xmm4[1],xmm15[2,3],xmm4[4],xmm15[5,6],xmm4[7]
-; AVX-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpshufb %xmm8, %xmm1, %xmm0
-; AVX-NEXT: vmovdqa 32(%rdi), %xmm1
-; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[0,1,2,1]
+; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm2[0,1,2],xmm1[3,4,5,6,7]
+; AVX-NEXT: vmovdqa (%rdi), %xmm1
+; AVX-NEXT: vmovdqa 16(%rdi), %xmm3
+; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6],xmm3[7]
+; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpshufb %xmm0, %xmm11, %xmm11
+; AVX-NEXT: vmovdqa 32(%rdi), %xmm0
+; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[0,1,2,1]
; AVX-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,6,5]
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm12[6,7]
-; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5],xmm12[6,7]
+; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm11, %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1],xmm5[2],xmm6[3,4],xmm5[5],xmm6[6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm5[0,1],xmm13[2],xmm5[3,4],xmm13[5],xmm5[6,7]
+; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
+; AVX-NEXT: vpshufb %xmm5, %xmm11, %xmm11
+; AVX-NEXT: vmovq {{.*#+}} xmm9 = [2,3,8,9,14,15,0,0,0,0,0,0,0,0,0,0]
+; AVX-NEXT: vpshufb %xmm9, %xmm7, %xmm13
+; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0,1,2],xmm11[3,4,5,6,7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3],xmm8[4],xmm0[5,6],xmm8[7]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
-; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm7[0,1],xmm10[2],xmm7[3,4],xmm10[5],xmm7[6,7]
-; AVX-NEXT: vpblendw $73, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload
-; AVX-NEXT: # xmm12 = mem[0],xmm12[1,2],mem[3],xmm12[4,5],mem[6],xmm12[7]
-; AVX-NEXT: vpshufb %xmm2, %xmm12, %xmm12
-; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm0
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vpblendw $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = xmm14[0,1],mem[2],xmm14[3,4],mem[5],xmm14[6,7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm9[1],xmm0[2,3],xmm9[4],xmm0[5,6],xmm9[7]
-; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
-; AVX-NEXT: vpblendw $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload
-; AVX-NEXT: # xmm12 = xmm12[0,1],mem[2],xmm12[3,4],mem[5],xmm12[6,7]
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0],xmm12[1,2],xmm14[3],xmm12[4,5],xmm14[6],xmm12[7]
-; AVX-NEXT: vpshufb %xmm2, %xmm12, %xmm12
-; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm0
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1],xmm11[2],xmm13[3,4],xmm11[5],xmm13[6,7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6],xmm3[7]
-; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0,1],xmm4[2],xmm15[3,4],xmm4[5],xmm15[6,7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm1[0],xmm12[1,2],xmm1[3],xmm12[4,5],xmm1[6],xmm12[7]
-; AVX-NEXT: vpshufb %xmm2, %xmm12, %xmm12
-; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm0
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0,1],xmm13[2],xmm11[3,4],xmm13[5],xmm11[6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0,1],xmm8[2],xmm14[3,4],xmm8[5],xmm14[6,7]
+; AVX-NEXT: vpblendw $73, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
+; AVX-NEXT: # xmm13 = mem[0],xmm13[1,2],mem[3],xmm13[4,5],mem[6],xmm13[7]
+; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
+; AVX-NEXT: vpshufb %xmm0, %xmm13, %xmm13
+; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm13, %ymm2
+; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm2[0,1],xmm15[2],xmm2[3,4],xmm15[5],xmm2[6,7]
+; AVX-NEXT: vpshufb %xmm5, %xmm11, %xmm11
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX-NEXT: vpshufb %xmm9, %xmm5, %xmm13
+; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0,1,2],xmm11[3,4,5,6,7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm15[1],xmm0[2,3],xmm15[4],xmm0[5,6],xmm15[7]
-; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
-; AVX-NEXT: vpblendw $219, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload
-; AVX-NEXT: # xmm12 = mem[0,1],xmm12[2],mem[3,4],xmm12[5],mem[6,7]
-; AVX-NEXT: vpblendw $73, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload
-; AVX-NEXT: # xmm12 = mem[0],xmm12[1,2],mem[3],xmm12[4,5],mem[6],xmm12[7]
-; AVX-NEXT: vpshufb %xmm2, %xmm12, %xmm2
-; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1],xmm12[2],xmm15[3,4],xmm12[5],xmm15[6,7]
+; AVX-NEXT: vpblendw $73, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
+; AVX-NEXT: # xmm13 = mem[0],xmm13[1,2],mem[3],xmm13[4,5],mem[6],xmm13[7]
+; AVX-NEXT: vpshufb %xmm0, %xmm13, %xmm13
+; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm13, %ymm11
+; AVX-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm6[0,1],xmm10[2],xmm6[3,4],xmm10[5],xmm6[6,7]
+; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
+; AVX-NEXT: vpshufb %xmm9, %xmm4, %xmm13
+; AVX-NEXT: vmovdqa %xmm9, %xmm4
+; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0,1,2],xmm11[3,4,5,6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm1[0,1],xmm3[2],xmm1[3,4],xmm3[5],xmm1[6,7]
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm1[0],xmm13[1,2],xmm1[3],xmm13[4,5],xmm1[6],xmm13[7]
+; AVX-NEXT: vpshufb %xmm0, %xmm13, %xmm13
+; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm13, %ymm3
+; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm3[0,1],xmm6[2],xmm3[3,4],xmm6[5],xmm3[6,7]
+; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm11[4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX-NEXT: vpshufb %xmm4, %xmm10, %xmm11
+; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3,4,5,6,7]
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; AVX-NEXT: vpblendw $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload
+; AVX-NEXT: # xmm11 = xmm11[0,1],mem[2],xmm11[3,4],mem[5],xmm11[6,7]
+; AVX-NEXT: vpblendw $73, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload
+; AVX-NEXT: # xmm11 = mem[0],xmm11[1,2],mem[3],xmm11[4,5],mem[6],xmm11[7]
+; AVX-NEXT: vpshufb %xmm0, %xmm11, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0],xmm6[1],xmm5[2,3],xmm6[4],xmm5[5,6],xmm6[7]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,u,u,0,1,6,7,12,13,2,3,8,9,14,15]
-; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm8[2,1,2,3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[2,1,2,3,4,5,6,7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0,1],xmm0[2,3,4,5,6,7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm10[0,1],xmm7[2],xmm10[3,4],xmm7[5],xmm10[6,7]
-; AVX-NEXT: vpblendw $146, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload
-; AVX-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3],mem[4],xmm12[5,6],mem[7]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15]
-; AVX-NEXT: vpshufb %xmm1, %xmm12, %xmm12
-; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm8
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT: vpblendw $109, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = mem[0],xmm0[1],mem[2,3],xmm0[4],mem[5,6],xmm0[7]
-; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm9[2,1,2,3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[2,1,2,3,4,5,6,7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0,1],xmm0[2,3,4,5,6,7]
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX-NEXT: vpblendw $219, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm10 # 16-byte Folded Reload
-; AVX-NEXT: # xmm10 = mem[0,1],xmm5[2],mem[3,4],xmm5[5],mem[6,7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm14[1],xmm10[2,3],xmm14[4],xmm10[5,6],xmm14[7]
-; AVX-NEXT: vpshufb %xmm1, %xmm9, %xmm9
-; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm9, %ymm0
-; AVX-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload
-; AVX-NEXT: vpblendw $146, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm7 # 16-byte Folded Reload
-; AVX-NEXT: # xmm7 = xmm5[0],mem[1],xmm5[2,3],mem[4],xmm5[5,6],mem[7]
-; AVX-NEXT: vpshufb %xmm2, %xmm7, %xmm7
-; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[2,1,2,3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[2,1,2,3,4,5,6,7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3,4,5,6,7]
-; AVX-NEXT: vpblendw $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
-; AVX-NEXT: # xmm4 = xmm4[0,1],mem[2],xmm4[3,4],mem[5],xmm4[6,7]
-; AVX-NEXT: vpblendw $146, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm3 # 16-byte Folded Reload
-; AVX-NEXT: # xmm3 = xmm4[0],mem[1],xmm4[2,3],mem[4],xmm4[5,6],mem[7]
-; AVX-NEXT: vpshufb %xmm1, %xmm3, %xmm3
-; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3
-; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm13[0],xmm11[1],xmm13[2,3],xmm11[4],xmm13[5,6],xmm11[7]
-; AVX-NEXT: vpshufb %xmm2, %xmm4, %xmm2
-; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[2,1,2,3]
+; AVX-NEXT: vpblendw $146, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3],mem[4],xmm0[5,6],mem[7]
+; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
+; AVX-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm7[2,1,2,3]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[2,1,2,3,4,5,6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0,1],xmm0[2,3,4,5,6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm8[0,1],xmm14[2],xmm8[3,4],xmm14[5],xmm8[6,7]
+; AVX-NEXT: vpblendw $146, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
+; AVX-NEXT: # xmm9 = xmm9[0],mem[1],xmm9[2,3],mem[4],xmm9[5,6],mem[7]
+; AVX-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15]
+; AVX-NEXT: vpshufb %xmm11, %xmm9, %xmm9
+; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm9, %ymm9
+; AVX-NEXT: vpblendw $109, (%rsp), %xmm2, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = mem[0],xmm2[1],mem[2,3],xmm2[4],mem[5,6],xmm2[7]
+; AVX-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm5[2,1,2,3]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[2,1,2,3,4,5,6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,3,4,5,6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm12[0,1],xmm15[2],xmm12[3,4],xmm15[5],xmm12[6,7]
+; AVX-NEXT: vpblendw $146, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm7 # 16-byte Folded Reload
+; AVX-NEXT: # xmm7 = xmm8[0],mem[1],xmm8[2,3],mem[4],xmm8[5,6],mem[7]
+; AVX-NEXT: vpshufb %xmm11, %xmm7, %xmm7
+; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX-NEXT: vpblendw $146, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload
+; AVX-NEXT: # xmm5 = xmm2[0],mem[1],xmm2[2,3],mem[4],xmm2[5,6],mem[7]
+; AVX-NEXT: vmovdqa %xmm4, %xmm7
+; AVX-NEXT: vpshufb %xmm4, %xmm5, %xmm5
+; AVX-NEXT: vpshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
+; AVX-NEXT: # xmm4 = mem[2,1,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3,4,5,6,7]
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX-NEXT: vpblendw $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
-; AVX-NEXT: # xmm4 = xmm4[0,1],mem[2],xmm4[3,4],mem[5],xmm4[6,7]
-; AVX-NEXT: vpblendw $146, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
-; AVX-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3],mem[4],xmm4[5,6],mem[7]
-; AVX-NEXT: vpshufb %xmm1, %xmm4, %xmm1
-; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm2, (%rsi)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm2, 64(%rsi)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm2, 96(%rsi)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm2, 32(%rsi)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm2, 64(%rdx)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm2, (%rdx)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm2, 96(%rdx)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm2, 32(%rdx)
-; AVX-NEXT: vmovaps %ymm1, 64(%rcx)
-; AVX-NEXT: vmovaps %ymm3, (%rcx)
+; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3,4,5,6,7]
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX-NEXT: vpblendw $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
+; AVX-NEXT: # xmm2 = xmm2[0,1],mem[2],xmm2[3,4],mem[5],xmm2[6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7]
+; AVX-NEXT: vpshufb %xmm11, %xmm1, %xmm1
+; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0],xmm3[1],xmm6[2,3],xmm3[4],xmm6[5,6],xmm3[7]
+; AVX-NEXT: vpshufb %xmm7, %xmm2, %xmm2
+; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[2,1,2,3]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7]
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX-NEXT: vpblendw $219, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
+; AVX-NEXT: # xmm3 = mem[0,1],xmm3[2],mem[3,4],xmm3[5],mem[6,7]
+; AVX-NEXT: vpblendw $146, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
+; AVX-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3],mem[4],xmm3[5,6],mem[7]
+; AVX-NEXT: vpshufb %xmm11, %xmm3, %xmm3
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm3, (%rsi)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm3, 64(%rsi)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm3, 96(%rsi)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm3, 32(%rsi)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm3, 64(%rdx)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm3, (%rdx)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm3, 96(%rdx)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm3, 32(%rdx)
+; AVX-NEXT: vmovaps %ymm2, 64(%rcx)
+; AVX-NEXT: vmovaps %ymm1, (%rcx)
; AVX-NEXT: vmovaps %ymm0, 96(%rcx)
-; AVX-NEXT: vmovaps %ymm8, 32(%rcx)
-; AVX-NEXT: addq $408, %rsp # imm = 0x198
+; AVX-NEXT: vmovaps %ymm9, 32(%rcx)
+; AVX-NEXT: addq $488, %rsp # imm = 0x1E8
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX2-LABEL: load_i16_stride3_vf64:
; AVX2: # %bb.0:
-; AVX2-NEXT: subq $136, %rsp
-; AVX2-NEXT: vmovdqa (%rdi), %ymm1
-; AVX2-NEXT: vmovdqa 32(%rdi), %ymm2
+; AVX2-NEXT: subq $200, %rsp
+; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX2-NEXT: vmovdqa 192(%rdi), %ymm4
; AVX2-NEXT: vmovdqa 224(%rdi), %ymm5
-; AVX2-NEXT: vmovdqa 288(%rdi), %ymm9
-; AVX2-NEXT: vmovdqa 320(%rdi), %ymm10
+; AVX2-NEXT: vmovdqa 288(%rdi), %ymm10
+; AVX2-NEXT: vmovdqa 320(%rdi), %ymm11
; AVX2-NEXT: vmovdqa 96(%rdi), %ymm12
; AVX2-NEXT: vmovdqa 128(%rdi), %ymm13
-; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
-; AVX2-NEXT: vpblendvb %ymm11, %ymm12, %ymm13, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7],ymm0[8],ymm3[9],ymm0[10,11],ymm3[12],ymm0[13,14],ymm3[15]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
-; AVX2-NEXT: vpblendvb %ymm11, %ymm9, %ymm10, %ymm3
-; AVX2-NEXT: vpblendvb %ymm11, %ymm4, %ymm5, %ymm8
-; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
-; AVX2-NEXT: vpblendvb %ymm0, %ymm13, %ymm12, %ymm6
-; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
+; AVX2-NEXT: vpblendvb %ymm3, %ymm12, %ymm13, %ymm8
+; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm2
+; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0],xmm2[1],xmm8[2,3],xmm2[4],xmm8[5,6],xmm2[7]
+; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT: vpblendvb %ymm3, %ymm10, %ymm11, %ymm2
+; AVX2-NEXT: vpblendvb %ymm3, %ymm4, %ymm5, %ymm6
+; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm14 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
+; AVX2-NEXT: vpblendvb %ymm14, %ymm13, %ymm12, %ymm7
+; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0]
-; AVX2-NEXT: vpblendvb %ymm7, %ymm13, %ymm12, %ymm6
-; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpblendvb %ymm0, %ymm10, %ymm9, %ymm13
-; AVX2-NEXT: vpblendvb %ymm7, %ymm10, %ymm9, %ymm6
-; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpblendvb %ymm0, %ymm5, %ymm4, %ymm10
+; AVX2-NEXT: vpblendvb %ymm7, %ymm13, %ymm12, %ymm9
+; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vpblendvb %ymm14, %ymm11, %ymm10, %ymm13
+; AVX2-NEXT: vpblendvb %ymm7, %ymm11, %ymm10, %ymm9
+; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vpblendvb %ymm14, %ymm5, %ymm4, %ymm9
+; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpblendvb %ymm7, %ymm5, %ymm4, %ymm4
-; AVX2-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill
-; AVX2-NEXT: vmovdqa 176(%rdi), %xmm5
-; AVX2-NEXT: vpblendvb %ymm11, %ymm1, %ymm2, %ymm11
-; AVX2-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm14
-; AVX2-NEXT: vpblendvb %ymm7, %ymm2, %ymm1, %ymm6
-; AVX2-NEXT: vmovdqa 160(%rdi), %xmm7
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1],xmm5[2],xmm7[3,4],xmm5[5],xmm7[6,7]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
-; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm1
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0,1,2],ymm1[3,4,5,6,7],ymm15[8,9,10],ymm1[11,12,13,14,15]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm15[0,1,2,3,6,5,4,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm3[2,3,0,1]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7],ymm3[8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13,14],ymm1[15]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
-; AVX2-NEXT: vpshufb %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vmovdqa 368(%rdi), %xmm15
-; AVX2-NEXT: vmovdqa 352(%rdi), %xmm4
-; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm15[2],xmm4[3,4],xmm15[5],xmm4[6,7]
-; AVX2-NEXT: vpshufb %xmm0, %xmm2, %xmm2
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm8[2,3,0,1]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1],ymm8[2,3],ymm1[4],ymm8[5,6],ymm1[7],ymm8[8],ymm1[9],ymm8[10,11],ymm1[12],ymm8[13,14],ymm1[15]
-; AVX2-NEXT: vpshufb %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vmovdqa %ymm3, %ymm2
-; AVX2-NEXT: vmovdqa 272(%rdi), %xmm8
-; AVX2-NEXT: vmovdqa 256(%rdi), %xmm3
-; AVX2-NEXT: vpblendw {{.*#+}} xmm12 = xmm3[0,1],xmm8[2],xmm3[3,4],xmm8[5],xmm3[6,7]
-; AVX2-NEXT: vpshufb %xmm0, %xmm12, %xmm12
-; AVX2-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX2-NEXT: vpblendw {{.*#+}} ymm12 = ymm1[0,1,2],ymm12[3,4,5,6,7],ymm1[8,9,10],ymm12[11,12,13,14,15]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4,5,6,7]
-; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm11[2,3,0,1]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0],ymm1[1],ymm11[2,3],ymm1[4],ymm11[5,6],ymm1[7],ymm11[8],ymm1[9],ymm11[10,11],ymm1[12],ymm11[13,14],ymm1[15]
-; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm12
-; AVX2-NEXT: vmovdqa 80(%rdi), %xmm2
-; AVX2-NEXT: vmovdqa 64(%rdi), %xmm11
-; AVX2-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1],xmm2[2],xmm11[3,4],xmm2[5],xmm11[6,7]
-; AVX2-NEXT: vpshufb %xmm0, %xmm9, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3,4,5,6,7],ymm12[8,9,10],ymm0[11,12,13,14,15]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm9 = xmm12[0,1,2,3,6,5,4,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm10 = [16,17,22,23,28,29,0,0,16,17,22,23,28,29,0,0,16,17,22,23,28,29,0,0,16,17,22,23,28,29,0,0]
+; AVX2-NEXT: vpshufb %ymm10, %ymm8, %ymm4
+; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm3
+; AVX2-NEXT: vpblendvb %ymm14, %ymm1, %ymm0, %ymm15
+; AVX2-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm1[2,3,0,1]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
-; AVX2-NEXT: vpshufb %ymm9, %ymm0, %ymm0
-; AVX2-NEXT: vpblendw {{.*#+}} xmm12 = xmm5[0,1],xmm7[2],xmm5[3,4],xmm7[5],xmm5[6,7]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
-; AVX2-NEXT: vpshufb %xmm1, %xmm12, %xmm12
-; AVX2-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX2-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0,1,2],ymm12[3,4,5,6,7],ymm0[8,9,10],ymm12[11,12,13,14,15]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm13[2,3,0,1]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7,8,9],ymm12[10],ymm13[11,12],ymm12[13],ymm13[14,15]
-; AVX2-NEXT: vpshufb %ymm9, %ymm12, %ymm12
-; AVX2-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1],xmm4[2],xmm15[3,4],xmm4[5],xmm15[6,7]
-; AVX2-NEXT: vpshufb %xmm1, %xmm13, %xmm13
-; AVX2-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX2-NEXT: vpblendw {{.*#+}} ymm13 = ymm12[0,1,2],ymm13[3,4,5,6,7],ymm12[8,9,10],ymm13[11,12,13,14,15]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,6,7,4]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0,1,2,3],ymm13[4,5,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7,8,9],ymm12[10],ymm10[11,12],ymm12[13],ymm10[14,15]
-; AVX2-NEXT: vpshufb %ymm9, %ymm10, %ymm10
-; AVX2-NEXT: vpblendw {{.*#+}} xmm12 = xmm8[0,1],xmm3[2],xmm8[3,4],xmm3[5],xmm8[6,7]
+; AVX2-NEXT: vmovdqa 160(%rdi), %xmm11
+; AVX2-NEXT: vmovdqa 176(%rdi), %xmm9
+; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0,1],xmm9[2],xmm11[3,4],xmm9[5],xmm11[6,7]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
+; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm0[3,4,5,6,7],ymm4[8,9,10],ymm0[11,12,13,14,15]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11]
+; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX2-NEXT: vpshufb %xmm0, %xmm5, %xmm5
+; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4
+; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5,6],xmm4[7]
+; AVX2-NEXT: vpshufb %ymm10, %ymm2, %ymm2
+; AVX2-NEXT: vmovdqa 352(%rdi), %xmm8
+; AVX2-NEXT: vmovdqa 368(%rdi), %xmm14
+; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1],xmm14[2],xmm8[3,4],xmm14[5],xmm8[6,7]
+; AVX2-NEXT: vpshufb %xmm1, %xmm5, %xmm5
+; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3,4,5,6,7],ymm2[8,9,10],ymm5[11,12,13,14,15]
+; AVX2-NEXT: vpshufb %xmm0, %xmm4, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill
+; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm2
+; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0],xmm2[1],xmm6[2,3],xmm2[4],xmm6[5,6],xmm2[7]
+; AVX2-NEXT: vpshufb %ymm10, %ymm6, %ymm5
+; AVX2-NEXT: vmovdqa 256(%rdi), %xmm7
+; AVX2-NEXT: vmovdqa 272(%rdi), %xmm6
+; AVX2-NEXT: vpblendw {{.*#+}} xmm12 = xmm7[0,1],xmm6[2],xmm7[3,4],xmm6[5],xmm7[6,7]
; AVX2-NEXT: vpshufb %xmm1, %xmm12, %xmm12
; AVX2-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX2-NEXT: vpblendw {{.*#+}} ymm12 = ymm10[0,1,2],ymm12[3,4,5,6,7],ymm10[8,9,10],ymm12[11,12,13,14,15]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,6,7,4]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm14[2,3,0,1]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0,1],ymm12[2],ymm14[3,4],ymm12[5],ymm14[6,7,8,9],ymm12[10],ymm14[11,12],ymm12[13],ymm14[14,15]
-; AVX2-NEXT: vpshufb %ymm9, %ymm12, %ymm9
-; AVX2-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm11[2],xmm2[3,4],xmm11[5],xmm2[6,7]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm12[3,4,5,6,7],ymm5[8,9,10],ymm12[11,12,13,14,15]
+; AVX2-NEXT: vpshufb %xmm0, %xmm4, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovdqa 64(%rdi), %xmm5
+; AVX2-NEXT: vmovdqa 80(%rdi), %xmm4
+; AVX2-NEXT: vpblendw {{.*#+}} xmm12 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7]
; AVX2-NEXT: vpshufb %xmm1, %xmm12, %xmm1
+; AVX2-NEXT: vpshufb %ymm10, %ymm3, %ymm10
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0,1,2],ymm1[3,4,5,6,7],ymm9[8,9,10],ymm1[11,12,13,14,15]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,6,7,4]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm9[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm9[2,3,0,1]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm9[1,2],ymm1[3],ymm9[4,5],ymm1[6],ymm9[7],ymm1[8],ymm9[9,10],ymm1[11],ymm9[12,13],ymm1[14],ymm9[15]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3],xmm5[4],xmm7[5,6],xmm5[7]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
-; AVX2-NEXT: vpshufb %ymm7, %ymm1, %ymm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
-; AVX2-NEXT: vpshufb %xmm9, %xmm5, %xmm5
-; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3,4],ymm5[5,6,7]
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm12[2,3,0,1]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm12[1,2],ymm1[3],ymm12[4,5],ymm1[6],ymm12[7],ymm1[8],ymm12[9,10],ymm1[11],ymm12[12,13],ymm1[14],ymm12[15]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm15[1],xmm4[2,3],xmm15[4],xmm4[5,6],xmm15[7]
-; AVX2-NEXT: vpshufb %ymm7, %ymm1, %ymm1
-; AVX2-NEXT: vpshufb %xmm9, %xmm4, %xmm4
-; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5,6,7]
-; AVX2-NEXT: vmovdqu (%rsp), %ymm12 # 32-byte Reload
-; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm12[2,3,0,1]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm12[1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7],ymm4[8],ymm12[9,10],ymm4[11],ymm12[12,13],ymm4[14],ymm12[15]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2,3],xmm8[4],xmm3[5,6],xmm8[7]
-; AVX2-NEXT: vpshufb %ymm7, %ymm4, %ymm4
-; AVX2-NEXT: vpshufb %xmm9, %xmm3, %xmm3
+; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0,1,2],ymm1[3,4,5,6,7],ymm10[8,9,10],ymm1[11,12,13,14,15]
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm10
+; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm10[1],xmm3[2,3],xmm10[4],xmm3[5,6],xmm10[7]
+; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm10 = [18,19,24,25,30,31,0,0,18,19,24,25,30,31,0,0,18,19,24,25,30,31,0,0,18,19,24,25,30,31,0,0]
+; AVX2-NEXT: vpshufb %ymm10, %ymm1, %ymm1
+; AVX2-NEXT: vpblendw {{.*#+}} xmm12 = xmm9[0,1],xmm11[2],xmm9[3,4],xmm11[5],xmm9[6,7]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
+; AVX2-NEXT: vpshufb %xmm2, %xmm12, %xmm12
+; AVX2-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm12[3,4,5,6,7],ymm1[8,9,10],ymm12[11,12,13,14,15]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm12 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
+; AVX2-NEXT: vpshufb %xmm12, %xmm0, %xmm0
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: vextracti128 $1, %ymm13, %xmm0
+; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1],xmm0[2],xmm13[3,4],xmm0[5],xmm13[6,7]
+; AVX2-NEXT: vpshufb %ymm10, %ymm13, %ymm13
+; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm14[0,1],xmm8[2],xmm14[3,4],xmm8[5],xmm14[6,7]
+; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm3
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm6[2,3,0,1]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1,2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7],ymm4[8],ymm6[9,10],ymm4[11],ymm6[12,13],ymm4[14],ymm6[15]
-; AVX2-NEXT: vpshufb %ymm7, %ymm4, %ymm4
-; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm11[0],xmm2[1],xmm11[2,3],xmm2[4],xmm11[5,6],xmm2[7]
-; AVX2-NEXT: vpshufb %xmm9, %xmm2, %xmm2
+; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm13[0,1,2],ymm3[3,4,5,6,7],ymm13[8,9,10],ymm3[11,12,13,14,15]
+; AVX2-NEXT: vpshufb %xmm12, %xmm0, %xmm0
+; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7]
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb %ymm10, %ymm15, %ymm3
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15]
+; AVX2-NEXT: vextracti128 $1, %ymm15, %xmm3
+; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0,1],xmm3[2],xmm15[3,4],xmm3[5],xmm15[6,7]
+; AVX2-NEXT: vpshufb %xmm12, %xmm3, %xmm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX2-NEXT: vextracti128 $1, %ymm15, %xmm3
+; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0,1],xmm3[2],xmm15[3,4],xmm3[5],xmm15[6,7]
+; AVX2-NEXT: vpshufb %xmm12, %xmm3, %xmm3
+; AVX2-NEXT: vpshufb %ymm10, %ymm15, %ymm10
+; AVX2-NEXT: vpblendw {{.*#+}} xmm12 = xmm6[0,1],xmm7[2],xmm6[3,4],xmm7[5],xmm6[6,7]
+; AVX2-NEXT: vpshufb %xmm2, %xmm12, %xmm2
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm4, (%rsi)
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm4, 64(%rsi)
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm4, 96(%rsi)
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm4, 32(%rsi)
-; AVX2-NEXT: vmovdqa %ymm10, 64(%rdx)
-; AVX2-NEXT: vmovdqa %ymm14, (%rdx)
+; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0,1,2],ymm2[3,4,5,6,7],ymm10[8,9,10],ymm2[11,12,13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm10[2,3,0,1]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm10[1,2],ymm3[3],ymm10[4,5],ymm3[6],ymm10[7],ymm3[8],ymm10[9,10],ymm3[11],ymm10[12,13],ymm3[14],ymm10[15]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0],xmm9[1],xmm11[2,3],xmm9[4],xmm11[5,6],xmm9[7]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
+; AVX2-NEXT: vpshufb %ymm10, %ymm3, %ymm3
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
+; AVX2-NEXT: vpshufb %xmm11, %xmm9, %xmm9
+; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm3[0,1,2,3,4],ymm9[5,6,7]
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm12[2,3,0,1]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm12[1,2],ymm3[3],ymm12[4,5],ymm3[6],ymm12[7],ymm3[8],ymm12[9,10],ymm3[11],ymm12[12,13],ymm3[14],ymm12[15]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm14[1],xmm8[2,3],xmm14[4],xmm8[5,6],xmm14[7]
+; AVX2-NEXT: vpshufb %ymm10, %ymm3, %ymm3
+; AVX2-NEXT: vpshufb %xmm11, %xmm8, %xmm8
+; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm8[5,6,7]
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm12[2,3,0,1]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm12[1,2],ymm8[3],ymm12[4,5],ymm8[6],ymm12[7],ymm8[8],ymm12[9,10],ymm8[11],ymm12[12,13],ymm8[14],ymm12[15]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3],xmm6[4],xmm7[5,6],xmm6[7]
+; AVX2-NEXT: vpshufb %ymm10, %ymm8, %ymm7
+; AVX2-NEXT: vpshufb %xmm11, %xmm6, %xmm6
+; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7]
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm8[2,3,0,1]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm8[1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7],ymm7[8],ymm8[9,10],ymm7[11],ymm8[12,13],ymm7[14],ymm8[15]
+; AVX2-NEXT: vpshufb %ymm10, %ymm7, %ymm7
+; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3],xmm4[4],xmm5[5,6],xmm4[7]
+; AVX2-NEXT: vpshufb %xmm11, %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4],ymm4[5,6,7]
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm5, (%rsi)
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm5, 64(%rsi)
+; AVX2-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm5, 96(%rsi)
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm5, 32(%rsi)
+; AVX2-NEXT: vmovdqa %ymm2, 64(%rdx)
+; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
; AVX2-NEXT: vmovdqa %ymm13, 96(%rdx)
-; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx)
-; AVX2-NEXT: vmovdqa %ymm3, 64(%rcx)
-; AVX2-NEXT: vmovdqa %ymm2, (%rcx)
-; AVX2-NEXT: vmovdqa %ymm1, 96(%rcx)
-; AVX2-NEXT: vmovdqa %ymm5, 32(%rcx)
-; AVX2-NEXT: addq $136, %rsp
+; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx)
+; AVX2-NEXT: vmovdqa %ymm6, 64(%rcx)
+; AVX2-NEXT: vmovdqa %ymm4, (%rcx)
+; AVX2-NEXT: vmovdqa %ymm3, 96(%rcx)
+; AVX2-NEXT: vmovdqa %ymm9, 32(%rcx)
+; AVX2-NEXT: addq $200, %rsp
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX2-FP-LABEL: load_i16_stride3_vf64:
; AVX2-FP: # %bb.0:
-; AVX2-FP-NEXT: subq $136, %rsp
-; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm1
-; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm2
+; AVX2-FP-NEXT: subq $200, %rsp
+; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm4
; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm5
-; AVX2-FP-NEXT: vmovdqa 288(%rdi), %ymm9
-; AVX2-FP-NEXT: vmovdqa 320(%rdi), %ymm10
+; AVX2-FP-NEXT: vmovdqa 288(%rdi), %ymm10
+; AVX2-FP-NEXT: vmovdqa 320(%rdi), %ymm11
; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm12
; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm13
-; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
-; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm12, %ymm13, %ymm0
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7],ymm0[8],ymm3[9],ymm0[10,11],ymm3[12],ymm0[13,14],ymm3[15]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
-; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm9, %ymm10, %ymm3
-; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm4, %ymm5, %ymm8
-; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
-; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm13, %ymm12, %ymm6
-; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
+; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm12, %ymm13, %ymm8
+; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm2
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0],xmm2[1],xmm8[2,3],xmm2[4],xmm8[5,6],xmm2[7]
+; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm10, %ymm11, %ymm2
+; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm4, %ymm5, %ymm6
+; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
+; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm13, %ymm12, %ymm7
+; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0]
-; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm13, %ymm12, %ymm6
-; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm10, %ymm9, %ymm13
-; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm10, %ymm9, %ymm6
-; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm5, %ymm4, %ymm10
+; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm13, %ymm12, %ymm9
+; AVX2-FP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm11, %ymm10, %ymm13
+; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm11, %ymm10, %ymm9
+; AVX2-FP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm5, %ymm4, %ymm9
+; AVX2-FP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm5, %ymm4, %ymm4
-; AVX2-FP-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill
-; AVX2-FP-NEXT: vmovdqa 176(%rdi), %xmm5
-; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm1, %ymm2, %ymm11
-; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm14
-; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm2, %ymm1, %ymm6
-; AVX2-FP-NEXT: vmovdqa 160(%rdi), %xmm7
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1],xmm5[2],xmm7[3,4],xmm5[5],xmm7[6,7]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
-; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm1
-; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0,1,2],ymm1[3,4,5,6,7],ymm15[8,9,10],ymm1[11,12,13,14,15]
-; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm15[0,1,2,3,6,5,4,7]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm3[2,3,0,1]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7],ymm3[8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13,14],ymm1[15]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
-; AVX2-FP-NEXT: vpshufb %ymm3, %ymm1, %ymm1
-; AVX2-FP-NEXT: vmovdqa 368(%rdi), %xmm15
-; AVX2-FP-NEXT: vmovdqa 352(%rdi), %xmm4
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm15[2],xmm4[3,4],xmm15[5],xmm4[6,7]
-; AVX2-FP-NEXT: vpshufb %xmm0, %xmm2, %xmm2
-; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15]
-; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm8[2,3,0,1]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1],ymm8[2,3],ymm1[4],ymm8[5,6],ymm1[7],ymm8[8],ymm1[9],ymm8[10,11],ymm1[12],ymm8[13,14],ymm1[15]
-; AVX2-FP-NEXT: vpshufb %ymm3, %ymm1, %ymm1
-; AVX2-FP-NEXT: vmovdqa %ymm3, %ymm2
-; AVX2-FP-NEXT: vmovdqa 272(%rdi), %xmm8
-; AVX2-FP-NEXT: vmovdqa 256(%rdi), %xmm3
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm12 = xmm3[0,1],xmm8[2],xmm3[3,4],xmm8[5],xmm3[6,7]
-; AVX2-FP-NEXT: vpshufb %xmm0, %xmm12, %xmm12
-; AVX2-FP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm12 = ymm1[0,1,2],ymm12[3,4,5,6,7],ymm1[8,9,10],ymm12[11,12,13,14,15]
-; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4,5,6,7]
-; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm11[2,3,0,1]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0],ymm1[1],ymm11[2,3],ymm1[4],ymm11[5,6],ymm1[7],ymm11[8],ymm1[9],ymm11[10,11],ymm1[12],ymm11[13,14],ymm1[15]
-; AVX2-FP-NEXT: vpshufb %ymm2, %ymm1, %ymm12
-; AVX2-FP-NEXT: vmovdqa 80(%rdi), %xmm2
-; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm11
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1],xmm2[2],xmm11[3,4],xmm2[5],xmm11[6,7]
-; AVX2-FP-NEXT: vpshufb %xmm0, %xmm9, %xmm0
-; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3,4,5,6,7],ymm12[8,9,10],ymm0[11,12,13,14,15]
-; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm9 = xmm12[0,1,2,3,6,5,4,7]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm10 = [16,17,22,23,28,29,0,0,16,17,22,23,28,29,0,0,16,17,22,23,28,29,0,0,16,17,22,23,28,29,0,0]
+; AVX2-FP-NEXT: vpshufb %ymm10, %ymm8, %ymm4
+; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm3
+; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm1, %ymm0, %ymm15
+; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm1[2,3,0,1]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
-; AVX2-FP-NEXT: vpshufb %ymm9, %ymm0, %ymm0
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm12 = xmm5[0,1],xmm7[2],xmm5[3,4],xmm7[5],xmm5[6,7]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
-; AVX2-FP-NEXT: vpshufb %xmm1, %xmm12, %xmm12
-; AVX2-FP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0,1,2],ymm12[3,4,5,6,7],ymm0[8,9,10],ymm12[11,12,13,14,15]
-; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7]
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm13[2,3,0,1]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7,8,9],ymm12[10],ymm13[11,12],ymm12[13],ymm13[14,15]
-; AVX2-FP-NEXT: vpshufb %ymm9, %ymm12, %ymm12
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1],xmm4[2],xmm15[3,4],xmm4[5],xmm15[6,7]
-; AVX2-FP-NEXT: vpshufb %xmm1, %xmm13, %xmm13
-; AVX2-FP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm13 = ymm12[0,1,2],ymm13[3,4,5,6,7],ymm12[8,9,10],ymm13[11,12,13,14,15]
-; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,6,7,4]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0,1,2,3],ymm13[4,5,6,7]
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7,8,9],ymm12[10],ymm10[11,12],ymm12[13],ymm10[14,15]
-; AVX2-FP-NEXT: vpshufb %ymm9, %ymm10, %ymm10
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm12 = xmm8[0,1],xmm3[2],xmm8[3,4],xmm3[5],xmm8[6,7]
+; AVX2-FP-NEXT: vmovdqa 160(%rdi), %xmm11
+; AVX2-FP-NEXT: vmovdqa 176(%rdi), %xmm9
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0,1],xmm9[2],xmm11[3,4],xmm9[5],xmm11[6,7]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
+; AVX2-FP-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm0[3,4,5,6,7],ymm4[8,9,10],ymm0[11,12,13,14,15]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11]
+; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX2-FP-NEXT: vpshufb %xmm0, %xmm5, %xmm5
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm4
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5,6],xmm4[7]
+; AVX2-FP-NEXT: vpshufb %ymm10, %ymm2, %ymm2
+; AVX2-FP-NEXT: vmovdqa 352(%rdi), %xmm8
+; AVX2-FP-NEXT: vmovdqa 368(%rdi), %xmm14
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1],xmm14[2],xmm8[3,4],xmm14[5],xmm8[6,7]
+; AVX2-FP-NEXT: vpshufb %xmm1, %xmm5, %xmm5
+; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3,4,5,6,7],ymm2[8,9,10],ymm5[11,12,13,14,15]
+; AVX2-FP-NEXT: vpshufb %xmm0, %xmm4, %xmm4
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-FP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill
+; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm2
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0],xmm2[1],xmm6[2,3],xmm2[4],xmm6[5,6],xmm2[7]
+; AVX2-FP-NEXT: vpshufb %ymm10, %ymm6, %ymm5
+; AVX2-FP-NEXT: vmovdqa 256(%rdi), %xmm7
+; AVX2-FP-NEXT: vmovdqa 272(%rdi), %xmm6
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm12 = xmm7[0,1],xmm6[2],xmm7[3,4],xmm6[5],xmm7[6,7]
; AVX2-FP-NEXT: vpshufb %xmm1, %xmm12, %xmm12
; AVX2-FP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm12 = ymm10[0,1,2],ymm12[3,4,5,6,7],ymm10[8,9,10],ymm12[11,12,13,14,15]
-; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,6,7,4]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7]
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm14[2,3,0,1]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0,1],ymm12[2],ymm14[3,4],ymm12[5],ymm14[6,7,8,9],ymm12[10],ymm14[11,12],ymm12[13],ymm14[14,15]
-; AVX2-FP-NEXT: vpshufb %ymm9, %ymm12, %ymm9
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm11[2],xmm2[3,4],xmm11[5],xmm2[6,7]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm12[3,4,5,6,7],ymm5[8,9,10],ymm12[11,12,13,14,15]
+; AVX2-FP-NEXT: vpshufb %xmm0, %xmm4, %xmm4
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm5
+; AVX2-FP-NEXT: vmovdqa 80(%rdi), %xmm4
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm12 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7]
; AVX2-FP-NEXT: vpshufb %xmm1, %xmm12, %xmm1
+; AVX2-FP-NEXT: vpshufb %ymm10, %ymm3, %ymm10
; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0,1,2],ymm1[3,4,5,6,7],ymm9[8,9,10],ymm1[11,12,13,14,15]
-; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,6,7,4]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm9[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm9[2,3,0,1]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm9[1,2],ymm1[3],ymm9[4,5],ymm1[6],ymm9[7],ymm1[8],ymm9[9,10],ymm1[11],ymm9[12,13],ymm1[14],ymm9[15]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3],xmm5[4],xmm7[5,6],xmm5[7]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
-; AVX2-FP-NEXT: vpshufb %ymm7, %ymm1, %ymm1
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
-; AVX2-FP-NEXT: vpshufb %xmm9, %xmm5, %xmm5
-; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3,4],ymm5[5,6,7]
-; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm12[2,3,0,1]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm12[1,2],ymm1[3],ymm12[4,5],ymm1[6],ymm12[7],ymm1[8],ymm12[9,10],ymm1[11],ymm12[12,13],ymm1[14],ymm12[15]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm15[1],xmm4[2,3],xmm15[4],xmm4[5,6],xmm15[7]
-; AVX2-FP-NEXT: vpshufb %ymm7, %ymm1, %ymm1
-; AVX2-FP-NEXT: vpshufb %xmm9, %xmm4, %xmm4
-; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5,6,7]
-; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm12 # 32-byte Reload
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm12[2,3,0,1]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm12[1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7],ymm4[8],ymm12[9,10],ymm4[11],ymm12[12,13],ymm4[14],ymm12[15]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2,3],xmm8[4],xmm3[5,6],xmm8[7]
-; AVX2-FP-NEXT: vpshufb %ymm7, %ymm4, %ymm4
-; AVX2-FP-NEXT: vpshufb %xmm9, %xmm3, %xmm3
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0,1,2],ymm1[3,4,5,6,7],ymm10[8,9,10],ymm1[11,12,13,14,15]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm10
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm10[1],xmm3[2,3],xmm10[4],xmm3[5,6],xmm10[7]
+; AVX2-FP-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7]
+; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm10 = [18,19,24,25,30,31,0,0,18,19,24,25,30,31,0,0,18,19,24,25,30,31,0,0,18,19,24,25,30,31,0,0]
+; AVX2-FP-NEXT: vpshufb %ymm10, %ymm1, %ymm1
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm12 = xmm9[0,1],xmm11[2],xmm9[3,4],xmm11[5],xmm9[6,7]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
+; AVX2-FP-NEXT: vpshufb %xmm2, %xmm12, %xmm12
+; AVX2-FP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm12[3,4,5,6,7],ymm1[8,9,10],ymm12[11,12,13,14,15]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm12 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
+; AVX2-FP-NEXT: vpshufb %xmm12, %xmm0, %xmm0
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm13, %xmm0
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1],xmm0[2],xmm13[3,4],xmm0[5],xmm13[6,7]
+; AVX2-FP-NEXT: vpshufb %ymm10, %ymm13, %ymm13
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm14[0,1],xmm8[2],xmm14[3,4],xmm8[5],xmm14[6,7]
+; AVX2-FP-NEXT: vpshufb %xmm2, %xmm3, %xmm3
; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7]
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm6[2,3,0,1]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1,2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7],ymm4[8],ymm6[9,10],ymm4[11],ymm6[12,13],ymm4[14],ymm6[15]
-; AVX2-FP-NEXT: vpshufb %ymm7, %ymm4, %ymm4
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm11[0],xmm2[1],xmm11[2,3],xmm2[4],xmm11[5,6],xmm2[7]
-; AVX2-FP-NEXT: vpshufb %xmm9, %xmm2, %xmm2
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm13[0,1,2],ymm3[3,4,5,6,7],ymm13[8,9,10],ymm3[11,12,13,14,15]
+; AVX2-FP-NEXT: vpshufb %xmm12, %xmm0, %xmm0
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7]
+; AVX2-FP-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FP-NEXT: vpshufb %ymm10, %ymm15, %ymm3
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm15, %xmm3
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0,1],xmm3[2],xmm15[3,4],xmm3[5],xmm15[6,7]
+; AVX2-FP-NEXT: vpshufb %xmm12, %xmm3, %xmm3
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX2-FP-NEXT: vextracti128 $1, %ymm15, %xmm3
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0,1],xmm3[2],xmm15[3,4],xmm3[5],xmm15[6,7]
+; AVX2-FP-NEXT: vpshufb %xmm12, %xmm3, %xmm3
+; AVX2-FP-NEXT: vpshufb %ymm10, %ymm15, %ymm10
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm12 = xmm6[0,1],xmm7[2],xmm6[3,4],xmm7[5],xmm6[6,7]
+; AVX2-FP-NEXT: vpshufb %xmm2, %xmm12, %xmm2
; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FP-NEXT: vmovaps %ymm4, (%rsi)
-; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FP-NEXT: vmovaps %ymm4, 64(%rsi)
-; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FP-NEXT: vmovaps %ymm4, 96(%rsi)
-; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rsi)
-; AVX2-FP-NEXT: vmovdqa %ymm10, 64(%rdx)
-; AVX2-FP-NEXT: vmovdqa %ymm14, (%rdx)
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0,1,2],ymm2[3,4,5,6,7],ymm10[8,9,10],ymm2[11,12,13,14,15]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm10[2,3,0,1]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm10[1,2],ymm3[3],ymm10[4,5],ymm3[6],ymm10[7],ymm3[8],ymm10[9,10],ymm3[11],ymm10[12,13],ymm3[14],ymm10[15]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0],xmm9[1],xmm11[2,3],xmm9[4],xmm11[5,6],xmm9[7]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
+; AVX2-FP-NEXT: vpshufb %ymm10, %ymm3, %ymm3
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
+; AVX2-FP-NEXT: vpshufb %xmm11, %xmm9, %xmm9
+; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm3[0,1,2,3,4],ymm9[5,6,7]
+; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm12[2,3,0,1]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm12[1,2],ymm3[3],ymm12[4,5],ymm3[6],ymm12[7],ymm3[8],ymm12[9,10],ymm3[11],ymm12[12,13],ymm3[14],ymm12[15]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm14[1],xmm8[2,3],xmm14[4],xmm8[5,6],xmm14[7]
+; AVX2-FP-NEXT: vpshufb %ymm10, %ymm3, %ymm3
+; AVX2-FP-NEXT: vpshufb %xmm11, %xmm8, %xmm8
+; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm8[5,6,7]
+; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm12[2,3,0,1]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm12[1,2],ymm8[3],ymm12[4,5],ymm8[6],ymm12[7],ymm8[8],ymm12[9,10],ymm8[11],ymm12[12,13],ymm8[14],ymm12[15]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3],xmm6[4],xmm7[5,6],xmm6[7]
+; AVX2-FP-NEXT: vpshufb %ymm10, %ymm8, %ymm7
+; AVX2-FP-NEXT: vpshufb %xmm11, %xmm6, %xmm6
+; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7]
+; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm8[2,3,0,1]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm8[1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7],ymm7[8],ymm8[9,10],ymm7[11],ymm8[12,13],ymm7[14],ymm8[15]
+; AVX2-FP-NEXT: vpshufb %ymm10, %ymm7, %ymm7
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3],xmm4[4],xmm5[5,6],xmm4[7]
+; AVX2-FP-NEXT: vpshufb %xmm11, %xmm4, %xmm4
+; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4],ymm4[5,6,7]
+; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT: vmovaps %ymm5, (%rsi)
+; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT: vmovaps %ymm5, 64(%rsi)
+; AVX2-FP-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT: vmovaps %ymm5, 96(%rsi)
+; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT: vmovaps %ymm5, 32(%rsi)
+; AVX2-FP-NEXT: vmovdqa %ymm2, 64(%rdx)
+; AVX2-FP-NEXT: vmovdqa %ymm0, (%rdx)
; AVX2-FP-NEXT: vmovdqa %ymm13, 96(%rdx)
-; AVX2-FP-NEXT: vmovdqa %ymm0, 32(%rdx)
-; AVX2-FP-NEXT: vmovdqa %ymm3, 64(%rcx)
-; AVX2-FP-NEXT: vmovdqa %ymm2, (%rcx)
-; AVX2-FP-NEXT: vmovdqa %ymm1, 96(%rcx)
-; AVX2-FP-NEXT: vmovdqa %ymm5, 32(%rcx)
-; AVX2-FP-NEXT: addq $136, %rsp
+; AVX2-FP-NEXT: vmovdqa %ymm1, 32(%rdx)
+; AVX2-FP-NEXT: vmovdqa %ymm6, 64(%rcx)
+; AVX2-FP-NEXT: vmovdqa %ymm4, (%rcx)
+; AVX2-FP-NEXT: vmovdqa %ymm3, 96(%rcx)
+; AVX2-FP-NEXT: vmovdqa %ymm9, 32(%rcx)
+; AVX2-FP-NEXT: addq $200, %rsp
; AVX2-FP-NEXT: vzeroupper
; AVX2-FP-NEXT: retq
;
; AVX2-FCP-LABEL: load_i16_stride3_vf64:
; AVX2-FCP: # %bb.0:
-; AVX2-FCP-NEXT: subq $136, %rsp
-; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm1
-; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm2
+; AVX2-FCP-NEXT: subq $200, %rsp
+; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm4
; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm5
-; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm9
-; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm10
+; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm10
+; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm11
; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm12
; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm13
-; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
-; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm12, %ymm13, %ymm0
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7],ymm0[8],ymm3[9],ymm0[10,11],ymm3[12],ymm0[13,14],ymm3[15]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
-; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm9, %ymm10, %ymm3
-; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm4, %ymm5, %ymm8
-; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
-; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm13, %ymm12, %ymm6
-; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
+; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm12, %ymm13, %ymm8
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm8, %xmm2
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0],xmm2[1],xmm8[2,3],xmm2[4],xmm8[5,6],xmm2[7]
+; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm10, %ymm11, %ymm2
+; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm4, %ymm5, %ymm6
+; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535]
+; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm13, %ymm12, %ymm7
+; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,65535,0,0,65535,0,0,65535,0,0,65535,0,0,65535,0,0]
-; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm13, %ymm12, %ymm6
-; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm10, %ymm9, %ymm13
-; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm10, %ymm9, %ymm6
-; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm5, %ymm4, %ymm10
+; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm13, %ymm12, %ymm9
+; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm11, %ymm10, %ymm13
+; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm11, %ymm10, %ymm9
+; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm5, %ymm4, %ymm9
+; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm5, %ymm4, %ymm4
-; AVX2-FCP-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa 176(%rdi), %xmm5
-; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm1, %ymm2, %ymm11
-; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm14
-; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm2, %ymm1, %ymm6
-; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %xmm7
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1],xmm5[2],xmm7[3,4],xmm5[5],xmm7[6,7]
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
-; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm1
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0,1,2],ymm1[3,4,5,6,7],ymm15[8,9,10],ymm1[11,12,13,14,15]
-; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm15[0,1,2,3,6,5,4,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm3[2,3,0,1]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6],ymm1[7],ymm3[8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13,14],ymm1[15]
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
-; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1
-; AVX2-FCP-NEXT: vmovdqa 368(%rdi), %xmm15
-; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %xmm4
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm15[2],xmm4[3,4],xmm15[5],xmm4[6,7]
-; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm2
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15]
-; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm8[2,3,0,1]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1],ymm8[2,3],ymm1[4],ymm8[5,6],ymm1[7],ymm8[8],ymm1[9],ymm8[10,11],ymm1[12],ymm8[13,14],ymm1[15]
-; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1
-; AVX2-FCP-NEXT: vmovdqa %ymm3, %ymm2
-; AVX2-FCP-NEXT: vmovdqa 272(%rdi), %xmm8
-; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %xmm3
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm3[0,1],xmm8[2],xmm3[3,4],xmm8[5],xmm3[6,7]
-; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm12
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm1[0,1,2],ymm12[3,4,5,6,7],ymm1[8,9,10],ymm12[11,12,13,14,15]
-; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm11[2,3,0,1]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0],ymm1[1],ymm11[2,3],ymm1[4],ymm11[5,6],ymm1[7],ymm11[8],ymm1[9],ymm11[10,11],ymm1[12],ymm11[13,14],ymm1[15]
-; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm12
-; AVX2-FCP-NEXT: vmovdqa 80(%rdi), %xmm2
-; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm11
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1],xmm2[2],xmm11[3,4],xmm2[5],xmm11[6,7]
-; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm9, %xmm0
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3,4,5,6,7],ymm12[8,9,10],ymm0[11,12,13,14,15]
-; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm9 = xmm12[0,1,2,3,6,5,4,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm10 = [16,17,22,23,28,29,0,0,16,17,22,23,28,29,0,0,16,17,22,23,28,29,0,0,16,17,22,23,28,29,0,0]
+; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm4
+; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm3
+; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm1, %ymm0, %ymm15
+; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm1[2,3,0,1]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
-; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm0
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm5[0,1],xmm7[2],xmm5[3,4],xmm7[5],xmm5[6,7]
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
-; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm12, %xmm12
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0,1,2],ymm12[3,4,5,6,7],ymm0[8,9,10],ymm12[11,12,13,14,15]
-; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm13[2,3,0,1]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7,8,9],ymm12[10],ymm13[11,12],ymm12[13],ymm13[14,15]
-; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm12, %ymm12
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1],xmm4[2],xmm15[3,4],xmm4[5],xmm15[6,7]
-; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm13, %xmm13
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm12[0,1,2],ymm13[3,4,5,6,7],ymm12[8,9,10],ymm13[11,12,13,14,15]
-; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,6,7,4]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0,1,2,3],ymm13[4,5,6,7]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7,8,9],ymm12[10],ymm10[11,12],ymm12[13],ymm10[14,15]
-; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm10, %ymm10
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm8[0,1],xmm3[2],xmm8[3,4],xmm3[5],xmm8[6,7]
+; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %xmm11
+; AVX2-FCP-NEXT: vmovdqa 176(%rdi), %xmm9
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0,1],xmm9[2],xmm11[3,4],xmm9[5],xmm11[6,7]
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
+; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm0[3,4,5,6,7],ymm4[8,9,10],ymm0[11,12,13,14,15]
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11]
+; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm5
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5,6],xmm4[7]
+; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm2
+; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %xmm8
+; AVX2-FCP-NEXT: vmovdqa 368(%rdi), %xmm14
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1],xmm14[2],xmm8[3,4],xmm14[5],xmm8[6,7]
+; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm5
+; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3,4,5,6,7],ymm2[8,9,10],ymm5[11,12,13,14,15]
+; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm4
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm2
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0],xmm2[1],xmm6[2,3],xmm2[4],xmm6[5,6],xmm2[7]
+; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm6, %ymm5
+; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %xmm7
+; AVX2-FCP-NEXT: vmovdqa 272(%rdi), %xmm6
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm7[0,1],xmm6[2],xmm7[3,4],xmm6[5],xmm7[6,7]
; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm12, %xmm12
; AVX2-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm10[0,1,2],ymm12[3,4,5,6,7],ymm10[8,9,10],ymm12[11,12,13,14,15]
-; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,6,7,4]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm14[2,3,0,1]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0,1],ymm12[2],ymm14[3,4],ymm12[5],ymm14[6,7,8,9],ymm12[10],ymm14[11,12],ymm12[13],ymm14[14,15]
-; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm12, %ymm9
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm11[2],xmm2[3,4],xmm11[5],xmm2[6,7]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm12[3,4,5,6,7],ymm5[8,9,10],ymm12[11,12,13,14,15]
+; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm4
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm5
+; AVX2-FCP-NEXT: vmovdqa 80(%rdi), %xmm4
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7]
; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm12, %xmm1
+; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm3, %ymm10
; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0,1,2],ymm1[3,4,5,6,7],ymm9[8,9,10],ymm1[11,12,13,14,15]
-; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,6,7,4]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm9[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm9[2,3,0,1]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm9[1,2],ymm1[3],ymm9[4,5],ymm1[6],ymm9[7],ymm1[8],ymm9[9,10],ymm1[11],ymm9[12,13],ymm1[14],ymm9[15]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3],xmm5[4],xmm7[5,6],xmm5[7]
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
-; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm1
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
-; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm5, %xmm5
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3,4],ymm5[5,6,7]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm12[2,3,0,1]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm12[1,2],ymm1[3],ymm12[4,5],ymm1[6],ymm12[7],ymm1[8],ymm12[9,10],ymm1[11],ymm12[12,13],ymm1[14],ymm12[15]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm15[1],xmm4[2,3],xmm15[4],xmm4[5,6],xmm15[7]
-; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm1
-; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm4, %xmm4
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5,6,7]
-; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm12 # 32-byte Reload
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm12[2,3,0,1]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm12[1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7],ymm4[8],ymm12[9,10],ymm4[11],ymm12[12,13],ymm4[14],ymm12[15]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2,3],xmm8[4],xmm3[5,6],xmm8[7]
-; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm4
-; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm3
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0,1,2],ymm1[3,4,5,6,7],ymm10[8,9,10],ymm1[11,12,13,14,15]
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm10
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm10[1],xmm3[2,3],xmm10[4],xmm3[5,6],xmm10[7]
+; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7]
+; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm10 = [18,19,24,25,30,31,0,0,18,19,24,25,30,31,0,0,18,19,24,25,30,31,0,0,18,19,24,25,30,31,0,0]
+; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm1
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm9[0,1],xmm11[2],xmm9[3,4],xmm11[5],xmm9[6,7]
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
+; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm12, %xmm12
+; AVX2-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm12[3,4,5,6,7],ymm1[8,9,10],ymm12[11,12,13,14,15]
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
+; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm0, %xmm0
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm13, %xmm0
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1],xmm0[2],xmm13[3,4],xmm0[5],xmm13[6,7]
+; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm13, %ymm13
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm14[0,1],xmm8[2],xmm14[3,4],xmm8[5],xmm14[6,7]
+; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm3
; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm6[2,3,0,1]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm6[1,2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7],ymm4[8],ymm6[9,10],ymm4[11],ymm6[12,13],ymm4[14],ymm6[15]
-; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm4
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm11[0],xmm2[1],xmm11[2,3],xmm2[4],xmm11[5,6],xmm2[7]
-; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm2
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm13[0,1,2],ymm3[3,4,5,6,7],ymm13[8,9,10],ymm3[11,12,13,14,15]
+; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm0, %xmm0
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7]
+; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm15, %ymm3
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15]
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm15, %xmm3
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0,1],xmm3[2],xmm15[3,4],xmm3[5],xmm15[6,7]
+; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm3
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm15, %xmm3
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0,1],xmm3[2],xmm15[3,4],xmm3[5],xmm15[6,7]
+; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm3
+; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm15, %ymm10
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm6[0,1],xmm7[2],xmm6[3,4],xmm7[5],xmm6[6,7]
+; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm12, %xmm2
; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT: vmovaps %ymm4, (%rsi)
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT: vmovaps %ymm4, 64(%rsi)
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT: vmovaps %ymm4, 96(%rsi)
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%rsi)
-; AVX2-FCP-NEXT: vmovdqa %ymm10, 64(%rdx)
-; AVX2-FCP-NEXT: vmovdqa %ymm14, (%rdx)
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0,1,2],ymm2[3,4,5,6,7],ymm10[8,9,10],ymm2[11,12,13,14,15]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm10[2,3,0,1]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm10[1,2],ymm3[3],ymm10[4,5],ymm3[6],ymm10[7],ymm3[8],ymm10[9,10],ymm3[11],ymm10[12,13],ymm3[14],ymm10[15]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0],xmm9[1],xmm11[2,3],xmm9[4],xmm11[5,6],xmm9[7]
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
+; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm3, %ymm3
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
+; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm9, %xmm9
+; AVX2-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm3[0,1,2,3,4],ymm9[5,6,7]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm12[2,3,0,1]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm12[1,2],ymm3[3],ymm12[4,5],ymm3[6],ymm12[7],ymm3[8],ymm12[9,10],ymm3[11],ymm12[12,13],ymm3[14],ymm12[15]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm14[1],xmm8[2,3],xmm14[4],xmm8[5,6],xmm14[7]
+; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm3, %ymm3
+; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm8
+; AVX2-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm8[5,6,7]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm12[2,3,0,1]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm12[1,2],ymm8[3],ymm12[4,5],ymm8[6],ymm12[7],ymm8[8],ymm12[9,10],ymm8[11],ymm12[12,13],ymm8[14],ymm12[15]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3],xmm6[4],xmm7[5,6],xmm6[7]
+; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm7
+; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm6, %xmm6
+; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm8[2,3,0,1]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm8[1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7],ymm7[8],ymm8[9,10],ymm7[11],ymm8[12,13],ymm7[14],ymm8[15]
+; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm7, %ymm7
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3],xmm4[4],xmm5[5,6],xmm4[7]
+; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm4, %xmm4
+; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4],ymm4[5,6,7]
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT: vmovaps %ymm5, (%rsi)
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT: vmovaps %ymm5, 64(%rsi)
+; AVX2-FCP-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT: vmovaps %ymm5, 96(%rsi)
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT: vmovaps %ymm5, 32(%rsi)
+; AVX2-FCP-NEXT: vmovdqa %ymm2, 64(%rdx)
+; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rdx)
; AVX2-FCP-NEXT: vmovdqa %ymm13, 96(%rdx)
-; AVX2-FCP-NEXT: vmovdqa %ymm0, 32(%rdx)
-; AVX2-FCP-NEXT: vmovdqa %ymm3, 64(%rcx)
-; AVX2-FCP-NEXT: vmovdqa %ymm2, (%rcx)
-; AVX2-FCP-NEXT: vmovdqa %ymm1, 96(%rcx)
-; AVX2-FCP-NEXT: vmovdqa %ymm5, 32(%rcx)
-; AVX2-FCP-NEXT: addq $136, %rsp
+; AVX2-FCP-NEXT: vmovdqa %ymm1, 32(%rdx)
+; AVX2-FCP-NEXT: vmovdqa %ymm6, 64(%rcx)
+; AVX2-FCP-NEXT: vmovdqa %ymm4, (%rcx)
+; AVX2-FCP-NEXT: vmovdqa %ymm3, 96(%rcx)
+; AVX2-FCP-NEXT: vmovdqa %ymm9, 32(%rcx)
+; AVX2-FCP-NEXT: addq $200, %rsp
; AVX2-FCP-NEXT: vzeroupper
; AVX2-FCP-NEXT: retq
;
; AVX512-LABEL: load_i16_stride3_vf64:
; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
-; AVX512-NEXT: vmovdqa64 224(%rdi), %ymm18
-; AVX512-NEXT: vmovdqa64 192(%rdi), %ymm20
-; AVX512-NEXT: vmovdqa %ymm0, %ymm1
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm18 ^ (ymm1 & (ymm20 ^ ymm18))
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
+; AVX512-NEXT: vmovdqa64 320(%rdi), %ymm18
+; AVX512-NEXT: vmovdqa64 352(%rdi), %ymm21
+; AVX512-NEXT: vmovdqa %ymm5, %ymm1
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm18 ^ (ymm1 & (ymm21 ^ ymm18))
; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
-; AVX512-NEXT: vpshufb %ymm7, %ymm2, %ymm5
-; AVX512-NEXT: vmovdqa 272(%rdi), %xmm1
-; AVX512-NEXT: vmovdqa 256(%rdi), %xmm3
-; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7]
-; AVX512-NEXT: vmovdqa64 %xmm1, %xmm19
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
-; AVX512-NEXT: vpshufb %xmm13, %xmm6, %xmm6
-; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
-; AVX512-NEXT: vmovdqa64 320(%rdi), %ymm21
-; AVX512-NEXT: vmovdqa64 352(%rdi), %ymm22
-; AVX512-NEXT: vmovdqa %ymm0, %ymm8
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm21 ^ (ymm8 & (ymm22 ^ ymm21))
-; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5,6],ymm9[7],ymm8[8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13,14],ymm9[15]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27]
-; AVX512-NEXT: vpshufb %ymm10, %ymm8, %ymm11
-; AVX512-NEXT: vmovdqa 304(%rdi), %xmm8
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27]
+; AVX512-NEXT: vpshufb %ymm1, %ymm2, %ymm2
+; AVX512-NEXT: vmovdqa 304(%rdi), %xmm0
; AVX512-NEXT: vmovdqa 288(%rdi), %xmm4
-; AVX512-NEXT: vpblendw {{.*#+}} xmm12 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6],xmm8[7]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15]
-; AVX512-NEXT: vpshufb %xmm14, %xmm12, %xmm12
-; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7]
-; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm5, %zmm16
-; AVX512-NEXT: vmovdqa64 128(%rdi), %ymm23
-; AVX512-NEXT: vmovdqa 160(%rdi), %ymm11
-; AVX512-NEXT: vmovdqa %ymm0, %ymm5
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm23 ^ (ymm5 & (ymm11 ^ ymm23))
-; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm5[2,3,0,1]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3],ymm12[4],ymm5[5,6],ymm12[7],ymm5[8],ymm12[9],ymm5[10,11],ymm12[12],ymm5[13,14],ymm12[15]
-; AVX512-NEXT: vpshufb %ymm10, %ymm5, %ymm10
-; AVX512-NEXT: vmovdqa 112(%rdi), %xmm15
-; AVX512-NEXT: vmovdqa 96(%rdi), %xmm5
-; AVX512-NEXT: vpblendw {{.*#+}} xmm12 = xmm5[0],xmm15[1],xmm5[2,3],xmm15[4],xmm5[5,6],xmm15[7]
-; AVX512-NEXT: vpshufb %xmm14, %xmm12, %xmm12
-; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2],ymm10[3,4,5,6,7]
-; AVX512-NEXT: vmovdqa64 (%rdi), %ymm24
-; AVX512-NEXT: vmovdqa 32(%rdi), %ymm12
-; AVX512-NEXT: vmovdqa %ymm0, %ymm10
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = ymm12 ^ (ymm10 & (ymm24 ^ ymm12))
-; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm10[2,3,0,1]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm1[1],ymm10[2,3],ymm1[4],ymm10[5,6],ymm1[7],ymm10[8],ymm1[9],ymm10[10,11],ymm1[12],ymm10[13,14],ymm1[15]
-; AVX512-NEXT: vpshufb %ymm7, %ymm1, %ymm7
-; AVX512-NEXT: vmovdqa 80(%rdi), %xmm10
-; AVX512-NEXT: vmovdqa 64(%rdi), %xmm1
-; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm10[2],xmm1[3,4],xmm10[5],xmm1[6,7]
-; AVX512-NEXT: vpshufb %xmm13, %xmm2, %xmm2
-; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1,2],ymm2[3,4,5,6,7],ymm7[8,9,10],ymm2[11,12,13,14,15]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,4,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm17
-; AVX512-NEXT: vmovdqa %ymm0, %ymm2
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm22 ^ (ymm2 & (ymm21 ^ ymm22))
-; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3,4],ymm6[5],ymm2[6,7,8,9],ymm6[10],ymm2[11,12],ymm6[13],ymm2[14,15]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
-; AVX512-NEXT: vpshufb %ymm9, %ymm2, %ymm2
-; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0,1],xmm8[2],xmm4[3,4],xmm8[5],xmm4[6,7]
-; AVX512-NEXT: vmovdqa64 %xmm8, %xmm25
-; AVX512-NEXT: vmovdqa64 %xmm4, %xmm26
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
-; AVX512-NEXT: vpshufb %xmm6, %xmm7, %xmm7
-; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm2[5,6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm0[1],xmm4[2,3],xmm0[4],xmm4[5,6],xmm0[7]
+; AVX512-NEXT: vmovdqa64 %xmm4, %xmm25
+; AVX512-NEXT: vmovdqa64 %xmm0, %xmm26
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15]
+; AVX512-NEXT: vpshufb %xmm10, %xmm3, %xmm3
+; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm2[3,4,5,6,7]
+; AVX512-NEXT: vmovdqa64 224(%rdi), %ymm22
+; AVX512-NEXT: vmovdqa64 192(%rdi), %ymm23
+; AVX512-NEXT: vmovdqa %ymm5, %ymm2
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm22 ^ (ymm2 & (ymm23 ^ ymm22))
+; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm6 = [16,17,22,23,28,29,0,0,16,17,22,23,28,29,0,0,16,17,22,23,28,29,0,0,16,17,22,23,28,29,0,0]
+; AVX512-NEXT: vpshufb %ymm6, %ymm2, %ymm9
+; AVX512-NEXT: vmovdqa 272(%rdi), %xmm0
+; AVX512-NEXT: vmovdqa 256(%rdi), %xmm4
+; AVX512-NEXT: vpblendw {{.*#+}} xmm11 = xmm4[0,1],xmm0[2],xmm4[3,4],xmm0[5],xmm4[6,7]
+; AVX512-NEXT: vmovdqa64 %xmm4, %xmm27
+; AVX512-NEXT: vmovdqa64 %xmm0, %xmm28
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
+; AVX512-NEXT: vpshufb %xmm4, %xmm11, %xmm11
+; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1,2],ymm11[3,4,5,6,7],ymm9[8,9,10],ymm11[11,12,13,14,15]
+; AVX512-NEXT: vmovdqa 208(%rdi), %xmm0
+; AVX512-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0],xmm0[1],xmm2[2,3],xmm0[4],xmm2[5,6],xmm0[7]
+; AVX512-NEXT: vmovdqa64 %xmm0, %xmm29
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11]
+; AVX512-NEXT: vpshufb %xmm2, %xmm12, %xmm12
+; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7]
+; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm9, %zmm16
+; AVX512-NEXT: vmovdqa64 128(%rdi), %ymm24
+; AVX512-NEXT: vmovdqa 160(%rdi), %ymm12
+; AVX512-NEXT: vmovdqa %ymm5, %ymm3
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm12 ^ ymm24))
+; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm3[2,3,0,1]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm9[1],ymm3[2,3],ymm9[4],ymm3[5,6],ymm9[7],ymm3[8],ymm9[9],ymm3[10,11],ymm9[12],ymm3[13,14],ymm9[15]
+; AVX512-NEXT: vpshufb %ymm1, %ymm3, %ymm1
+; AVX512-NEXT: vmovdqa 112(%rdi), %xmm7
+; AVX512-NEXT: vmovdqa 96(%rdi), %xmm14
+; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm14[0],xmm7[1],xmm14[2,3],xmm7[4],xmm14[5,6],xmm7[7]
+; AVX512-NEXT: vpshufb %xmm10, %xmm3, %xmm3
+; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm1[3,4,5,6,7]
+; AVX512-NEXT: vmovdqa 80(%rdi), %xmm15
+; AVX512-NEXT: vmovdqa 64(%rdi), %xmm10
+; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0,1],xmm15[2],xmm10[3,4],xmm15[5],xmm10[6,7]
+; AVX512-NEXT: vpshufb %xmm4, %xmm1, %xmm4
+; AVX512-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX512-NEXT: vmovdqa64 (%rdi), %ymm19
+; AVX512-NEXT: vmovdqa %ymm5, %ymm0
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm1 ^ (ymm0 & (ymm19 ^ ymm1))
+; AVX512-NEXT: vpshufb %ymm6, %ymm0, %ymm6
+; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2],ymm4[3,4,5,6,7],ymm6[8,9,10],ymm4[11,12,13,14,15]
+; AVX512-NEXT: vmovdqa 16(%rdi), %xmm4
+; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3],xmm4[4],xmm0[5,6],xmm4[7]
+; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
+; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm17
+; AVX512-NEXT: vmovdqa %ymm5, %ymm0
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm24 ^ ymm12))
+; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7,8,9],ymm2[10],ymm0[11,12],ymm2[13],ymm0[14,15]
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
+; AVX512-NEXT: vpshufb %ymm8, %ymm0, %ymm0
+; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0,1],xmm7[2],xmm14[3,4],xmm7[5],xmm14[6,7]
+; AVX512-NEXT: vmovdqa64 %xmm7, %xmm30
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm9 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
+; AVX512-NEXT: vpshufb %xmm9, %xmm2, %xmm2
+; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm0[5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX512-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
; AVX512-NEXT: vmovdqa %ymm13, %ymm2
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm20 ^ (ymm2 & (ymm18 ^ ymm20))
-; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
-; AVX512-NEXT: vpshufb %ymm4, %ymm2, %ymm2
-; AVX512-NEXT: vmovdqa64 %xmm19, %xmm8
-; AVX512-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1],xmm3[2],xmm8[3,4],xmm3[5],xmm8[6,7]
-; AVX512-NEXT: vmovdqa64 %xmm3, %xmm27
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
-; AVX512-NEXT: vpshufb %xmm3, %xmm14, %xmm14
-; AVX512-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512-NEXT: vpblendw {{.*#+}} ymm14 = ymm2[0,1,2],ymm14[3,4,5,6,7],ymm2[8,9,10],ymm14[11,12,13,14,15]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,7,4]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7]
-; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm19
-; AVX512-NEXT: vmovdqa %ymm0, %ymm2
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm11 ^ (ymm2 & (ymm23 ^ ymm11))
-; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7,8,9],ymm7[10],ymm2[11,12],ymm7[13],ymm2[14,15]
-; AVX512-NEXT: vpshufb %ymm9, %ymm2, %ymm2
-; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0,1],xmm15[2],xmm5[3,4],xmm15[5],xmm5[6,7]
-; AVX512-NEXT: vpshufb %xmm6, %xmm7, %xmm6
-; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm2[5,6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-NEXT: vmovdqa %ymm13, %ymm6
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm24 ^ (ymm6 & (ymm12 ^ ymm24))
-; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15]
-; AVX512-NEXT: vpshufb %ymm4, %ymm6, %ymm4
-; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1],xmm1[2],xmm10[3,4],xmm1[5],xmm10[6,7]
-; AVX512-NEXT: vpshufb %xmm3, %xmm6, %xmm3
-; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
-; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = ymm23 ^ (ymm13 & (ymm11 ^ ymm23))
-; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm11[2,3,0,1]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm11[1,2],ymm3[3],ymm11[4,5],ymm3[6],ymm11[7],ymm3[8],ymm11[9,10],ymm3[11],ymm11[12,13],ymm3[14],ymm11[15]
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm19 ^ (ymm2 & (ymm1 ^ ymm19))
+; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1],xmm10[2],xmm15[3,4],xmm10[5],xmm15[6,7]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
+; AVX512-NEXT: vpshufb %xmm6, %xmm0, %xmm0
+; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm7 = [18,19,24,25,30,31,0,0,18,19,24,25,30,31,0,0,18,19,24,25,30,31,0,0,18,19,24,25,30,31,0,0]
+; AVX512-NEXT: vpshufb %ymm7, %ymm2, %ymm11
+; AVX512-NEXT: vmovdqa64 %ymm7, %ymm31
+; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7],ymm11[8,9,10],ymm0[11,12,13,14,15]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3,4],xmm4[5],xmm2[6,7]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm11 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
+; AVX512-NEXT: vpshufb %xmm11, %xmm2, %xmm2
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm20
+; AVX512-NEXT: vmovdqa %ymm5, %ymm0
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm21 ^ (ymm0 & (ymm18 ^ ymm21))
+; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7,8,9],ymm2[10],ymm0[11,12],ymm2[13],ymm0[14,15]
+; AVX512-NEXT: vpshufb %ymm8, %ymm0, %ymm0
+; AVX512-NEXT: vmovdqa64 %xmm25, %xmm7
+; AVX512-NEXT: vmovdqa64 %xmm26, %xmm4
+; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1],xmm4[2],xmm7[3,4],xmm4[5],xmm7[6,7]
+; AVX512-NEXT: vpshufb %xmm9, %xmm2, %xmm2
+; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm0[5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-NEXT: vmovdqa64 %xmm27, %xmm9
+; AVX512-NEXT: vmovdqa64 %xmm28, %xmm8
+; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0,1],xmm9[2],xmm8[3,4],xmm9[5],xmm8[6,7]
+; AVX512-NEXT: vpshufb %xmm6, %xmm2, %xmm2
+; AVX512-NEXT: vmovdqa %ymm13, %ymm3
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm23 ^ (ymm3 & (ymm22 ^ ymm23))
+; AVX512-NEXT: vmovdqa64 %ymm31, %ymm6
+; AVX512-NEXT: vpshufb %ymm6, %ymm3, %ymm6
+; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3,4,5,6,7],ymm6[8,9,10],ymm2[11,12,13,14,15]
+; AVX512-NEXT: vmovdqa64 %xmm29, %xmm6
+; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm6[2],xmm3[3,4],xmm6[5],xmm3[6,7]
+; AVX512-NEXT: vpshufb %xmm11, %xmm3, %xmm3
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = ymm24 ^ (ymm13 & (ymm12 ^ ymm24))
+; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm12[2,3,0,1]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm12[1,2],ymm2[3],ymm12[4,5],ymm2[6],ymm12[7],ymm2[8],ymm12[9,10],ymm2[11],ymm12[12,13],ymm2[14],ymm12[15]
; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
-; AVX512-NEXT: vpshufb %ymm11, %ymm3, %ymm3
-; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1],xmm5[2],xmm15[3,4],xmm5[5],xmm15[6,7]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
-; AVX512-NEXT: vpshufb %xmm5, %xmm4, %xmm4
-; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = ymm24 ^ (ymm0 & (ymm12 ^ ymm24))
-; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm12[2,3,0,1]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm12[1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7],ymm4[8],ymm12[9,10],ymm4[11],ymm12[12,13],ymm4[14],ymm12[15]
-; AVX512-NEXT: vpshufb %ymm11, %ymm4, %ymm4
-; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm10[1],xmm1[2,3],xmm10[4],xmm1[5,6],xmm10[7]
+; AVX512-NEXT: vpshufb %ymm11, %ymm2, %ymm2
+; AVX512-NEXT: vmovdqa64 %xmm30, %xmm3
+; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm14[2],xmm3[3,4],xmm14[5],xmm3[6,7]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm12 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
+; AVX512-NEXT: vpshufb %xmm12, %xmm3, %xmm3
+; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm2[5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm19 ^ (ymm5 & (ymm1 ^ ymm19))
+; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7],ymm3[8],ymm1[9,10],ymm3[11],ymm1[12,13],ymm3[14],ymm1[15]
+; AVX512-NEXT: vpshufb %ymm11, %ymm1, %ymm1
+; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm10[0],xmm15[1],xmm10[2,3],xmm15[4],xmm10[5,6],xmm15[7]
; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
-; AVX512-NEXT: vpshufb %xmm6, %xmm1, %xmm1
-; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7]
-; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm13 = ymm21 ^ (ymm13 & (ymm22 ^ ymm21))
-; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm13[2,3,0,1]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm13[1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7],ymm3[8],ymm13[9,10],ymm3[11],ymm13[12,13],ymm3[14],ymm13[15]
-; AVX512-NEXT: vmovdqa64 %xmm25, %xmm4
-; AVX512-NEXT: vmovdqa64 %xmm26, %xmm7
-; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7]
-; AVX512-NEXT: vpshufb %xmm5, %xmm4, %xmm4
+; AVX512-NEXT: vpshufb %xmm6, %xmm3, %xmm3
+; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7]
+; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm13 = ymm18 ^ (ymm13 & (ymm21 ^ ymm18))
+; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm13[2,3,0,1]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm13[1,2],ymm2[3],ymm13[4,5],ymm2[6],ymm13[7],ymm2[8],ymm13[9,10],ymm2[11],ymm13[12,13],ymm2[14],ymm13[15]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7]
+; AVX512-NEXT: vpshufb %xmm12, %xmm3, %xmm3
+; AVX512-NEXT: vpshufb %ymm11, %ymm2, %ymm2
+; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm2[5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm23 ^ (ymm5 & (ymm22 ^ ymm23))
+; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm5[2,3,0,1]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1,2],ymm3[3],ymm5[4,5],ymm3[6],ymm5[7],ymm3[8],ymm5[9,10],ymm3[11],ymm5[12,13],ymm3[14],ymm5[15]
; AVX512-NEXT: vpshufb %ymm11, %ymm3, %ymm3
-; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm20 ^ (ymm0 & (ymm18 ^ ymm20))
-; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7],ymm4[8],ymm0[9,10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15]
-; AVX512-NEXT: vpshufb %ymm11, %ymm0, %ymm0
-; AVX512-NEXT: vmovdqa64 %xmm27, %xmm4
-; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6],xmm8[7]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm9[0],xmm8[1],xmm9[2,3],xmm8[4],xmm9[5,6],xmm8[7]
; AVX512-NEXT: vpshufb %xmm6, %xmm4, %xmm4
; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7]
-; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7]
+; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
; AVX512-NEXT: vmovdqa64 %zmm17, (%rsi)
; AVX512-NEXT: vmovdqa64 %zmm16, 64(%rsi)
-; AVX512-NEXT: vmovdqa64 %zmm19, 64(%rdx)
-; AVX512-NEXT: vmovdqa64 %zmm2, (%rdx)
-; AVX512-NEXT: vmovdqa64 %zmm0, 64(%rcx)
+; AVX512-NEXT: vmovdqa64 %zmm0, 64(%rdx)
+; AVX512-NEXT: vmovdqa64 %zmm20, (%rdx)
+; AVX512-NEXT: vmovdqa64 %zmm2, 64(%rcx)
; AVX512-NEXT: vmovdqa64 %zmm1, (%rcx)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -3590,162 +3638,169 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-FCP-LABEL: load_i16_stride3_vf64:
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
-; AVX512-FCP-NEXT: vmovdqa64 224(%rdi), %ymm18
-; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm20
-; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm1
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm18 ^ (ymm1 & (ymm20 ^ ymm18))
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
-; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm5
-; AVX512-FCP-NEXT: vmovdqa 272(%rdi), %xmm1
-; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %xmm3
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm19
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
-; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm6, %xmm6
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %ymm21
-; AVX512-FCP-NEXT: vmovdqa64 352(%rdi), %ymm22
-; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm8
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm21 ^ (ymm8 & (ymm22 ^ ymm21))
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5,6],ymm9[7],ymm8[8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13,14],ymm9[15]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27]
-; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm11
-; AVX512-FCP-NEXT: vmovdqa 304(%rdi), %xmm8
+; AVX512-FCP-NEXT: vmovdqa64 224(%rdi), %ymm20
+; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm21
+; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm2
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm20 ^ (ymm2 & (ymm21 ^ ymm20))
+; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm1 = [16,17,22,23,28,29,0,0,16,17,22,23,28,29,0,0,16,17,22,23,28,29,0,0,16,17,22,23,28,29,0,0]
+; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm3
+; AVX512-FCP-NEXT: vmovdqa 272(%rdi), %xmm4
+; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %xmm6
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm4[2],xmm6[3,4],xmm4[5],xmm6[6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm6, %xmm25
+; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm26
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
+; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm5
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3,4,5,6,7],ymm3[8,9,10],ymm5[11,12,13,14,15]
+; AVX512-FCP-NEXT: vmovdqa 208(%rdi), %xmm4
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5,6],xmm4[7]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm29
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11]
+; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm2
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %ymm22
+; AVX512-FCP-NEXT: vmovdqa64 352(%rdi), %ymm23
+; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm2
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm22 ^ (ymm2 & (ymm23 ^ ymm22))
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm2[2,3,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm8[1],ymm2[2,3],ymm8[4],ymm2[5,6],ymm8[7],ymm2[8],ymm8[9],ymm2[10,11],ymm8[12],ymm2[13,14],ymm8[15]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27]
+; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm11
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [24,25,26,3,4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa 304(%rdi), %xmm2
; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %xmm4
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6],xmm8[7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm12
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm5, %zmm16
-; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %ymm23
-; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm11
-; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm5
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm23 ^ (ymm5 & (ymm11 ^ ymm23))
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm5[2,3,0,1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3],ymm12[4],ymm5[5,6],ymm12[7],ymm5[8],ymm12[9],ymm5[10,11],ymm12[12],ymm5[13,14],ymm12[15]
-; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm5, %ymm10
-; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm15
-; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm5
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm5[0],xmm15[1],xmm5[2,3],xmm15[4],xmm5[5,6],xmm15[7]
-; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm12
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2],ymm10[3,4,5,6,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm4[0],xmm2[1],xmm4[2,3],xmm2[4],xmm4[5,6],xmm2[7]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm27
+; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm28
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm13, %xmm13
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm13, %zmm0, %zmm13
+; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm16, %zmm11
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm3, %zmm17
+; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm11
; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm24
-; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm12
-; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm10
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm12 ^ (ymm10 & (ymm24 ^ ymm12))
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm10[2,3,0,1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm1[1],ymm10[2,3],ymm1[4],ymm10[5,6],ymm1[7],ymm10[8],ymm1[9],ymm10[10,11],ymm1[12],ymm10[13,14],ymm1[15]
-; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm7
-; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm10
-; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm1
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm10[2],xmm1[3,4],xmm10[5],xmm1[6,7]
-; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm2
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1,2],ymm2[3,4,5,6,7],ymm7[8,9,10],ymm2[11,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,4,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm17
+; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm3
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm11 ^ (ymm3 & (ymm24 ^ ymm11))
+; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm1
+; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm8
+; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm15
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1],xmm8[2],xmm15[3,4],xmm8[5],xmm15[6,7]
+; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm4, %xmm4
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3,4,5,6,7],ymm1[8,9,10],ymm4[11,12,13,14,15]
+; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm6
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3],xmm6[4],xmm3[5,6],xmm6[7]
+; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm3
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %ymm18
+; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm5
+; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm1
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm18 ^ (ymm1 & (ymm5 ^ ymm18))
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3],ymm4[4],ymm1[5,6],ymm4[7],ymm1[8],ymm4[9],ymm1[10,11],ymm4[12],ymm1[13,14],ymm4[15]
+; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm1, %ymm4
+; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm12
+; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm1
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm1[0],xmm12[1],xmm1[2,3],xmm12[4],xmm1[5,6],xmm12[7]
+; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm10, %xmm2
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2
+; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm16, %zmm4
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm19
; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm2
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm22 ^ (ymm2 & (ymm21 ^ ymm22))
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3,4],ymm6[5],ymm2[6,7,8,9],ymm6[10],ymm2[11,12],ymm6[13],ymm2[14,15]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
-; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm2
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0,1],xmm8[2],xmm4[3,4],xmm8[5],xmm4[6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm8, %xmm25
-; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm26
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
-; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm7
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm2[5,6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512-FCP-NEXT: vmovdqa %ymm13, %ymm2
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm20 ^ (ymm2 & (ymm18 ^ ymm20))
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
-; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2
-; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm8
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1],xmm3[2],xmm8[3,4],xmm3[5],xmm8[6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm27
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm5 ^ (ymm2 & (ymm18 ^ ymm5))
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
+; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1],xmm12[2],xmm1[3,4],xmm12[5],xmm1[6,7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
+; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm4
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm2[5,6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512-FCP-NEXT: vmovdqa %ymm7, %ymm2
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm24 ^ (ymm2 & (ymm11 ^ ymm24))
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm8[0,1],xmm15[2],xmm8[3,4],xmm15[5],xmm8[6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm8, %xmm30
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
-; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm14, %xmm14
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm2[0,1,2],ymm14[3,4,5,6,7],ymm2[8,9,10],ymm14[11,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,7,4]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm19
+; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm13, %xmm13
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm8 = [18,19,24,25,30,31,0,0,18,19,24,25,30,31,0,0,18,19,24,25,30,31,0,0,18,19,24,25,30,31,0,0]
+; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm9
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1,2],ymm13[3,4,5,6,7],ymm9[8,9,10],ymm13[11,12,13,14,15]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm6[2],xmm2[3,4],xmm6[5],xmm2[6,7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
+; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm2
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm16
; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm2
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm11 ^ (ymm2 & (ymm23 ^ ymm11))
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7,8,9],ymm7[10],ymm2[11,12],ymm7[13],ymm2[14,15]
-; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm2
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0,1],xmm15[2],xmm5[3,4],xmm15[5],xmm5[6,7]
-; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm6
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm2[5,6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa %ymm13, %ymm6
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm24 ^ (ymm6 & (ymm12 ^ ymm24))
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15]
-; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm4
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1],xmm1[2],xmm10[3,4],xmm1[5],xmm10[6,7]
-; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm3
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm23 ^ (ymm2 & (ymm22 ^ ymm23))
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15]
+; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2
+; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm13
+; AVX512-FCP-NEXT: vmovdqa64 %xmm28, %xmm9
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm13[0,1],xmm9[2],xmm13[3,4],xmm9[5],xmm13[6,7]
+; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm4
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm2[5,6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm25, %xmm14
+; AVX512-FCP-NEXT: vmovdqa64 %xmm26, %xmm10
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0,1],xmm14[2],xmm10[3,4],xmm14[5],xmm10[6,7]
+; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX512-FCP-NEXT: vmovdqa %ymm7, %ymm4
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm21 ^ (ymm4 & (ymm20 ^ ymm21))
+; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm8
; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0,1,2],ymm3[3,4,5,6,7],ymm8[8,9,10],ymm3[11,12,13,14,15]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm29, %xmm8
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm8[2],xmm4[3,4],xmm8[5],xmm4[6,7]
+; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm4, %xmm4
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm23 ^ (ymm13 & (ymm11 ^ ymm23))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm18 ^ (ymm7 & (ymm5 ^ ymm18))
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm5[2,3,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1,2],ymm3[3],ymm5[4,5],ymm3[6],ymm5[7],ymm3[8],ymm5[9,10],ymm3[11],ymm5[12,13],ymm3[14],ymm5[15]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
+; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm3
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm12[0,1],xmm1[2],xmm12[3,4],xmm1[5],xmm12[6,7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
+; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm1
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm3[5,6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm24 ^ (ymm0 & (ymm11 ^ ymm24))
; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm11[2,3,0,1]
; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm11[1,2],ymm3[3],ymm11[4,5],ymm3[6],ymm11[7],ymm3[8],ymm11[9,10],ymm3[11],ymm11[12,13],ymm3[14],ymm11[15]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
-; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1],xmm5[2],xmm15[3,4],xmm5[5],xmm15[6,7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
-; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm24 ^ (ymm0 & (ymm12 ^ ymm24))
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm12[2,3,0,1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm12[1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7],ymm4[8],ymm12[9,10],ymm4[11],ymm12[12,13],ymm4[14],ymm12[15]
-; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm4
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm10[1],xmm1[2,3],xmm10[4],xmm1[5,6],xmm10[7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
-; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm21 ^ (ymm13 & (ymm22 ^ ymm21))
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm13[2,3,0,1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm13[1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7],ymm3[8],ymm13[9,10],ymm3[11],ymm13[12,13],ymm3[14],ymm13[15]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm25, %xmm4
-; AVX512-FCP-NEXT: vmovdqa64 %xmm26, %xmm7
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7]
-; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4
-; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3
+; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm3
+; AVX512-FCP-NEXT: vmovdqa64 %xmm30, %xmm6
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm15[0],xmm6[1],xmm15[2,3],xmm6[4],xmm15[5,6],xmm6[7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
+; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm6, %xmm6
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm22 ^ (ymm7 & (ymm23 ^ ymm22))
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm7[2,3,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm7[1,2],ymm3[3],ymm7[4,5],ymm3[6],ymm7[7],ymm3[8],ymm7[9,10],ymm3[11],ymm7[12,13],ymm3[14],ymm7[15]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1],xmm13[2],xmm9[3,4],xmm13[5],xmm9[6,7]
+; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm4
+; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm3
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm20 ^ (ymm0 & (ymm18 ^ ymm20))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm21 ^ (ymm0 & (ymm20 ^ ymm21))
; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1]
; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7],ymm4[8],ymm0[9,10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15]
-; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm4
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6],xmm8[7]
-; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm4, %xmm4
+; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm14[0],xmm10[1],xmm14[2,3],xmm10[4],xmm14[5,6],xmm10[7]
+; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm4, %xmm4
; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7]
; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
-; AVX512-FCP-NEXT: vmovdqa64 %zmm17, (%rsi)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 64(%rsi)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 64(%rdx)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rdx)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm19, (%rsi)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 64(%rsi)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 64(%rdx)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm16, (%rdx)
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 64(%rcx)
; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%rcx)
; AVX512-FCP-NEXT: vzeroupper
@@ -3753,164 +3808,170 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
;
; AVX512DQ-LABEL: load_i16_stride3_vf64:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
-; AVX512DQ-NEXT: vmovdqa64 224(%rdi), %ymm18
-; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %ymm20
-; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm18 ^ (ymm1 & (ymm20 ^ ymm18))
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
+; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %ymm18
+; AVX512DQ-NEXT: vmovdqa64 352(%rdi), %ymm21
+; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm1
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm18 ^ (ymm1 & (ymm21 ^ ymm18))
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
-; AVX512DQ-NEXT: vpshufb %ymm7, %ymm2, %ymm5
-; AVX512DQ-NEXT: vmovdqa 272(%rdi), %xmm1
-; AVX512DQ-NEXT: vmovdqa 256(%rdi), %xmm3
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7]
-; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm19
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
-; AVX512DQ-NEXT: vpshufb %xmm13, %xmm6, %xmm6
-; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %ymm21
-; AVX512DQ-NEXT: vmovdqa64 352(%rdi), %ymm22
-; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm8
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm21 ^ (ymm8 & (ymm22 ^ ymm21))
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5,6],ymm9[7],ymm8[8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13,14],ymm9[15]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27]
-; AVX512DQ-NEXT: vpshufb %ymm10, %ymm8, %ymm11
-; AVX512DQ-NEXT: vmovdqa 304(%rdi), %xmm8
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27]
+; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm2
+; AVX512DQ-NEXT: vmovdqa 304(%rdi), %xmm0
; AVX512DQ-NEXT: vmovdqa 288(%rdi), %xmm4
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6],xmm8[7]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15]
-; AVX512DQ-NEXT: vpshufb %xmm14, %xmm12, %xmm12
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm5, %zmm16
-; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %ymm23
-; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm11
-; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm5
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm23 ^ (ymm5 & (ymm11 ^ ymm23))
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm5[2,3,0,1]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3],ymm12[4],ymm5[5,6],ymm12[7],ymm5[8],ymm12[9],ymm5[10,11],ymm12[12],ymm5[13,14],ymm12[15]
-; AVX512DQ-NEXT: vpshufb %ymm10, %ymm5, %ymm10
-; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm15
-; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm5
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm5[0],xmm15[1],xmm5[2,3],xmm15[4],xmm5[5,6],xmm15[7]
-; AVX512DQ-NEXT: vpshufb %xmm14, %xmm12, %xmm12
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2],ymm10[3,4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa64 (%rdi), %ymm24
-; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm12
-; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm10
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = ymm12 ^ (ymm10 & (ymm24 ^ ymm12))
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm10[2,3,0,1]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm1[1],ymm10[2,3],ymm1[4],ymm10[5,6],ymm1[7],ymm10[8],ymm1[9],ymm10[10,11],ymm1[12],ymm10[13,14],ymm1[15]
-; AVX512DQ-NEXT: vpshufb %ymm7, %ymm1, %ymm7
-; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm10
-; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm1
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm10[2],xmm1[3,4],xmm10[5],xmm1[6,7]
-; AVX512DQ-NEXT: vpshufb %xmm13, %xmm2, %xmm2
-; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1,2],ymm2[3,4,5,6,7],ymm7[8,9,10],ymm2[11,12,13,14,15]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,4,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm17
-; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm2
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm22 ^ (ymm2 & (ymm21 ^ ymm22))
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3,4],ymm6[5],ymm2[6,7,8,9],ymm6[10],ymm2[11,12],ymm6[13],ymm2[14,15]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
-; AVX512DQ-NEXT: vpshufb %ymm9, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0,1],xmm8[2],xmm4[3,4],xmm8[5],xmm4[6,7]
-; AVX512DQ-NEXT: vmovdqa64 %xmm8, %xmm25
-; AVX512DQ-NEXT: vmovdqa64 %xmm4, %xmm26
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
-; AVX512DQ-NEXT: vpshufb %xmm6, %xmm7, %xmm7
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm2[5,6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm0[1],xmm4[2,3],xmm0[4],xmm4[5,6],xmm0[7]
+; AVX512DQ-NEXT: vmovdqa64 %xmm4, %xmm25
+; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm26
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15]
+; AVX512DQ-NEXT: vpshufb %xmm10, %xmm3, %xmm3
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm2[3,4,5,6,7]
+; AVX512DQ-NEXT: vmovdqa64 224(%rdi), %ymm22
+; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %ymm23
+; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm2
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm22 ^ (ymm2 & (ymm23 ^ ymm22))
+; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm6 = [16,17,22,23,28,29,0,0,16,17,22,23,28,29,0,0,16,17,22,23,28,29,0,0,16,17,22,23,28,29,0,0]
+; AVX512DQ-NEXT: vpshufb %ymm6, %ymm2, %ymm9
+; AVX512DQ-NEXT: vmovdqa 272(%rdi), %xmm0
+; AVX512DQ-NEXT: vmovdqa 256(%rdi), %xmm4
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm4[0,1],xmm0[2],xmm4[3,4],xmm0[5],xmm4[6,7]
+; AVX512DQ-NEXT: vmovdqa64 %xmm4, %xmm27
+; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm28
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm4 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
+; AVX512DQ-NEXT: vpshufb %xmm4, %xmm11, %xmm11
+; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1,2],ymm11[3,4,5,6,7],ymm9[8,9,10],ymm11[11,12,13,14,15]
+; AVX512DQ-NEXT: vmovdqa 208(%rdi), %xmm0
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0],xmm0[1],xmm2[2,3],xmm0[4],xmm2[5,6],xmm0[7]
+; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm29
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11]
+; AVX512DQ-NEXT: vpshufb %xmm2, %xmm12, %xmm12
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm9, %zmm16
+; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %ymm24
+; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm12
+; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm3
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm24 ^ (ymm3 & (ymm12 ^ ymm24))
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm3[2,3,0,1]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm9[1],ymm3[2,3],ymm9[4],ymm3[5,6],ymm9[7],ymm3[8],ymm9[9],ymm3[10,11],ymm9[12],ymm3[13,14],ymm9[15]
+; AVX512DQ-NEXT: vpshufb %ymm1, %ymm3, %ymm1
+; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm7
+; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm14
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm14[0],xmm7[1],xmm14[2,3],xmm7[4],xmm14[5,6],xmm7[7]
+; AVX512DQ-NEXT: vpshufb %xmm10, %xmm3, %xmm3
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm1[3,4,5,6,7]
+; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm15
+; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm10
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0,1],xmm15[2],xmm10[3,4],xmm15[5],xmm10[6,7]
+; AVX512DQ-NEXT: vpshufb %xmm4, %xmm1, %xmm4
+; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX512DQ-NEXT: vmovdqa64 (%rdi), %ymm19
+; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm0
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm1 ^ (ymm0 & (ymm19 ^ ymm1))
+; AVX512DQ-NEXT: vpshufb %ymm6, %ymm0, %ymm6
+; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2],ymm4[3,4,5,6,7],ymm6[8,9,10],ymm4[11,12,13,14,15]
+; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm4
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3],xmm4[4],xmm0[5,6],xmm4[7]
+; AVX512DQ-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm17
+; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm0
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm24 ^ ymm12))
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7,8,9],ymm2[10],ymm0[11,12],ymm2[13],ymm0[14,15]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
+; AVX512DQ-NEXT: vpshufb %ymm8, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0,1],xmm7[2],xmm14[3,4],xmm7[5],xmm14[6,7]
+; AVX512DQ-NEXT: vmovdqa64 %xmm7, %xmm30
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm9 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
+; AVX512DQ-NEXT: vpshufb %xmm9, %xmm2, %xmm2
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm0[5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
; AVX512DQ-NEXT: vmovdqa %ymm13, %ymm2
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm20 ^ (ymm2 & (ymm18 ^ ymm20))
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
-; AVX512DQ-NEXT: vpshufb %ymm4, %ymm2, %ymm2
-; AVX512DQ-NEXT: vmovdqa64 %xmm19, %xmm8
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1],xmm3[2],xmm8[3,4],xmm3[5],xmm8[6,7]
-; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm27
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
-; AVX512DQ-NEXT: vpshufb %xmm3, %xmm14, %xmm14
-; AVX512DQ-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm14 = ymm2[0,1,2],ymm14[3,4,5,6,7],ymm2[8,9,10],ymm14[11,12,13,14,15]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,7,4]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm19
-; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm2
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm11 ^ (ymm2 & (ymm23 ^ ymm11))
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7,8,9],ymm7[10],ymm2[11,12],ymm7[13],ymm2[14,15]
-; AVX512DQ-NEXT: vpshufb %ymm9, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0,1],xmm15[2],xmm5[3,4],xmm15[5],xmm5[6,7]
-; AVX512DQ-NEXT: vpshufb %xmm6, %xmm7, %xmm6
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm2[5,6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa %ymm13, %ymm6
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm24 ^ (ymm6 & (ymm12 ^ ymm24))
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15]
-; AVX512DQ-NEXT: vpshufb %ymm4, %ymm6, %ymm4
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1],xmm1[2],xmm10[3,4],xmm1[5],xmm10[6,7]
-; AVX512DQ-NEXT: vpshufb %xmm3, %xmm6, %xmm3
-; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = ymm23 ^ (ymm13 & (ymm11 ^ ymm23))
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm11[2,3,0,1]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm11[1,2],ymm3[3],ymm11[4,5],ymm3[6],ymm11[7],ymm3[8],ymm11[9,10],ymm3[11],ymm11[12,13],ymm3[14],ymm11[15]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm19 ^ (ymm2 & (ymm1 ^ ymm19))
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1],xmm10[2],xmm15[3,4],xmm10[5],xmm15[6,7]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
+; AVX512DQ-NEXT: vpshufb %xmm6, %xmm0, %xmm0
+; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm7 = [18,19,24,25,30,31,0,0,18,19,24,25,30,31,0,0,18,19,24,25,30,31,0,0,18,19,24,25,30,31,0,0]
+; AVX512DQ-NEXT: vpshufb %ymm7, %ymm2, %ymm11
+; AVX512DQ-NEXT: vmovdqa64 %ymm7, %ymm31
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7],ymm11[8,9,10],ymm0[11,12,13,14,15]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3,4],xmm4[5],xmm2[6,7]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm11 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
+; AVX512DQ-NEXT: vpshufb %xmm11, %xmm2, %xmm2
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm20
+; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm0
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm21 ^ (ymm0 & (ymm18 ^ ymm21))
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7,8,9],ymm2[10],ymm0[11,12],ymm2[13],ymm0[14,15]
+; AVX512DQ-NEXT: vpshufb %ymm8, %ymm0, %ymm0
+; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm7
+; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm4
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1],xmm4[2],xmm7[3,4],xmm4[5],xmm7[6,7]
+; AVX512DQ-NEXT: vpshufb %xmm9, %xmm2, %xmm2
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm0[5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-NEXT: vmovdqa64 %xmm27, %xmm9
+; AVX512DQ-NEXT: vmovdqa64 %xmm28, %xmm8
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0,1],xmm9[2],xmm8[3,4],xmm9[5],xmm8[6,7]
+; AVX512DQ-NEXT: vpshufb %xmm6, %xmm2, %xmm2
+; AVX512DQ-NEXT: vmovdqa %ymm13, %ymm3
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm23 ^ (ymm3 & (ymm22 ^ ymm23))
+; AVX512DQ-NEXT: vmovdqa64 %ymm31, %ymm6
+; AVX512DQ-NEXT: vpshufb %ymm6, %ymm3, %ymm6
+; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3,4,5,6,7],ymm6[8,9,10],ymm2[11,12,13,14,15]
+; AVX512DQ-NEXT: vmovdqa64 %xmm29, %xmm6
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm6[2],xmm3[3,4],xmm6[5],xmm3[6,7]
+; AVX512DQ-NEXT: vpshufb %xmm11, %xmm3, %xmm3
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = ymm24 ^ (ymm13 & (ymm12 ^ ymm24))
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm12[2,3,0,1]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm12[1,2],ymm2[3],ymm12[4,5],ymm2[6],ymm12[7],ymm2[8],ymm12[9,10],ymm2[11],ymm12[12,13],ymm2[14],ymm12[15]
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
-; AVX512DQ-NEXT: vpshufb %ymm11, %ymm3, %ymm3
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1],xmm5[2],xmm15[3,4],xmm5[5],xmm15[6,7]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
-; AVX512DQ-NEXT: vpshufb %xmm5, %xmm4, %xmm4
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = ymm24 ^ (ymm0 & (ymm12 ^ ymm24))
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm12[2,3,0,1]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm12[1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7],ymm4[8],ymm12[9,10],ymm4[11],ymm12[12,13],ymm4[14],ymm12[15]
-; AVX512DQ-NEXT: vpshufb %ymm11, %ymm4, %ymm4
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm10[1],xmm1[2,3],xmm10[4],xmm1[5,6],xmm10[7]
+; AVX512DQ-NEXT: vpshufb %ymm11, %ymm2, %ymm2
+; AVX512DQ-NEXT: vmovdqa64 %xmm30, %xmm3
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm14[2],xmm3[3,4],xmm14[5],xmm3[6,7]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm12 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
+; AVX512DQ-NEXT: vpshufb %xmm12, %xmm3, %xmm3
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm2[5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm19 ^ (ymm5 & (ymm1 ^ ymm19))
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7],ymm3[8],ymm1[9,10],ymm3[11],ymm1[12,13],ymm3[14],ymm1[15]
+; AVX512DQ-NEXT: vpshufb %ymm11, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm10[0],xmm15[1],xmm10[2,3],xmm15[4],xmm10[5,6],xmm15[7]
; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
-; AVX512DQ-NEXT: vpshufb %xmm6, %xmm1, %xmm1
-; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm13 = ymm21 ^ (ymm13 & (ymm22 ^ ymm21))
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm13[2,3,0,1]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm13[1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7],ymm3[8],ymm13[9,10],ymm3[11],ymm13[12,13],ymm3[14],ymm13[15]
-; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm4
-; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm7
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7]
-; AVX512DQ-NEXT: vpshufb %xmm5, %xmm4, %xmm4
+; AVX512DQ-NEXT: vpshufb %xmm6, %xmm3, %xmm3
+; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm13 = ymm18 ^ (ymm13 & (ymm21 ^ ymm18))
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm13[2,3,0,1]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm13[1,2],ymm2[3],ymm13[4,5],ymm2[6],ymm13[7],ymm2[8],ymm13[9,10],ymm2[11],ymm13[12,13],ymm2[14],ymm13[15]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7]
+; AVX512DQ-NEXT: vpshufb %xmm12, %xmm3, %xmm3
+; AVX512DQ-NEXT: vpshufb %ymm11, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm2[5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm23 ^ (ymm5 & (ymm22 ^ ymm23))
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm5[2,3,0,1]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1,2],ymm3[3],ymm5[4,5],ymm3[6],ymm5[7],ymm3[8],ymm5[9,10],ymm3[11],ymm5[12,13],ymm3[14],ymm5[15]
; AVX512DQ-NEXT: vpshufb %ymm11, %ymm3, %ymm3
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm20 ^ (ymm0 & (ymm18 ^ ymm20))
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7],ymm4[8],ymm0[9,10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15]
-; AVX512DQ-NEXT: vpshufb %ymm11, %ymm0, %ymm0
-; AVX512DQ-NEXT: vmovdqa64 %xmm27, %xmm4
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6],xmm8[7]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm9[0],xmm8[1],xmm9[2,3],xmm8[4],xmm9[5,6],xmm8[7]
; AVX512DQ-NEXT: vpshufb %xmm6, %xmm4, %xmm4
; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
; AVX512DQ-NEXT: vmovdqa64 %zmm17, (%rsi)
; AVX512DQ-NEXT: vmovdqa64 %zmm16, 64(%rsi)
-; AVX512DQ-NEXT: vmovdqa64 %zmm19, 64(%rdx)
-; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rdx)
-; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rcx)
+; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rdx)
+; AVX512DQ-NEXT: vmovdqa64 %zmm20, (%rdx)
+; AVX512DQ-NEXT: vmovdqa64 %zmm2, 64(%rcx)
; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rcx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
@@ -3918,162 +3979,169 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FCP-LABEL: load_i16_stride3_vf64:
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
-; AVX512DQ-FCP-NEXT: vmovdqa64 224(%rdi), %ymm18
-; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm20
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm18 ^ (ymm1 & (ymm20 ^ ymm18))
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7],ymm1[8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14],ymm2[15]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm5
-; AVX512DQ-FCP-NEXT: vmovdqa 272(%rdi), %xmm1
-; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %xmm3
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm19
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm6, %xmm6
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,4,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %ymm21
-; AVX512DQ-FCP-NEXT: vmovdqa64 352(%rdi), %ymm22
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm8
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm21 ^ (ymm8 & (ymm22 ^ ymm21))
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5,6],ymm9[7],ymm8[8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13,14],ymm9[15]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm11
-; AVX512DQ-FCP-NEXT: vmovdqa 304(%rdi), %xmm8
+; AVX512DQ-FCP-NEXT: vmovdqa64 224(%rdi), %ymm20
+; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm21
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm2
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm20 ^ (ymm2 & (ymm21 ^ ymm20))
+; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm1 = [16,17,22,23,28,29,0,0,16,17,22,23,28,29,0,0,16,17,22,23,28,29,0,0,16,17,22,23,28,29,0,0]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm3
+; AVX512DQ-FCP-NEXT: vmovdqa 272(%rdi), %xmm4
+; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %xmm6
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm4[2],xmm6[3,4],xmm4[5],xmm6[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm6, %xmm25
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm26
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,5,14,15,0,1,2,3,8,9,14,15,4,5,10,11]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm5
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3,4,5,6,7],ymm3[8,9,10],ymm5[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vmovdqa 208(%rdi), %xmm4
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5,6],xmm4[7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm29
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %ymm22
+; AVX512DQ-FCP-NEXT: vmovdqa64 352(%rdi), %ymm23
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm2
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm22 ^ (ymm2 & (ymm23 ^ ymm22))
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm2[2,3,0,1]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm8[1],ymm2[2,3],ymm8[4],ymm2[5,6],ymm8[7],ymm2[8],ymm8[9],ymm2[10,11],ymm8[12],ymm2[13,14],ymm8[15]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,6,7,12,13,2,3,8,9,14,15,4,5,10,11,16,17,22,23,28,29,18,19,24,25,30,31,20,21,26,27]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm11
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [24,25,26,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa 304(%rdi), %xmm2
; AVX512DQ-FCP-NEXT: vmovdqa 288(%rdi), %xmm4
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6],xmm8[7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm12
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm5, %zmm16
-; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %ymm23
-; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm11
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm5
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm23 ^ (ymm5 & (ymm11 ^ ymm23))
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm5[2,3,0,1]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3],ymm12[4],ymm5[5,6],ymm12[7],ymm5[8],ymm12[9],ymm5[10,11],ymm12[12],ymm5[13,14],ymm12[15]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm5, %ymm10
-; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm15
-; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm5
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm5[0],xmm15[1],xmm5[2,3],xmm15[4],xmm5[5,6],xmm15[7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm12
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2],ymm10[3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm4[0],xmm2[1],xmm4[2,3],xmm2[4],xmm4[5,6],xmm2[7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm27
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm28
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm13, %xmm13
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm13, %zmm0, %zmm13
+; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm16, %zmm11
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm3, %zmm17
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm11
; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %ymm24
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm12
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm10
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm12 ^ (ymm10 & (ymm24 ^ ymm12))
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm10[2,3,0,1]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm1[1],ymm10[2,3],ymm1[4],ymm10[5,6],ymm1[7],ymm10[8],ymm1[9],ymm10[10,11],ymm1[12],ymm10[13,14],ymm1[15]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm7
-; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm10
-; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm1
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm10[2],xmm1[3,4],xmm10[5],xmm1[6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1,2],ymm2[3,4,5,6,7],ymm7[8,9,10],ymm2[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,4,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm2, %zmm17
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm3
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm11 ^ (ymm3 & (ymm24 ^ ymm11))
+; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm8
+; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm15
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1],xmm8[2],xmm15[3,4],xmm8[5],xmm15[6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm4, %xmm4
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3,4,5,6,7],ymm1[8,9,10],ymm4[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm6
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3],xmm6[4],xmm3[5,6],xmm6[7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm3
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %ymm18
+; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm5
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm1
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm18 ^ (ymm1 & (ymm5 ^ ymm18))
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3],ymm4[4],ymm1[5,6],ymm4[7],ymm1[8],ymm4[9],ymm1[10,11],ymm4[12],ymm1[13,14],ymm4[15]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm1, %ymm4
+; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm12
+; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm1
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm1[0],xmm12[1],xmm1[2,3],xmm12[4],xmm1[5,6],xmm12[7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm10, %xmm2
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2
+; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm16, %zmm4
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm19
; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm2
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm22 ^ (ymm2 & (ymm21 ^ ymm22))
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3,4],ymm6[5],ymm2[6,7,8,9],ymm6[10],ymm2[11,12],ymm6[13],ymm2[14,15]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0,1],xmm8[2],xmm4[3,4],xmm8[5],xmm4[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm8, %xmm25
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm26
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm7
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm2[5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm13, %ymm2
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm20 ^ (ymm2 & (ymm18 ^ ymm20))
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,8,9,14,15,4,5,12,13,10,11,0,1,6,7,18,19,24,25,30,31,20,21,28,29,26,27,16,17,22,23]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm8
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1],xmm3[2],xmm8[3,4],xmm3[5],xmm8[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm27
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm5 ^ (ymm2 & (ymm18 ^ ymm5))
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13,18,19,24,25,30,31,20,21,26,27,16,17,22,23,28,29]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1],xmm12[2],xmm1[3,4],xmm12[5],xmm1[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [2,3,8,9,14,15,4,5,10,11,10,11,10,11,10,11]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm4
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm2[5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, %ymm2
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm24 ^ (ymm2 & (ymm11 ^ ymm24))
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm8[0,1],xmm15[2],xmm8[3,4],xmm15[5],xmm8[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm8, %xmm30
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm14, %xmm14
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm2[0,1,2],ymm14[3,4,5,6,7],ymm2[8,9,10],ymm14[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,7,4]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm19
+; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm13, %xmm13
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm8 = [18,19,24,25,30,31,0,0,18,19,24,25,30,31,0,0,18,19,24,25,30,31,0,0,18,19,24,25,30,31,0,0]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm9
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1,2],ymm13[3,4,5,6,7],ymm9[8,9,10],ymm13[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm6[2],xmm2[3,4],xmm6[5],xmm2[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm16
; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm2
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm11 ^ (ymm2 & (ymm23 ^ ymm11))
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm2[2,3,0,1]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7,8,9],ymm7[10],ymm2[11,12],ymm7[13],ymm2[14,15]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0,1],xmm15[2],xmm5[3,4],xmm15[5],xmm5[6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm6
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm2[5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm13, %ymm6
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm24 ^ (ymm6 & (ymm12 ^ ymm24))
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm4
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1],xmm1[2],xmm10[3,4],xmm1[5],xmm10[6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm3
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm23 ^ (ymm2 & (ymm22 ^ ymm23))
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm13
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm28, %xmm9
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm13[0,1],xmm9[2],xmm13[3,4],xmm9[5],xmm13[6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm4
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm2[5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm25, %xmm14
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm26, %xmm10
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0,1],xmm14[2],xmm10[3,4],xmm14[5],xmm10[6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, %ymm4
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm21 ^ (ymm4 & (ymm20 ^ ymm21))
+; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm8
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,4]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0,1,2],ymm3[3,4,5,6,7],ymm8[8,9,10],ymm3[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm29, %xmm8
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm8[2],xmm4[3,4],xmm8[5],xmm4[6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm4, %xmm4
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm23 ^ (ymm13 & (ymm11 ^ ymm23))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm18 ^ (ymm7 & (ymm5 ^ ymm18))
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm5[2,3,0,1]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1,2],ymm3[3],ymm5[4,5],ymm3[6],ymm5[7],ymm3[8],ymm5[9,10],ymm3[11],ymm5[12,13],ymm3[14],ymm5[15]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm3
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm12[0,1],xmm1[2],xmm12[3,4],xmm1[5],xmm12[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm3[5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm24 ^ (ymm0 & (ymm11 ^ ymm24))
; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm11[2,3,0,1]
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm11[1,2],ymm3[3],ymm11[4,5],ymm3[6],ymm11[7],ymm3[8],ymm11[9,10],ymm3[11],ymm11[12,13],ymm3[14],ymm11[15]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,16,17,22,23,28,29,18,19,24,25,30,31]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1],xmm5[2],xmm15[3,4],xmm5[5],xmm15[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm24 ^ (ymm0 & (ymm12 ^ ymm24))
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm12[2,3,0,1]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm12[1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7],ymm4[8],ymm12[9,10],ymm4[11],ymm12[12,13],ymm4[14],ymm12[15]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm4
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm10[1],xmm1[2,3],xmm10[4],xmm1[5,6],xmm10[7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm21 ^ (ymm13 & (ymm22 ^ ymm21))
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm13[2,3,0,1]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm13[1,2],ymm3[3],ymm13[4,5],ymm3[6],ymm13[7],ymm3[8],ymm13[9,10],ymm3[11],ymm13[12,13],ymm3[14],ymm13[15]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm25, %xmm4
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm26, %xmm7
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4
-; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3
+; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm3
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm30, %xmm6
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm15[0],xmm6[1],xmm15[2,3],xmm6[4],xmm15[5,6],xmm6[7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,0,1,6,7,12,13,2,3,8,9,14,15]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm6, %xmm6
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm22 ^ (ymm7 & (ymm23 ^ ymm22))
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm7[2,3,0,1]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm7[1,2],ymm3[3],ymm7[4,5],ymm3[6],ymm7[7],ymm3[8],ymm7[9,10],ymm3[11],ymm7[12,13],ymm3[14],ymm7[15]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1],xmm13[2],xmm9[3,4],xmm13[5],xmm9[6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm4
+; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm3
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm20 ^ (ymm0 & (ymm18 ^ ymm20))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm21 ^ (ymm0 & (ymm20 ^ ymm21))
; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1]
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7],ymm4[8],ymm0[9,10],ymm4[11],ymm0[12,13],ymm4[14],ymm0[15]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm4
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm8[1],xmm4[2,3],xmm8[4],xmm4[5,6],xmm8[7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm4, %xmm4
+; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm14[0],xmm10[1],xmm14[2,3],xmm10[4],xmm14[5,6],xmm10[7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm4, %xmm4
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7]
; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 64(%rsi)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 64(%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, (%rsi)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 64(%rsi)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 64(%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, (%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 64(%rcx)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%rcx)
; AVX512DQ-FCP-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll
index ae4f85ce42a19..c21bc6927680f 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll
@@ -1312,15 +1312,15 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm3
; AVX512-NEXT: vpmovqw %zmm3, %xmm6
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT: vmovdqa 64(%rdi), %xmm6
-; AVX512-NEXT: vmovdqa 80(%rdi), %xmm7
+; AVX512-NEXT: vmovdqa 80(%rdi), %xmm6
; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7]
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[0,2,2,3]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[0,2,2,3]
; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[0,2,2,3]
+; AVX512-NEXT: vmovdqa 64(%rdi), %xmm7
+; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[0,2,2,3]
; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7]
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1]
; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
@@ -1334,10 +1334,10 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm4[0,1,2,0,4,5,6,7]
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1]
; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[2,0,2,3,4,5,6,7]
; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm6[2,0,2,3,4,5,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm6[2,0,2,3,4,5,6,7]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm7[2,0,2,3,4,5,6,7]
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7]
@@ -1348,8 +1348,8 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7]
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[3,1,2,3,4,5,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm6[3,1,2,3,4,5,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[3,1,2,3,4,5,6,7]
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
@@ -1365,44 +1365,46 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
;
; AVX512-FCP-LABEL: load_i16_stride4_vf16:
; AVX512-FCP: # %bb.0:
-; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0]
-; AVX512-FCP-NEXT: # ymm0 = mem[0,1,0,1]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,4,5,u,u,u,u,8,9,12,13,12,13,14,15,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u]
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6]
-; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm2
-; AVX512-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm3
-; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm4
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,4,5,u,u,u,u,8,9,12,13,12,13,14,15,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u]
-; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm6
-; AVX512-FCP-NEXT: vpermd %ymm6, %ymm1, %ymm1
-; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm7
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,2,2,3,4,6,12,14]
-; AVX512-FCP-NEXT: vpermt2d %ymm4, %ymm8, %ymm7
-; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm4
-; AVX512-FCP-NEXT: vpmovqw %zmm4, %xmm9
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
-; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm3
-; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm1
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
-; AVX512-FCP-NEXT: vpsrlq $16, %zmm4, %zmm3
-; AVX512-FCP-NEXT: vpmovqw %zmm3, %xmm3
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,3,2,3,1,3,5,7]
-; AVX512-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm2
+; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm2
+; AVX512-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm1
+; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm3
+; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0]
+; AVX512-FCP-NEXT: # ymm4 = mem[0,1,0,1]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [8,10,10,11,8,10,12,14]
+; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm6
+; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm7
+; AVX512-FCP-NEXT: vpermd %zmm7, %zmm5, %zmm5
+; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm8
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,2,2,3,4,6,12,14]
+; AVX512-FCP-NEXT: vpermt2d %ymm8, %ymm9, %ymm3
+; AVX512-FCP-NEXT: vpmovqw %zmm6, %xmm8
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
+; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm1, %ymm1
+; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm5, %ymm5
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7]
+; AVX512-FCP-NEXT: vpsrlq $16, %zmm6, %zmm5
+; AVX512-FCP-NEXT: vpmovqw %zmm5, %xmm5
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,3,2,3,1,3,5,7]
+; AVX512-FCP-NEXT: vpermd %ymm2, %ymm5, %ymm2
; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm0
-; AVX512-FCP-NEXT: vpermd %ymm6, %ymm3, %ymm3
-; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm5
-; AVX512-FCP-NEXT: vpermt2d %ymm0, %ymm8, %ymm5
-; AVX512-FCP-NEXT: vpsrlq $32, %zmm4, %zmm0
-; AVX512-FCP-NEXT: vpmovqw %zmm0, %xmm0
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
-; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm2
-; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm3
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
-; AVX512-FCP-NEXT: vpsrlq $48, %zmm4, %zmm3
-; AVX512-FCP-NEXT: vpmovqw %zmm3, %xmm3
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa %ymm7, (%rsi)
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [9,11,10,11,9,11,13,15]
+; AVX512-FCP-NEXT: vpermd %zmm7, %zmm5, %zmm5
+; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm4
+; AVX512-FCP-NEXT: vpermt2d %ymm4, %ymm9, %ymm0
+; AVX512-FCP-NEXT: vpsrlq $32, %zmm6, %zmm4
+; AVX512-FCP-NEXT: vpmovqw %zmm4, %xmm4
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm2
+; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm5, %ymm4
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
+; AVX512-FCP-NEXT: vpsrlq $48, %zmm6, %zmm4
+; AVX512-FCP-NEXT: vpmovqw %zmm4, %xmm4
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa %ymm3, (%rsi)
; AVX512-FCP-NEXT: vmovdqa %ymm1, (%rdx)
; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512-FCP-NEXT: vmovdqa %ymm2, (%r8)
@@ -1426,15 +1428,15 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm3
; AVX512DQ-NEXT: vpmovqw %zmm3, %xmm6
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm6
-; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm7
+; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm6
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7]
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[0,2,2,3]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[0,2,2,3]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[0,2,2,3]
+; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm7
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[0,2,2,3]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7]
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1]
; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
@@ -1448,10 +1450,10 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm4[0,1,2,0,4,5,6,7]
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1]
; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[2,0,2,3,4,5,6,7]
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm6[2,0,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm6[2,0,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm7[2,0,2,3,4,5,6,7]
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7]
@@ -1462,8 +1464,8 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7]
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[3,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm6[3,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[3,1,2,3,4,5,6,7]
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
@@ -1479,44 +1481,46 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
;
; AVX512DQ-FCP-LABEL: load_i16_stride4_vf16:
; AVX512DQ-FCP: # %bb.0:
-; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0]
-; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,4,5,u,u,u,u,8,9,12,13,12,13,14,15,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6]
-; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm2
-; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm3
-; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm4
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,4,5,u,u,u,u,8,9,12,13,12,13,14,15,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm6
-; AVX512DQ-FCP-NEXT: vpermd %ymm6, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm7
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,2,2,3,4,6,12,14]
-; AVX512DQ-FCP-NEXT: vpermt2d %ymm4, %ymm8, %ymm7
-; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm4
-; AVX512DQ-FCP-NEXT: vpmovqw %zmm4, %xmm9
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm3
-; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
-; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm4, %zmm3
-; AVX512DQ-FCP-NEXT: vpmovqw %zmm3, %xmm3
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,3,2,3,1,3,5,7]
-; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm2
+; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm2
+; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm3
+; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0]
+; AVX512DQ-FCP-NEXT: # ymm4 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [8,10,10,11,8,10,12,14]
+; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm6
+; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm7
+; AVX512DQ-FCP-NEXT: vpermd %zmm7, %zmm5, %zmm5
+; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm8
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,2,2,3,4,6,12,14]
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm8, %ymm9, %ymm3
+; AVX512DQ-FCP-NEXT: vpmovqw %zmm6, %xmm8
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm5, %ymm5
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7]
+; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm6, %zmm5
+; AVX512DQ-FCP-NEXT: vpmovqw %zmm5, %xmm5
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,3,2,3,1,3,5,7]
+; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm5, %ymm2
; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm0
-; AVX512DQ-FCP-NEXT: vpermd %ymm6, %ymm3, %ymm3
-; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm5
-; AVX512DQ-FCP-NEXT: vpermt2d %ymm0, %ymm8, %ymm5
-; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm4, %zmm0
-; AVX512DQ-FCP-NEXT: vpmovqw %zmm0, %xmm0
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm3
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm4, %zmm3
-; AVX512DQ-FCP-NEXT: vpmovqw %zmm3, %xmm3
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, (%rsi)
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [9,11,10,11,9,11,13,15]
+; AVX512DQ-FCP-NEXT: vpermd %zmm7, %zmm5, %zmm5
+; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm4
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm4, %ymm9, %ymm0
+; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm6, %zmm4
+; AVX512DQ-FCP-NEXT: vpmovqw %zmm4, %xmm4
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm5, %ymm4
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm6, %zmm4
+; AVX512DQ-FCP-NEXT: vpmovqw %zmm4, %xmm4
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, (%rsi)
; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, (%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rcx)
; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, (%r8)
@@ -2719,8 +2723,8 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[0,2,2,3]
; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[0,1,0,2,4,5,6,7]
; AVX512-NEXT: vmovdqa 224(%rdi), %xmm7
-; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm7[0,2,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[0,1,0,2,4,5,6,7]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[0,2,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm8[0,1,0,2,4,5,6,7]
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
@@ -2728,68 +2732,68 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
; AVX512-NEXT: vmovdqa 64(%rdi), %ymm3
; AVX512-NEXT: vpmovqw %ymm3, %xmm3
-; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm10
+; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm9
; AVX512-NEXT: vmovdqa 112(%rdi), %xmm3
-; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[0,2,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm11 = xmm8[0,1,0,2,4,5,6,7]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm10 = xmm3[0,2,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm11 = xmm10[0,1,0,2,4,5,6,7]
; AVX512-NEXT: vmovdqa 96(%rdi), %xmm4
; AVX512-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[0,2,2,3]
; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm12[0,1,0,2,4,5,6,7]
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1]
; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm11[6,7]
; AVX512-NEXT: vpmovqw %zmm0, %xmm11
-; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
-; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm10[0,1,2,3],zmm2[0,1,2,3]
-; AVX512-NEXT: vmovdqa 64(%rdi), %xmm10
+; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7]
+; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm9[0,1,2,3],zmm2[0,1,2,3]
; AVX512-NEXT: vmovdqa 80(%rdi), %xmm11
-; AVX512-NEXT: vmovdqa 192(%rdi), %xmm13
-; AVX512-NEXT: vmovdqa 208(%rdi), %xmm14
+; AVX512-NEXT: vmovdqa 208(%rdi), %xmm9
; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7]
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1]
-; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm14[0,2,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm15 = xmm13[0,2,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,3,2,3,4,5,6,7]
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1]
-; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5],ymm5[6,7]
-; AVX512-NEXT: vpsrlq $16, %zmm1, %zmm9
-; AVX512-NEXT: vpmovqw %zmm9, %xmm9
-; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7]
; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm12[0,1,1,3,4,5,6,7]
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1]
+; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[0,2,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7]
+; AVX512-NEXT: vmovdqa 192(%rdi), %xmm13
+; AVX512-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[0,2,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[1,3,2,3,4,5,6,7]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm14[0],xmm8[0],xmm14[1],xmm8[1]
; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[0,2,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm12 = xmm10[0,2,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7]
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1]
-; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7]
-; AVX512-NEXT: vpsrlq $16, %zmm0, %zmm9
-; AVX512-NEXT: vpmovqw %zmm9, %xmm9
-; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7]
+; AVX512-NEXT: vpsrlq $16, %zmm1, %zmm8
+; AVX512-NEXT: vpmovqw %zmm8, %xmm8
+; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm10[0,1,1,3,4,5,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm12[0,1,1,3,4,5,6,7]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm10[0],xmm8[0],xmm10[1],xmm8[1]
+; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX512-NEXT: vpshufd {{.*#+}} xmm10 = xmm11[0,2,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7]
+; AVX512-NEXT: vmovdqa 64(%rdi), %xmm12
+; AVX512-NEXT: vpshufd {{.*#+}} xmm14 = xmm12[0,2,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[1,3,2,3,4,5,6,7]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1]
+; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3,4,5],ymm8[6,7]
+; AVX512-NEXT: vpsrlq $16, %zmm0, %zmm10
+; AVX512-NEXT: vpmovqw %zmm10, %xmm10
+; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7]
; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm5[0,1,2,3]
; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3]
; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm6[0,1,2,0,4,5,6,7]
; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm7[0,1,2,0,4,5,6,7]
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
-; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm12
-; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm14[3,1,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm7[0,1,2,0,4,5,6,7]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm10[0],xmm8[0],xmm10[1],xmm8[1]
+; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm10
+; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[3,1,2,3]
; AVX512-NEXT: vpshuflw {{.*#+}} xmm14 = xmm8[2,0,2,3,4,5,6,7]
; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm13[3,1,2,3]
; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm9[2,0,2,3,4,5,6,7]
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1]
; AVX512-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3,4,5],ymm10[6,7]
; AVX512-NEXT: vpsrlq $32, %zmm1, %zmm13
; AVX512-NEXT: vpmovqw %zmm13, %xmm13
-; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3],ymm10[4,5,6,7]
; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3]
; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm3[0,1,2,0,4,5,6,7]
; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3]
@@ -2798,15 +2802,15 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
; AVX512-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[3,1,2,3]
; AVX512-NEXT: vpshuflw {{.*#+}} xmm14 = xmm11[2,0,2,3,4,5,6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,1,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm15 = xmm10[2,0,2,3,4,5,6,7]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[3,1,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm15 = xmm12[2,0,2,3,4,5,6,7]
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
; AVX512-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
; AVX512-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7]
; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm14
; AVX512-NEXT: vpmovqw %zmm14, %xmm14
; AVX512-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7]
-; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm13[0,1,2,3],zmm12[0,1,2,3]
+; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm13[0,1,2,3],zmm10[0,1,2,3]
; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,1,4,5,6,7]
; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7]
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
@@ -2824,7 +2828,7 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[3,1,2,3,4,5,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm10[3,1,2,3,4,5,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm12[3,1,2,3,4,5,6,7]
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
@@ -2834,7 +2838,7 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,2,3]
; AVX512-NEXT: vmovdqa64 %zmm2, (%rsi)
; AVX512-NEXT: vmovdqa64 %zmm5, (%rdx)
-; AVX512-NEXT: vmovdqa64 %zmm12, (%rcx)
+; AVX512-NEXT: vmovdqa64 %zmm10, (%rcx)
; AVX512-NEXT: vmovdqa64 %zmm0, (%r8)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -2842,79 +2846,81 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-FCP-LABEL: load_i16_stride4_vf32:
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2
; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1
-; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0]
-; AVX512-FCP-NEXT: # ymm2 = mem[0,1,0,1]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,2,2,3,0,2,4,6]
-; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %ymm5
-; AVX512-FCP-NEXT: vpermd %ymm5, %ymm4, %ymm6
-; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm6, %ymm7
+; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm6
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,u,u,u,u,8,9,12,13,12,13,14,15,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u]
-; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm8
-; AVX512-FCP-NEXT: vpermd %ymm8, %ymm4, %ymm9
-; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm9, %ymm10
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,2,2,3,4,6,12,14]
-; AVX512-FCP-NEXT: vpermt2d %ymm7, %ymm11, %ymm10
-; AVX512-FCP-NEXT: vpmovqw %zmm1, %xmm7
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm10
-; AVX512-FCP-NEXT: vpermd %ymm10, %ymm4, %ymm12
-; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm12, %ymm13
-; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm14
-; AVX512-FCP-NEXT: vpermd %ymm14, %ymm4, %ymm15
-; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm15, %ymm4
-; AVX512-FCP-NEXT: vpermt2d %ymm13, %ymm11, %ymm4
-; AVX512-FCP-NEXT: vpmovqw %zmm0, %xmm13
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7]
-; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm4[0,1,2,3],zmm7[0,1,2,3]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
-; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm6
-; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm9
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7]
-; AVX512-FCP-NEXT: vpsrlq $16, %zmm1, %zmm9
-; AVX512-FCP-NEXT: vpmovqw %zmm9, %xmm9
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
-; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm12, %ymm9
-; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm15, %ymm12
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5],ymm9[6,7]
-; AVX512-FCP-NEXT: vpsrlq $16, %zmm0, %zmm12
-; AVX512-FCP-NEXT: vpmovqw %zmm12, %xmm12
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,2,2,3,0,2,4,6]
+; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm7
+; AVX512-FCP-NEXT: vpermd %ymm7, %ymm5, %ymm8
+; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm8, %ymm9
+; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0]
+; AVX512-FCP-NEXT: # ymm4 = mem[0,1,0,1]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [8,10,10,11,8,10,12,14]
+; AVX512-FCP-NEXT: vpermd %zmm6, %zmm10, %zmm11
+; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm11, %ymm12
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,2,2,3,4,6,12,14]
+; AVX512-FCP-NEXT: vpermt2d %ymm12, %ymm13, %ymm9
+; AVX512-FCP-NEXT: vpmovqw %zmm1, %xmm12
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7]
-; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm9[0,1,2,3],zmm6[0,1,2,3]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [1,3,2,3,1,3,5,7]
-; AVX512-FCP-NEXT: vpermd %ymm5, %ymm9, %ymm5
-; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm5, %ymm12
-; AVX512-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm8
-; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm8, %ymm13
-; AVX512-FCP-NEXT: vpermt2d %ymm12, %ymm11, %ymm13
-; AVX512-FCP-NEXT: vpsrlq $32, %zmm1, %zmm12
-; AVX512-FCP-NEXT: vpmovqw %zmm12, %xmm12
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
-; AVX512-FCP-NEXT: vpermd %ymm10, %ymm9, %ymm10
-; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm10, %ymm2
-; AVX512-FCP-NEXT: vpermd %ymm14, %ymm9, %ymm9
-; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm9, %ymm3
-; AVX512-FCP-NEXT: vpermt2d %ymm2, %ymm11, %ymm3
-; AVX512-FCP-NEXT: vpsrlq $32, %zmm0, %zmm2
-; AVX512-FCP-NEXT: vpmovqw %zmm2, %xmm2
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm12[0,1,2,3]
-; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm3
-; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm8, %ymm5
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7]
+; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm12
+; AVX512-FCP-NEXT: vpermd %ymm12, %ymm5, %ymm14
+; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm14, %ymm5
+; AVX512-FCP-NEXT: vpermd %zmm2, %zmm10, %zmm10
+; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm10, %ymm15
+; AVX512-FCP-NEXT: vpermt2d %ymm15, %ymm13, %ymm5
+; AVX512-FCP-NEXT: vpmovqw %zmm0, %xmm15
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3],ymm5[4,5,6,7]
+; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm5[0,1,2,3],zmm9[0,1,2,3]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
+; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm8, %ymm8
+; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm11, %ymm11
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm11[6,7]
+; AVX512-FCP-NEXT: vpsrlq $16, %zmm1, %zmm11
+; AVX512-FCP-NEXT: vpmovqw %zmm11, %xmm11
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7]
+; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm14, %ymm11
+; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm10, %ymm10
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7]
+; AVX512-FCP-NEXT: vpsrlq $16, %zmm0, %zmm11
+; AVX512-FCP-NEXT: vpmovqw %zmm11, %xmm11
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
+; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm10[0,1,2,3],zmm8[0,1,2,3]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [1,3,2,3,1,3,5,7]
+; AVX512-FCP-NEXT: vpermd %ymm7, %ymm10, %ymm7
+; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm7, %ymm11
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [9,11,10,11,9,11,13,15]
+; AVX512-FCP-NEXT: vpermd %zmm6, %zmm14, %zmm6
+; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm15
+; AVX512-FCP-NEXT: vpermt2d %ymm15, %ymm13, %ymm11
+; AVX512-FCP-NEXT: vpsrlq $32, %zmm1, %zmm15
+; AVX512-FCP-NEXT: vpmovqw %zmm15, %xmm15
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm15[0,1,2,3],ymm11[4,5,6,7]
+; AVX512-FCP-NEXT: vpermd %ymm12, %ymm10, %ymm10
+; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm10, %ymm3
+; AVX512-FCP-NEXT: vpermd %zmm2, %zmm14, %zmm2
+; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm4
+; AVX512-FCP-NEXT: vpermt2d %ymm4, %ymm13, %ymm3
+; AVX512-FCP-NEXT: vpsrlq $32, %zmm0, %zmm4
+; AVX512-FCP-NEXT: vpmovqw %zmm4, %xmm4
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm11[0,1,2,3]
+; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm7, %ymm4
+; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm6, %ymm6
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7]
; AVX512-FCP-NEXT: vpsrlq $48, %zmm1, %zmm1
; AVX512-FCP-NEXT: vpmovqw %zmm1, %xmm1
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
-; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm10, %ymm3
-; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm4
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
+; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm10, %ymm4
+; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm2
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
; AVX512-FCP-NEXT: vpsrlq $48, %zmm0, %zmm0
; AVX512-FCP-NEXT: vpmovqw %zmm0, %xmm0
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,2,3]
-; AVX512-FCP-NEXT: vmovdqa64 %zmm7, (%rsi)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm6, (%rdx)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rcx)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm9, (%rsi)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm8, (%rdx)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm3, (%rcx)
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%r8)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
@@ -2930,8 +2936,8 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[0,2,2,3]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[0,1,0,2,4,5,6,7]
; AVX512DQ-NEXT: vmovdqa 224(%rdi), %xmm7
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm7[0,2,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[0,1,0,2,4,5,6,7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[0,2,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm8[0,1,0,2,4,5,6,7]
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
@@ -2939,68 +2945,68 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm3
; AVX512DQ-NEXT: vpmovqw %ymm3, %xmm3
-; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm10
+; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm9
; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm3
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[0,2,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm11 = xmm8[0,1,0,2,4,5,6,7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm10 = xmm3[0,2,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm11 = xmm10[0,1,0,2,4,5,6,7]
; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm4
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[0,2,2,3]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm13 = xmm12[0,1,0,2,4,5,6,7]
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1]
; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm11[6,7]
; AVX512DQ-NEXT: vpmovqw %zmm0, %xmm11
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
-; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm10[0,1,2,3],zmm2[0,1,2,3]
-; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm10
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm9[0,1,2,3],zmm2[0,1,2,3]
; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm11
-; AVX512DQ-NEXT: vmovdqa 192(%rdi), %xmm13
-; AVX512DQ-NEXT: vmovdqa 208(%rdi), %xmm14
+; AVX512DQ-NEXT: vmovdqa 208(%rdi), %xmm9
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7]
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm14[0,2,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm15 = xmm13[0,2,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,3,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5],ymm5[6,7]
-; AVX512DQ-NEXT: vpsrlq $16, %zmm1, %zmm9
-; AVX512DQ-NEXT: vpmovqw %zmm9, %xmm9
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm12[0,1,1,3,4,5,6,7]
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[0,2,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vmovdqa 192(%rdi), %xmm13
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[0,2,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[1,3,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm14[0],xmm8[0],xmm14[1],xmm8[1]
; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[0,2,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm10[0,2,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7]
-; AVX512DQ-NEXT: vpsrlq $16, %zmm0, %zmm9
-; AVX512DQ-NEXT: vpmovqw %zmm9, %xmm9
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7]
+; AVX512DQ-NEXT: vpsrlq $16, %zmm1, %zmm8
+; AVX512DQ-NEXT: vpmovqw %zmm8, %xmm8
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm10[0,1,1,3,4,5,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm10 = xmm12[0,1,1,3,4,5,6,7]
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm10[0],xmm8[0],xmm10[1],xmm8[1]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm10 = xmm11[0,2,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm12
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm14 = xmm12[0,2,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[1,3,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3,4,5],ymm8[6,7]
+; AVX512DQ-NEXT: vpsrlq $16, %zmm0, %zmm10
+; AVX512DQ-NEXT: vpmovqw %zmm10, %xmm10
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7]
; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm5[0,1,2,3]
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm6[0,1,2,0,4,5,6,7]
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm7[0,1,2,0,4,5,6,7]
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm12
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm14[3,1,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm10 = xmm7[0,1,2,0,4,5,6,7]
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm10[0],xmm8[0],xmm10[1],xmm8[1]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm10
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[3,1,2,3]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm14 = xmm8[2,0,2,3,4,5,6,7]
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm13[3,1,2,3]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm13 = xmm9[2,0,2,3,4,5,6,7]
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1]
; AVX512DQ-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3,4,5],ymm10[6,7]
; AVX512DQ-NEXT: vpsrlq $32, %zmm1, %zmm13
; AVX512DQ-NEXT: vpmovqw %zmm13, %xmm13
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3],ymm10[4,5,6,7]
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm13 = xmm3[0,1,2,0,4,5,6,7]
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3]
@@ -3009,15 +3015,15 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[3,1,2,3]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm14 = xmm11[2,0,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,1,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm15 = xmm10[2,0,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[3,1,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm15 = xmm12[2,0,2,3,4,5,6,7]
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
; AVX512DQ-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7]
; AVX512DQ-NEXT: vpsrlq $32, %zmm0, %zmm14
; AVX512DQ-NEXT: vpmovqw %zmm14, %xmm14
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7]
-; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm13[0,1,2,3],zmm12[0,1,2,3]
+; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm13[0,1,2,3],zmm10[0,1,2,3]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,1,4,5,6,7]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7]
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
@@ -3035,7 +3041,7 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[3,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm10[3,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm12[3,1,2,3,4,5,6,7]
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
@@ -3045,7 +3051,7 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,2,3]
; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rsi)
; AVX512DQ-NEXT: vmovdqa64 %zmm5, (%rdx)
-; AVX512DQ-NEXT: vmovdqa64 %zmm12, (%rcx)
+; AVX512DQ-NEXT: vmovdqa64 %zmm10, (%rcx)
; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%r8)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
@@ -3053,79 +3059,81 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FCP-LABEL: load_i16_stride4_vf32:
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2
; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1
-; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0]
-; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,2,2,3,0,2,4,6]
-; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %ymm5
-; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm4, %ymm6
-; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm6, %ymm7
+; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm6
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,u,u,u,u,8,9,12,13,12,13,14,15,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm8
-; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm4, %ymm9
-; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm9, %ymm10
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,2,2,3,4,6,12,14]
-; AVX512DQ-FCP-NEXT: vpermt2d %ymm7, %ymm11, %ymm10
-; AVX512DQ-FCP-NEXT: vpmovqw %zmm1, %xmm7
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm10
-; AVX512DQ-FCP-NEXT: vpermd %ymm10, %ymm4, %ymm12
-; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm12, %ymm13
-; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm14
-; AVX512DQ-FCP-NEXT: vpermd %ymm14, %ymm4, %ymm15
-; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm15, %ymm4
-; AVX512DQ-FCP-NEXT: vpermt2d %ymm13, %ymm11, %ymm4
-; AVX512DQ-FCP-NEXT: vpmovqw %zmm0, %xmm13
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm4[0,1,2,3],zmm7[0,1,2,3]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm6
-; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm9
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7]
-; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm1, %zmm9
-; AVX512DQ-FCP-NEXT: vpmovqw %zmm9, %xmm9
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm12, %ymm9
-; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm15, %ymm12
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5],ymm9[6,7]
-; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm0, %zmm12
-; AVX512DQ-FCP-NEXT: vpmovqw %zmm12, %xmm12
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,2,2,3,0,2,4,6]
+; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm7
+; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm5, %ymm8
+; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm8, %ymm9
+; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0]
+; AVX512DQ-FCP-NEXT: # ymm4 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [8,10,10,11,8,10,12,14]
+; AVX512DQ-FCP-NEXT: vpermd %zmm6, %zmm10, %zmm11
+; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm11, %ymm12
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,2,2,3,4,6,12,14]
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm12, %ymm13, %ymm9
+; AVX512DQ-FCP-NEXT: vpmovqw %zmm1, %xmm12
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm9[0,1,2,3],zmm6[0,1,2,3]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [1,3,2,3,1,3,5,7]
-; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm9, %ymm5
-; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm5, %ymm12
-; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm8
-; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm8, %ymm13
-; AVX512DQ-FCP-NEXT: vpermt2d %ymm12, %ymm11, %ymm13
-; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm1, %zmm12
-; AVX512DQ-FCP-NEXT: vpmovqw %zmm12, %xmm12
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpermd %ymm10, %ymm9, %ymm10
-; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm10, %ymm2
-; AVX512DQ-FCP-NEXT: vpermd %ymm14, %ymm9, %ymm9
-; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm9, %ymm3
-; AVX512DQ-FCP-NEXT: vpermt2d %ymm2, %ymm11, %ymm3
-; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm0, %zmm2
-; AVX512DQ-FCP-NEXT: vpmovqw %zmm2, %xmm2
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm12[0,1,2,3]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm3
-; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm8, %ymm5
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm12
+; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm5, %ymm14
+; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm14, %ymm5
+; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm10, %zmm10
+; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm10, %ymm15
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm15, %ymm13, %ymm5
+; AVX512DQ-FCP-NEXT: vpmovqw %zmm0, %xmm15
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm5[0,1,2,3],zmm9[0,1,2,3]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm8, %ymm8
+; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm11, %ymm11
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm11[6,7]
+; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm1, %zmm11
+; AVX512DQ-FCP-NEXT: vpmovqw %zmm11, %xmm11
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm14, %ymm11
+; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm10, %ymm10
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7]
+; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm0, %zmm11
+; AVX512DQ-FCP-NEXT: vpmovqw %zmm11, %xmm11
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm10[0,1,2,3],zmm8[0,1,2,3]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [1,3,2,3,1,3,5,7]
+; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm10, %ymm7
+; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm7, %ymm11
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [9,11,10,11,9,11,13,15]
+; AVX512DQ-FCP-NEXT: vpermd %zmm6, %zmm14, %zmm6
+; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm15
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm15, %ymm13, %ymm11
+; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm1, %zmm15
+; AVX512DQ-FCP-NEXT: vpmovqw %zmm15, %xmm15
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm15[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm10, %ymm10
+; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm10, %ymm3
+; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm14, %zmm2
+; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm4
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm4, %ymm13, %ymm3
+; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm0, %zmm4
+; AVX512DQ-FCP-NEXT: vpmovqw %zmm4, %xmm4
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm11[0,1,2,3]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm7, %ymm4
+; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm6, %ymm6
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7]
; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm1, %zmm1
; AVX512DQ-FCP-NEXT: vpmovqw %zmm1, %xmm1
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm10, %ymm3
-; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm4
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm10, %ymm4
+; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm0, %zmm0
; AVX512DQ-FCP-NEXT: vpmovqw %zmm0, %xmm0
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,2,3]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rcx)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, (%rsi)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, (%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%rcx)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%r8)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
@@ -5673,259 +5681,256 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
;
; AVX512-LABEL: load_i16_stride4_vf64:
; AVX512: # %bb.0:
-; AVX512-NEXT: subq $200, %rsp
-; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm26
-; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm27
-; AVX512-NEXT: vmovdqa64 (%rdi), %zmm28
-; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm29
+; AVX512-NEXT: pushq %rax
+; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm23
+; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm24
+; AVX512-NEXT: vmovdqa64 (%rdi), %zmm25
+; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm27
; AVX512-NEXT: vmovdqa 192(%rdi), %ymm0
; AVX512-NEXT: vpmovqw %ymm0, %xmm0
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512-NEXT: vmovdqa 240(%rdi), %xmm14
-; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm14[0,2,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,1,0,2,4,5,6,7]
-; AVX512-NEXT: vmovdqa 224(%rdi), %xmm13
-; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[0,2,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[0,1,0,2,4,5,6,7]
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-NEXT: vpmovqw %zmm29, %xmm1
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT: vmovdqa 64(%rdi), %ymm1
-; AVX512-NEXT: vpmovqw %ymm1, %xmm1
-; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[0,2,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,1,0,2,4,5,6,7]
+; AVX512-NEXT: vmovdqa 224(%rdi), %xmm15
+; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm15[0,2,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm8[0,1,0,2,4,5,6,7]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
+; AVX512-NEXT: vpmovqw %zmm27, %xmm2
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-NEXT: vmovdqa 64(%rdi), %ymm2
+; AVX512-NEXT: vpmovqw %ymm2, %xmm2
+; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512-NEXT: vmovdqa 112(%rdi), %xmm12
; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[0,2,2,3]
; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,1,0,2,4,5,6,7]
-; AVX512-NEXT: vmovdqa 96(%rdi), %xmm11
-; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[0,2,2,3]
+; AVX512-NEXT: vmovdqa 96(%rdi), %xmm13
+; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm13[0,2,2,3]
; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm7[0,1,0,2,4,5,6,7]
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
-; AVX512-NEXT: vpmovqw %zmm28, %xmm4
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[0,1,2,3]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
+; AVX512-NEXT: vpmovqw %zmm25, %xmm4
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[0,1,2,3]
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa 448(%rdi), %ymm0
; AVX512-NEXT: vpmovqw %ymm0, %xmm0
-; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT: vmovdqa64 496(%rdi), %xmm24
-; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm24[0,2,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[0,1,0,2,4,5,6,7]
-; AVX512-NEXT: vmovdqa64 480(%rdi), %xmm23
-; AVX512-NEXT: vpshufd {{.*#+}} xmm15 = xmm23[0,2,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm15[0,1,0,2,4,5,6,7]
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
-; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-NEXT: vpmovqw %zmm27, %xmm1
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT: vmovdqa 320(%rdi), %ymm1
-; AVX512-NEXT: vpmovqw %ymm1, %xmm1
-; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-NEXT: vmovdqa64 368(%rdi), %xmm31
-; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm31[0,2,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[0,1,0,2,4,5,6,7]
-; AVX512-NEXT: vmovdqa64 352(%rdi), %xmm25
-; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm25[0,2,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm8[0,1,0,2,4,5,6,7]
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1]
-; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-NEXT: vpmovqw %zmm26, %xmm1
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[0,1,2,3]
-; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa64 320(%rdi), %xmm30
-; AVX512-NEXT: vmovdqa64 336(%rdi), %xmm17
-; AVX512-NEXT: vmovdqa64 448(%rdi), %xmm18
-; AVX512-NEXT: vmovdqa64 464(%rdi), %xmm19
-; AVX512-NEXT: vmovdqa64 64(%rdi), %xmm20
-; AVX512-NEXT: vmovdqa64 80(%rdi), %xmm21
-; AVX512-NEXT: vmovdqa 192(%rdi), %xmm0
-; AVX512-NEXT: vmovdqa 208(%rdi), %xmm1
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[0,1,1,3,4,5,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7]
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1]
-; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[0,2,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[0,2,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7]
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1]
-; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7]
-; AVX512-NEXT: vpsrlq $16, %zmm29, %zmm6
-; AVX512-NEXT: vpmovqw %zmm6, %xmm6
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2
+; AVX512-NEXT: vmovdqa64 496(%rdi), %xmm30
+; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm30[0,2,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[0,1,0,2,4,5,6,7]
+; AVX512-NEXT: vmovdqa64 480(%rdi), %xmm31
+; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm31[0,2,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[0,1,0,2,4,5,6,7]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
+; AVX512-NEXT: vpmovqw %zmm24, %xmm4
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-NEXT: vmovdqa 320(%rdi), %ymm4
+; AVX512-NEXT: vpmovqw %ymm4, %xmm4
+; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm9
+; AVX512-NEXT: vmovdqa64 368(%rdi), %xmm28
+; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm28[0,2,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm4[0,1,0,2,4,5,6,7]
+; AVX512-NEXT: vmovdqa64 352(%rdi), %xmm29
+; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm29[0,2,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm11 = xmm5[0,1,0,2,4,5,6,7]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
+; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7]
+; AVX512-NEXT: vpmovqw %zmm23, %xmm10
+; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
+; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm9[0,1,2,3],zmm2[0,1,2,3]
+; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 336(%rdi), %xmm16
+; AVX512-NEXT: vmovdqa64 464(%rdi), %xmm17
+; AVX512-NEXT: vmovdqa64 80(%rdi), %xmm18
+; AVX512-NEXT: vmovdqa 208(%rdi), %xmm2
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1]
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm8
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm1[1,3,2,3,4,5,6,7]
+; AVX512-NEXT: vmovdqa 192(%rdi), %xmm1
+; AVX512-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[0,2,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
+; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7]
+; AVX512-NEXT: vpsrlq $16, %zmm27, %zmm9
+; AVX512-NEXT: vpmovqw %zmm9, %xmm9
+; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[0,1,1,3,4,5,6,7]
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
-; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm21[0,2,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm20[0,2,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7]
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
-; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7]
-; AVX512-NEXT: vpsrlq $16, %zmm28, %zmm6
-; AVX512-NEXT: vpmovqw %zmm6, %xmm6
-; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7]
-; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[0,1,2,3]
-; AVX512-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,1,1,3,4,5,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,1,1,3,4,5,6,7]
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm19[0,2,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm18[0,2,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7]
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
-; AVX512-NEXT: vpsrlq $16, %zmm27, %zmm3
-; AVX512-NEXT: vpmovqw %zmm3, %xmm3
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[0,1,1,3,4,5,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm8[0,1,1,3,4,5,6,7]
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm17[0,2,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm30[0,2,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1]
+; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm7
+; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm18[0,2,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm3[1,3,2,3,4,5,6,7]
+; AVX512-NEXT: vmovdqa 64(%rdi), %xmm3
+; AVX512-NEXT: vpshufd {{.*#+}} xmm10 = xmm3[0,2,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
+; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5],ymm7[6,7]
+; AVX512-NEXT: vpsrlq $16, %zmm25, %zmm9
+; AVX512-NEXT: vpmovqw %zmm9, %xmm9
+; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
+; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm7[0,1,2,3],zmm8[0,1,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
+; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6
+; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm17[0,2,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm0[1,3,2,3,4,5,6,7]
+; AVX512-NEXT: vmovdqa 448(%rdi), %xmm0
+; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,2,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
+; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7]
+; AVX512-NEXT: vpsrlq $16, %zmm24, %zmm7
+; AVX512-NEXT: vpmovqw %zmm7, %xmm7
+; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm16[0,2,2,3]
; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7]
+; AVX512-NEXT: vmovdqa64 320(%rdi), %xmm26
+; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm26[0,2,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
+; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
+; AVX512-NEXT: vpsrlq $16, %zmm23, %zmm5
+; AVX512-NEXT: vpmovqw %zmm5, %xmm5
+; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm4[0,1,2,3],zmm6[0,1,2,3]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm14[3,1,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[0,1,2,0,4,5,6,7]
+; AVX512-NEXT: vmovdqa64 %xmm5, %xmm19
+; AVX512-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[3,1,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm15[0,1,2,0,4,5,6,7]
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
-; AVX512-NEXT: vpsrlq $16, %zmm26, %zmm4
-; AVX512-NEXT: vpmovqw %zmm4, %xmm4
-; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
-; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[0,1,2,3]
-; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[3,1,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,1,2,0,4,5,6,7]
-; AVX512-NEXT: vmovdqa64 %xmm2, %xmm22
-; AVX512-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[3,1,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[0,1,2,0,4,5,6,7]
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[3,1,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[2,0,2,3,4,5,6,7]
; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[3,1,2,3]
; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[2,0,2,3,4,5,6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,1,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[2,0,2,3,4,5,6,7]
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
-; AVX512-NEXT: vpsrlq $32, %zmm29, %zmm1
-; AVX512-NEXT: vpmovqw %zmm1, %xmm1
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[3,1,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,1,2,0,4,5,6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[3,1,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,1,2,0,4,5,6,7]
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm21[3,1,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[2,0,2,3,4,5,6,7]
-; AVX512-NEXT: vmovdqa64 %xmm0, %xmm21
-; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm20[3,1,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[2,0,2,3,4,5,6,7]
-; AVX512-NEXT: vmovdqa64 %xmm0, %xmm16
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
-; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7]
-; AVX512-NEXT: vpsrlq $32, %zmm28, %zmm8
-; AVX512-NEXT: vpmovqw %zmm8, %xmm8
-; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
-; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm4[0,1,2,3],zmm1[0,1,2,3]
-; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vpshufd {{.*#+}} xmm10 = xmm24[3,1,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[0,1,2,0,4,5,6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm11 = xmm23[3,1,2,3]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
+; AVX512-NEXT: vpsrlq $32, %zmm27, %zmm2
+; AVX512-NEXT: vpmovqw %zmm2, %xmm2
+; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[3,1,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[0,1,2,0,4,5,6,7]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[3,1,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm1[0,1,2,0,4,5,6,7]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1]
+; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm7
+; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm18[3,1,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm2[2,0,2,3,4,5,6,7]
+; AVX512-NEXT: vmovdqa64 %xmm2, %xmm18
+; AVX512-NEXT: vpshufd {{.*#+}} xmm13 = xmm3[3,1,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[2,0,2,3,4,5,6,7]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1]
+; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm7[6,7]
+; AVX512-NEXT: vpsrlq $32, %zmm25, %zmm7
+; AVX512-NEXT: vpmovqw %zmm7, %xmm7
+; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7]
+; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm3[0,1,2,3],zmm4[0,1,2,3]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm10 = xmm30[3,1,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[0,1,2,0,4,5,6,7]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm11 = xmm31[3,1,2,3]
; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[0,1,2,0,4,5,6,7]
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm20 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm12 = xmm19[3,1,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[2,0,2,3,4,5,6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm18[3,1,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm1[2,0,2,3,4,5,6,7]
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm18 = xmm8[0],xmm4[0],xmm8[1],xmm4[1]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm31[3,1,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm4[0,1,2,0,4,5,6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm15 = xmm25[3,1,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm15[0,1,2,0,4,5,6,7]
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm19 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm17[3,1,2,3]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm30 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm17[3,1,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm7[2,0,2,3,4,5,6,7]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[2,0,2,3,4,5,6,7]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm17 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm28[3,1,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm3[0,1,2,0,4,5,6,7]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm29[3,1,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm4[0,1,2,0,4,5,6,7]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm28 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm16[3,1,2,3]
; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[2,0,2,3,4,5,6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm30[3,1,2,3]
+; AVX512-NEXT: vmovdqa64 %xmm8, %xmm29
+; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm26[3,1,2,3]
; AVX512-NEXT: vpshuflw {{.*#+}} xmm14 = xmm9[2,0,2,3,4,5,6,7]
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1]
-; AVX512-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm14
-; AVX512-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm5
-; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm14[6,7]
-; AVX512-NEXT: vpsrlq $32, %zmm27, %zmm14
+; AVX512-NEXT: vinserti32x4 $1, %xmm30, %ymm0, %ymm14
+; AVX512-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm8
+; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm14[6,7]
+; AVX512-NEXT: vpsrlq $32, %zmm24, %zmm14
; AVX512-NEXT: vpmovqw %zmm14, %xmm14
-; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm5[4,5,6,7]
-; AVX512-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm14
+; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7]
+; AVX512-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm14
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7]
-; AVX512-NEXT: vpsrlq $32, %zmm26, %zmm14
+; AVX512-NEXT: vpsrlq $32, %zmm23, %zmm14
; AVX512-NEXT: vpmovqw %zmm14, %xmm14
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm0[0,1,2,3],zmm5[0,1,2,3]
-; AVX512-NEXT: vmovdqa64 %xmm22, %xmm0
+; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm0[0,1,2,3],zmm8[0,1,2,3]
+; AVX512-NEXT: vmovdqa64 %xmm19, %xmm0
; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm13[0,1,3,1,4,5,6,7]
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[3,1,2,3,4,5,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[3,1,2,3,4,5,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm15[0,1,3,1,4,5,6,7]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7]
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7]
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm12[0,1,3,1,4,5,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1]
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm3
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-NEXT: vpsrlq $48, %zmm29, %zmm3
-; AVX512-NEXT: vpmovqw %zmm3, %xmm3
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT: vmovdqa64 %xmm21, %xmm3
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
-; AVX512-NEXT: vmovdqa64 %xmm16, %xmm5
+; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7]
+; AVX512-NEXT: vpsrlq $48, %zmm27, %zmm5
+; AVX512-NEXT: vpmovqw %zmm5, %xmm5
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-NEXT: vmovdqa64 %xmm18, %xmm5
; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7]
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
-; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
-; AVX512-NEXT: vpsrlq $48, %zmm28, %zmm3
-; AVX512-NEXT: vpmovqw %zmm3, %xmm3
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[0,1,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[0,1,3,1,4,5,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,1,3,1,4,5,6,7]
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[3,1,2,3,4,5,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[0,1,3,1,4,5,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm15[0,1,3,1,4,5,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm13[3,1,2,3,4,5,6,7]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7]
+; AVX512-NEXT: vpsrlq $48, %zmm25, %zmm5
+; AVX512-NEXT: vpmovqw %zmm5, %xmm5
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[0,1,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[0,1,3,1,4,5,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[0,1,3,1,4,5,6,7]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm7[3,1,2,3,4,5,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7]
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; AVX512-NEXT: vpsrlq $48, %zmm27, %zmm2
+; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
+; AVX512-NEXT: vpsrlq $48, %zmm24, %zmm2
; AVX512-NEXT: vpmovqw %zmm2, %xmm2
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm8[3,1,2,3,4,5,6,7]
+; AVX512-NEXT: vmovdqa64 %xmm29, %xmm2
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[3,1,2,3,4,5,6,7]
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
-; AVX512-NEXT: vpsrlq $48, %zmm26, %zmm3
+; AVX512-NEXT: vpsrlq $48, %zmm23, %zmm3
; AVX512-NEXT: vpmovqw %zmm3, %xmm3
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[0,1,2,3]
@@ -5933,422 +5938,418 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vmovaps %zmm2, 64(%rsi)
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512-NEXT: vmovaps %zmm2, (%rsi)
-; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-NEXT: vmovaps %zmm2, 64(%rdx)
-; AVX512-NEXT: vmovups (%rsp), %zmm2 # 64-byte Reload
-; AVX512-NEXT: vmovaps %zmm2, (%rdx)
-; AVX512-NEXT: vmovdqa64 %zmm17, 64(%rcx)
-; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-NEXT: vmovaps %zmm2, (%rcx)
+; AVX512-NEXT: vmovdqa64 %zmm22, 64(%rdx)
+; AVX512-NEXT: vmovdqa64 %zmm21, (%rdx)
+; AVX512-NEXT: vmovdqa64 %zmm16, 64(%rcx)
+; AVX512-NEXT: vmovdqa64 %zmm20, (%rcx)
; AVX512-NEXT: vmovdqa64 %zmm1, 64(%r8)
; AVX512-NEXT: vmovdqa64 %zmm0, (%r8)
-; AVX512-NEXT: addq $200, %rsp
+; AVX512-NEXT: popq %rax
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: load_i16_stride4_vf64:
; AVX512-FCP: # %bb.0:
-; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm22
-; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm25
+; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm25
+; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm22
+; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm27
+; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm23
; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm2
-; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4
-; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0]
-; AVX512-FCP-NEXT: # ymm5 = mem[0,1,0,1]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,2,2,3,0,2,4,6]
-; AVX512-FCP-NEXT: vmovdqa64 224(%rdi), %ymm23
-; AVX512-FCP-NEXT: vpermd %ymm23, %ymm9, %ymm10
-; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm10, %ymm0
+; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm24
+; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm3
+; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm26
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,4,5,u,u,u,u,8,9,12,13,12,13,14,15,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u]
-; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm24
-; AVX512-FCP-NEXT: vpermd %ymm24, %ymm9, %ymm3
-; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm1
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,2,2,3,4,6,12,14]
-; AVX512-FCP-NEXT: vpermt2d %ymm0, %ymm7, %ymm1
-; AVX512-FCP-NEXT: vpmovqw %zmm4, %xmm0
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 96(%rdi), %ymm26
-; AVX512-FCP-NEXT: vpermd %ymm26, %ymm9, %ymm8
-; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm8, %ymm11
-; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm27
-; AVX512-FCP-NEXT: vpermd %ymm27, %ymm9, %ymm0
-; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm12
-; AVX512-FCP-NEXT: vpermt2d %ymm11, %ymm7, %ymm12
-; AVX512-FCP-NEXT: vpmovqw %zmm2, %xmm11
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
-; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm11[0,1,2,3],zmm1[0,1,2,3]
-; AVX512-FCP-NEXT: vmovdqa64 480(%rdi), %ymm28
-; AVX512-FCP-NEXT: vpermd %ymm28, %ymm9, %ymm11
-; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm11, %ymm1
-; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %ymm17
-; AVX512-FCP-NEXT: vpermd %ymm17, %ymm9, %ymm12
-; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm12, %ymm13
-; AVX512-FCP-NEXT: vpermt2d %ymm1, %ymm7, %ymm13
-; AVX512-FCP-NEXT: vpmovqw %zmm25, %xmm1
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm13[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 352(%rdi), %ymm18
-; AVX512-FCP-NEXT: vpermd %ymm18, %ymm9, %ymm1
-; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm15
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6]
+; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm28
+; AVX512-FCP-NEXT: vpermd %ymm28, %ymm0, %ymm13
+; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm13, %ymm5
+; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0]
+; AVX512-FCP-NEXT: # ymm7 = mem[0,1,0,1]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [8,10,10,11,8,10,12,14]
+; AVX512-FCP-NEXT: vpermd %zmm26, %zmm11, %zmm4
+; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm8
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,2,2,3,4,6,12,14]
+; AVX512-FCP-NEXT: vpermt2d %ymm8, %ymm9, %ymm5
+; AVX512-FCP-NEXT: vpmovqw %zmm3, %xmm8
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm5[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm17
+; AVX512-FCP-NEXT: vpermd %ymm17, %ymm0, %ymm10
+; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm10, %ymm12
+; AVX512-FCP-NEXT: vpermd %zmm24, %zmm11, %zmm5
+; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm5, %ymm14
+; AVX512-FCP-NEXT: vpermt2d %ymm14, %ymm9, %ymm12
+; AVX512-FCP-NEXT: vpmovqw %zmm2, %xmm14
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7]
+; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm12[0,1,2,3],zmm8[0,1,2,3]
+; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %ymm18
+; AVX512-FCP-NEXT: vpermd %ymm18, %ymm0, %ymm8
+; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm8, %ymm14
+; AVX512-FCP-NEXT: vpermd %zmm23, %zmm11, %zmm12
+; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm12, %ymm1
+; AVX512-FCP-NEXT: vpermt2d %ymm1, %ymm9, %ymm14
+; AVX512-FCP-NEXT: vpmovqw %zmm27, %xmm1
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7]
; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %ymm19
-; AVX512-FCP-NEXT: vpermd %ymm19, %ymm9, %ymm13
-; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm13, %ymm9
-; AVX512-FCP-NEXT: vpermt2d %ymm15, %ymm7, %ymm9
-; AVX512-FCP-NEXT: vpmovqw %zmm22, %xmm15
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm15[0,1,2,3],ymm9[4,5,6,7]
-; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm9[0,1,2,3],zmm14[0,1,2,3]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
-; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm10, %ymm10
-; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm3
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm10[6,7]
-; AVX512-FCP-NEXT: vpsrlq $16, %zmm4, %zmm10
-; AVX512-FCP-NEXT: vpmovqw %zmm10, %xmm10
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1,2,3],ymm3[4,5,6,7]
-; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm8, %ymm8
-; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm8[6,7]
-; AVX512-FCP-NEXT: vpsrlq $16, %zmm2, %zmm8
-; AVX512-FCP-NEXT: vpmovqw %zmm8, %xmm8
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm29 = zmm0[0,1,2,3],zmm3[0,1,2,3]
-; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm11, %ymm0
-; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm12, %ymm3
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-FCP-NEXT: vpsrlq $16, %zmm25, %zmm3
-; AVX512-FCP-NEXT: vpmovqw %zmm3, %xmm3
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm1
-; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm13, %ymm3
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-FCP-NEXT: vpsrlq $16, %zmm22, %zmm3
-; AVX512-FCP-NEXT: vpmovqw %zmm3, %xmm3
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm1[0,1,2,3],zmm0[0,1,2,3]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,3,2,3,1,3,5,7]
-; AVX512-FCP-NEXT: vpermd %ymm23, %ymm14, %ymm3
-; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm0
-; AVX512-FCP-NEXT: vpermd %ymm24, %ymm14, %ymm8
-; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm8, %ymm1
-; AVX512-FCP-NEXT: vpermt2d %ymm0, %ymm7, %ymm1
-; AVX512-FCP-NEXT: vpsrlq $32, %zmm4, %zmm0
-; AVX512-FCP-NEXT: vpmovqw %zmm0, %xmm0
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT: vpermd %ymm26, %ymm14, %ymm0
-; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm12
-; AVX512-FCP-NEXT: vpermd %ymm27, %ymm14, %ymm11
-; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm11, %ymm13
-; AVX512-FCP-NEXT: vpermt2d %ymm12, %ymm7, %ymm13
+; AVX512-FCP-NEXT: vpermd %ymm19, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm15
+; AVX512-FCP-NEXT: vpermd %zmm22, %zmm11, %zmm14
+; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm14, %ymm11
+; AVX512-FCP-NEXT: vpermt2d %ymm11, %ymm9, %ymm15
+; AVX512-FCP-NEXT: vpmovqw %zmm25, %xmm11
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm15[4,5,6,7]
+; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm11[0,1,2,3],zmm1[0,1,2,3]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
+; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm13, %ymm1
+; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm4
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
+; AVX512-FCP-NEXT: vpsrlq $16, %zmm3, %zmm4
+; AVX512-FCP-NEXT: vpmovqw %zmm4, %xmm4
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm4
+; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm5, %ymm5
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
+; AVX512-FCP-NEXT: vpsrlq $16, %zmm2, %zmm5
+; AVX512-FCP-NEXT: vpmovqw %zmm5, %xmm5
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm29 = zmm4[0,1,2,3],zmm1[0,1,2,3]
+; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm8, %ymm1
+; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm12, %ymm4
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
+; AVX512-FCP-NEXT: vpsrlq $16, %zmm27, %zmm4
+; AVX512-FCP-NEXT: vpmovqw %zmm4, %xmm4
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm14, %ymm4
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
+; AVX512-FCP-NEXT: vpsrlq $16, %zmm25, %zmm4
+; AVX512-FCP-NEXT: vpmovqw %zmm4, %xmm4
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm0[0,1,2,3],zmm1[0,1,2,3]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7]
+; AVX512-FCP-NEXT: vpermd %ymm28, %ymm0, %ymm15
+; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm15, %ymm1
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm28 = [9,11,10,11,9,11,13,15]
+; AVX512-FCP-NEXT: vpermd %zmm26, %zmm28, %zmm4
+; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm5
+; AVX512-FCP-NEXT: vpermt2d %ymm5, %ymm9, %ymm1
+; AVX512-FCP-NEXT: vpsrlq $32, %zmm3, %zmm5
+; AVX512-FCP-NEXT: vpmovqw %zmm5, %xmm5
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT: vpermd %ymm17, %ymm0, %ymm10
+; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm10, %ymm8
+; AVX512-FCP-NEXT: vpermd %zmm24, %zmm28, %zmm5
+; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm5, %ymm12
+; AVX512-FCP-NEXT: vpermt2d %ymm12, %ymm9, %ymm8
; AVX512-FCP-NEXT: vpsrlq $32, %zmm2, %zmm12
; AVX512-FCP-NEXT: vpmovqw %zmm12, %xmm12
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
-; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm12[0,1,2,3],zmm1[0,1,2,3]
-; AVX512-FCP-NEXT: vpermd %ymm28, %ymm14, %ymm12
-; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm12, %ymm1
-; AVX512-FCP-NEXT: vpermd %ymm17, %ymm14, %ymm13
-; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm13, %ymm10
-; AVX512-FCP-NEXT: vpermt2d %ymm1, %ymm7, %ymm10
-; AVX512-FCP-NEXT: vpsrlq $32, %zmm25, %zmm1
-; AVX512-FCP-NEXT: vpmovqw %zmm1, %xmm1
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm10[4,5,6,7]
-; AVX512-FCP-NEXT: vpermd %ymm18, %ymm14, %ymm1
-; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm15
-; AVX512-FCP-NEXT: vpermd %ymm19, %ymm14, %ymm5
-; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm6
-; AVX512-FCP-NEXT: vpermt2d %ymm15, %ymm7, %ymm6
-; AVX512-FCP-NEXT: vpsrlq $32, %zmm22, %zmm7
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7]
+; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm8[0,1,2,3],zmm1[0,1,2,3]
+; AVX512-FCP-NEXT: vpermd %ymm18, %ymm0, %ymm8
+; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm8, %ymm1
+; AVX512-FCP-NEXT: vpermd %zmm23, %zmm28, %zmm14
+; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm14, %ymm13
+; AVX512-FCP-NEXT: vpermt2d %ymm13, %ymm9, %ymm1
+; AVX512-FCP-NEXT: vpsrlq $32, %zmm27, %zmm13
+; AVX512-FCP-NEXT: vpmovqw %zmm13, %xmm13
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT: vpermd %ymm19, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm13
+; AVX512-FCP-NEXT: vpermd %zmm22, %zmm28, %zmm6
+; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm6, %ymm7
+; AVX512-FCP-NEXT: vpermt2d %ymm7, %ymm9, %ymm13
+; AVX512-FCP-NEXT: vpsrlq $32, %zmm25, %zmm7
; AVX512-FCP-NEXT: vpmovqw %zmm7, %xmm7
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
-; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm10[0,1,2,3]
-; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm3
-; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm8, %ymm7
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm3[6,7]
-; AVX512-FCP-NEXT: vpsrlq $48, %zmm4, %zmm4
-; AVX512-FCP-NEXT: vpmovqw %zmm4, %xmm4
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
-; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm11, %ymm4
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm13[4,5,6,7]
+; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm7[0,1,2,3],zmm1[0,1,2,3]
+; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm15, %ymm7
+; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm4
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7]
+; AVX512-FCP-NEXT: vpsrlq $48, %zmm3, %zmm3
+; AVX512-FCP-NEXT: vpmovqw %zmm3, %xmm3
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
+; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm4
+; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm5, %ymm5
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
; AVX512-FCP-NEXT: vpsrlq $48, %zmm2, %zmm2
; AVX512-FCP-NEXT: vpmovqw %zmm2, %xmm2
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[0,1,2,3]
+; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm8, %ymm3
+; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm14, %ymm4
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
+; AVX512-FCP-NEXT: vpsrlq $48, %zmm27, %zmm4
+; AVX512-FCP-NEXT: vpmovqw %zmm4, %xmm4
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm6, %ymm4
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
+; AVX512-FCP-NEXT: vpsrlq $48, %zmm25, %zmm4
+; AVX512-FCP-NEXT: vpmovqw %zmm4, %xmm4
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm3[0,1,2,3]
-; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm12, %ymm2
-; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm13, %ymm3
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
-; AVX512-FCP-NEXT: vpsrlq $48, %zmm25, %zmm3
-; AVX512-FCP-NEXT: vpmovqw %zmm3, %xmm3
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm1
-; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm5, %ymm3
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-FCP-NEXT: vpsrlq $48, %zmm22, %zmm3
-; AVX512-FCP-NEXT: vpmovqw %zmm3, %xmm3
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[0,1,2,3]
; AVX512-FCP-NEXT: vmovdqa64 %zmm21, 64(%rsi)
; AVX512-FCP-NEXT: vmovdqa64 %zmm20, (%rsi)
; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 64(%rdx)
; AVX512-FCP-NEXT: vmovdqa64 %zmm29, (%rdx)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 64(%rcx)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm23, (%rcx)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 64(%r8)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%r8)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 64(%rcx)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm12, (%rcx)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 64(%r8)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%r8)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: load_i16_stride4_vf64:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: subq $200, %rsp
-; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm26
-; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm27
-; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm28
-; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm29
+; AVX512DQ-NEXT: pushq %rax
+; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm23
+; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm24
+; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm25
+; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm27
; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm0
; AVX512DQ-NEXT: vpmovqw %ymm0, %xmm0
; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa 240(%rdi), %xmm14
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm14[0,2,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,1,0,2,4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa 224(%rdi), %xmm13
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[0,2,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[0,1,0,2,4,5,6,7]
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-NEXT: vpmovqw %zmm29, %xmm1
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm1
-; AVX512DQ-NEXT: vpmovqw %ymm1, %xmm1
-; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[0,2,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,1,0,2,4,5,6,7]
+; AVX512DQ-NEXT: vmovdqa 224(%rdi), %xmm15
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm15[0,2,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm8[0,1,0,2,4,5,6,7]
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
+; AVX512DQ-NEXT: vpmovqw %zmm27, %xmm2
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm2
+; AVX512DQ-NEXT: vpmovqw %ymm2, %xmm2
+; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm12
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[0,2,2,3]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,1,0,2,4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm11
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[0,2,2,3]
+; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm13
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm13[0,2,2,3]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm7[0,1,0,2,4,5,6,7]
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
-; AVX512DQ-NEXT: vpmovqw %zmm28, %xmm4
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[0,1,2,3]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-NEXT: vpmovqw %zmm25, %xmm4
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[0,1,2,3]
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqa 448(%rdi), %ymm0
; AVX512DQ-NEXT: vpmovqw %ymm0, %xmm0
-; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT: vmovdqa64 496(%rdi), %xmm24
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm24[0,2,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[0,1,0,2,4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa64 480(%rdi), %xmm23
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm15 = xmm23[0,2,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm15[0,1,0,2,4,5,6,7]
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-NEXT: vpmovqw %zmm27, %xmm1
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa 320(%rdi), %ymm1
-; AVX512DQ-NEXT: vpmovqw %ymm1, %xmm1
-; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-NEXT: vmovdqa64 368(%rdi), %xmm31
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm31[0,2,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[0,1,0,2,4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa64 352(%rdi), %xmm25
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm25[0,2,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm10 = xmm8[0,1,0,2,4,5,6,7]
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-NEXT: vpmovqw %zmm26, %xmm1
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[0,1,2,3]
-; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %xmm30
-; AVX512DQ-NEXT: vmovdqa64 336(%rdi), %xmm17
-; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %xmm18
-; AVX512DQ-NEXT: vmovdqa64 464(%rdi), %xmm19
-; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %xmm20
-; AVX512DQ-NEXT: vmovdqa64 80(%rdi), %xmm21
-; AVX512DQ-NEXT: vmovdqa 192(%rdi), %xmm0
-; AVX512DQ-NEXT: vmovdqa 208(%rdi), %xmm1
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[0,1,1,3,4,5,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7]
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[0,2,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[0,2,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-NEXT: vpsrlq $16, %zmm29, %zmm6
-; AVX512DQ-NEXT: vpmovqw %zmm6, %xmm6
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2
+; AVX512DQ-NEXT: vmovdqa64 496(%rdi), %xmm30
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm30[0,2,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[0,1,0,2,4,5,6,7]
+; AVX512DQ-NEXT: vmovdqa64 480(%rdi), %xmm31
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm31[0,2,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[0,1,0,2,4,5,6,7]
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-NEXT: vpmovqw %zmm24, %xmm4
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-NEXT: vmovdqa 320(%rdi), %ymm4
+; AVX512DQ-NEXT: vpmovqw %ymm4, %xmm4
+; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm9
+; AVX512DQ-NEXT: vmovdqa64 368(%rdi), %xmm28
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm28[0,2,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm10 = xmm4[0,1,0,2,4,5,6,7]
+; AVX512DQ-NEXT: vmovdqa64 352(%rdi), %xmm29
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm29[0,2,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm11 = xmm5[0,1,0,2,4,5,6,7]
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7]
+; AVX512DQ-NEXT: vpmovqw %zmm23, %xmm10
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm9[0,1,2,3],zmm2[0,1,2,3]
+; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vmovdqa64 336(%rdi), %xmm16
+; AVX512DQ-NEXT: vmovdqa64 464(%rdi), %xmm17
+; AVX512DQ-NEXT: vmovdqa64 80(%rdi), %xmm18
+; AVX512DQ-NEXT: vmovdqa 208(%rdi), %xmm2
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7]
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm8
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm1[1,3,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vmovdqa 192(%rdi), %xmm1
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[0,2,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7]
+; AVX512DQ-NEXT: vpsrlq $16, %zmm27, %zmm9
+; AVX512DQ-NEXT: vpmovqw %zmm9, %xmm9
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[0,1,1,3,4,5,6,7]
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm21[0,2,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm20[0,2,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7]
-; AVX512DQ-NEXT: vpsrlq $16, %zmm28, %zmm6
-; AVX512DQ-NEXT: vpmovqw %zmm6, %xmm6
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[0,1,2,3]
-; AVX512DQ-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,1,1,3,4,5,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,1,1,3,4,5,6,7]
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm19[0,2,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm18[0,2,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-NEXT: vpsrlq $16, %zmm27, %zmm3
-; AVX512DQ-NEXT: vpmovqw %zmm3, %xmm3
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[0,1,1,3,4,5,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm8[0,1,1,3,4,5,6,7]
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm17[0,2,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm30[0,2,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7]
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm7
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm18[0,2,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm3[1,3,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm3
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm10 = xmm3[0,2,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5],ymm7[6,7]
+; AVX512DQ-NEXT: vpsrlq $16, %zmm25, %zmm9
+; AVX512DQ-NEXT: vpmovqw %zmm9, %xmm9
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm7[0,1,2,3],zmm8[0,1,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7]
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm17[0,2,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm0[1,3,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vmovdqa 448(%rdi), %xmm0
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,2,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7]
+; AVX512DQ-NEXT: vpsrlq $16, %zmm24, %zmm7
+; AVX512DQ-NEXT: vpmovqw %zmm7, %xmm7
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7]
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm16[0,2,2,3]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %xmm26
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm26[0,2,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-NEXT: vpsrlq $16, %zmm23, %zmm5
+; AVX512DQ-NEXT: vpmovqw %zmm5, %xmm5
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm4[0,1,2,3],zmm6[0,1,2,3]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm14[3,1,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[0,1,2,0,4,5,6,7]
+; AVX512DQ-NEXT: vmovdqa64 %xmm5, %xmm19
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[3,1,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm15[0,1,2,0,4,5,6,7]
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
-; AVX512DQ-NEXT: vpsrlq $16, %zmm26, %zmm4
-; AVX512DQ-NEXT: vpmovqw %zmm4, %xmm4
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[0,1,2,3]
-; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[3,1,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,1,2,0,4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm22
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[3,1,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[0,1,2,0,4,5,6,7]
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[3,1,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[2,0,2,3,4,5,6,7]
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[3,1,2,3]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[2,0,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,1,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[2,0,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
-; AVX512DQ-NEXT: vpsrlq $32, %zmm29, %zmm1
-; AVX512DQ-NEXT: vpmovqw %zmm1, %xmm1
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[3,1,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,1,2,0,4,5,6,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[3,1,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,1,2,0,4,5,6,7]
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm21[3,1,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[2,0,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm21
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm20[3,1,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[2,0,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm16
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7]
-; AVX512DQ-NEXT: vpsrlq $32, %zmm28, %zmm8
-; AVX512DQ-NEXT: vpmovqw %zmm8, %xmm8
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm4[0,1,2,3],zmm1[0,1,2,3]
-; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm10 = xmm24[3,1,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[0,1,2,0,4,5,6,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm11 = xmm23[3,1,2,3]
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-NEXT: vpsrlq $32, %zmm27, %zmm2
+; AVX512DQ-NEXT: vpmovqw %zmm2, %xmm2
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[3,1,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[0,1,2,0,4,5,6,7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[3,1,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm1[0,1,2,0,4,5,6,7]
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm7
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm18[3,1,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm2[2,0,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm18
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm13 = xmm3[3,1,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[2,0,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm7[6,7]
+; AVX512DQ-NEXT: vpsrlq $32, %zmm25, %zmm7
+; AVX512DQ-NEXT: vpmovqw %zmm7, %xmm7
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm3[0,1,2,3],zmm4[0,1,2,3]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm10 = xmm30[3,1,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[0,1,2,0,4,5,6,7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm11 = xmm31[3,1,2,3]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[0,1,2,0,4,5,6,7]
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm20 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm19[3,1,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[2,0,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm18[3,1,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm1[2,0,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm18 = xmm8[0],xmm4[0],xmm8[1],xmm4[1]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm31[3,1,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm4[0,1,2,0,4,5,6,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm15 = xmm25[3,1,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm15[0,1,2,0,4,5,6,7]
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm19 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm17[3,1,2,3]
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm30 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm17[3,1,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm7[2,0,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[2,0,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm17 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm28[3,1,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm3[0,1,2,0,4,5,6,7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm29[3,1,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm4[0,1,2,0,4,5,6,7]
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm28 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm16[3,1,2,3]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[2,0,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm30[3,1,2,3]
+; AVX512DQ-NEXT: vmovdqa64 %xmm8, %xmm29
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm26[3,1,2,3]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm14 = xmm9[2,0,2,3,4,5,6,7]
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1]
-; AVX512DQ-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm14
-; AVX512DQ-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm5
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm14[6,7]
-; AVX512DQ-NEXT: vpsrlq $32, %zmm27, %zmm14
+; AVX512DQ-NEXT: vinserti32x4 $1, %xmm30, %ymm0, %ymm14
+; AVX512DQ-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm8
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm14[6,7]
+; AVX512DQ-NEXT: vpsrlq $32, %zmm24, %zmm14
; AVX512DQ-NEXT: vpmovqw %zmm14, %xmm14
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm5[4,5,6,7]
-; AVX512DQ-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm14
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm14
; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7]
-; AVX512DQ-NEXT: vpsrlq $32, %zmm26, %zmm14
+; AVX512DQ-NEXT: vpsrlq $32, %zmm23, %zmm14
; AVX512DQ-NEXT: vpmovqw %zmm14, %xmm14
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm0[0,1,2,3],zmm5[0,1,2,3]
-; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm0
+; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm0[0,1,2,3],zmm8[0,1,2,3]
+; AVX512DQ-NEXT: vmovdqa64 %xmm19, %xmm0
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm13[0,1,3,1,4,5,6,7]
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[3,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[3,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm15[0,1,3,1,4,5,6,7]
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7]
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7]
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm12[0,1,3,1,4,5,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7]
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1]
; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm3
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-NEXT: vpsrlq $48, %zmm29, %zmm3
-; AVX512DQ-NEXT: vpmovqw %zmm3, %xmm3
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm3
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa64 %xmm16, %xmm5
+; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-NEXT: vpsrlq $48, %zmm27, %zmm5
+; AVX512DQ-NEXT: vpmovqw %zmm5, %xmm5
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-NEXT: vmovdqa64 %xmm18, %xmm5
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-NEXT: vpsrlq $48, %zmm28, %zmm3
-; AVX512DQ-NEXT: vpmovqw %zmm3, %xmm3
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[0,1,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[0,1,3,1,4,5,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,1,3,1,4,5,6,7]
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[3,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[0,1,3,1,4,5,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm15[0,1,3,1,4,5,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm13[3,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQ-NEXT: vpsrlq $48, %zmm25, %zmm5
+; AVX512DQ-NEXT: vpmovqw %zmm5, %xmm5
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[0,1,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[0,1,3,1,4,5,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[0,1,3,1,4,5,6,7]
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm7[3,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7]
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-NEXT: vpsrlq $48, %zmm27, %zmm2
+; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQ-NEXT: vpsrlq $48, %zmm24, %zmm2
; AVX512DQ-NEXT: vpmovqw %zmm2, %xmm2
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm8[3,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vmovdqa64 %xmm29, %xmm2
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[3,1,2,3,4,5,6,7]
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
-; AVX512DQ-NEXT: vpsrlq $48, %zmm26, %zmm3
+; AVX512DQ-NEXT: vpsrlq $48, %zmm23, %zmm3
; AVX512DQ-NEXT: vpmovqw %zmm3, %xmm3
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[0,1,2,3]
@@ -6356,164 +6357,163 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vmovaps %zmm2, 64(%rsi)
; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512DQ-NEXT: vmovaps %zmm2, (%rsi)
-; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-NEXT: vmovaps %zmm2, 64(%rdx)
-; AVX512DQ-NEXT: vmovups (%rsp), %zmm2 # 64-byte Reload
-; AVX512DQ-NEXT: vmovaps %zmm2, (%rdx)
-; AVX512DQ-NEXT: vmovdqa64 %zmm17, 64(%rcx)
-; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-NEXT: vmovaps %zmm2, (%rcx)
+; AVX512DQ-NEXT: vmovdqa64 %zmm22, 64(%rdx)
+; AVX512DQ-NEXT: vmovdqa64 %zmm21, (%rdx)
+; AVX512DQ-NEXT: vmovdqa64 %zmm16, 64(%rcx)
+; AVX512DQ-NEXT: vmovdqa64 %zmm20, (%rcx)
; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%r8)
; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%r8)
-; AVX512DQ-NEXT: addq $200, %rsp
+; AVX512DQ-NEXT: popq %rax
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: load_i16_stride4_vf64:
; AVX512DQ-FCP: # %bb.0:
-; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm22
-; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm25
+; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm25
+; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm22
+; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm27
+; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm23
; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm2
-; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4
-; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0]
-; AVX512DQ-FCP-NEXT: # ymm5 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,2,2,3,0,2,4,6]
-; AVX512DQ-FCP-NEXT: vmovdqa64 224(%rdi), %ymm23
-; AVX512DQ-FCP-NEXT: vpermd %ymm23, %ymm9, %ymm10
-; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm10, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm24
+; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm3
+; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm26
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,4,5,u,u,u,u,8,9,12,13,12,13,14,15,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm24
-; AVX512DQ-FCP-NEXT: vpermd %ymm24, %ymm9, %ymm3
-; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm1
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,2,2,3,4,6,12,14]
-; AVX512DQ-FCP-NEXT: vpermt2d %ymm0, %ymm7, %ymm1
-; AVX512DQ-FCP-NEXT: vpmovqw %zmm4, %xmm0
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 96(%rdi), %ymm26
-; AVX512DQ-FCP-NEXT: vpermd %ymm26, %ymm9, %ymm8
-; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm8, %ymm11
-; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm27
-; AVX512DQ-FCP-NEXT: vpermd %ymm27, %ymm9, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm12
-; AVX512DQ-FCP-NEXT: vpermt2d %ymm11, %ymm7, %ymm12
-; AVX512DQ-FCP-NEXT: vpmovqw %zmm2, %xmm11
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm11[0,1,2,3],zmm1[0,1,2,3]
-; AVX512DQ-FCP-NEXT: vmovdqa64 480(%rdi), %ymm28
-; AVX512DQ-FCP-NEXT: vpermd %ymm28, %ymm9, %ymm11
-; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm11, %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %ymm17
-; AVX512DQ-FCP-NEXT: vpermd %ymm17, %ymm9, %ymm12
-; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm12, %ymm13
-; AVX512DQ-FCP-NEXT: vpermt2d %ymm1, %ymm7, %ymm13
-; AVX512DQ-FCP-NEXT: vpmovqw %zmm25, %xmm1
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm13[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 352(%rdi), %ymm18
-; AVX512DQ-FCP-NEXT: vpermd %ymm18, %ymm9, %ymm1
-; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm15
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6]
+; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm28
+; AVX512DQ-FCP-NEXT: vpermd %ymm28, %ymm0, %ymm13
+; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm13, %ymm5
+; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0]
+; AVX512DQ-FCP-NEXT: # ymm7 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [8,10,10,11,8,10,12,14]
+; AVX512DQ-FCP-NEXT: vpermd %zmm26, %zmm11, %zmm4
+; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm8
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,2,2,3,4,6,12,14]
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm8, %ymm9, %ymm5
+; AVX512DQ-FCP-NEXT: vpmovqw %zmm3, %xmm8
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm17
+; AVX512DQ-FCP-NEXT: vpermd %ymm17, %ymm0, %ymm10
+; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm10, %ymm12
+; AVX512DQ-FCP-NEXT: vpermd %zmm24, %zmm11, %zmm5
+; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm5, %ymm14
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm14, %ymm9, %ymm12
+; AVX512DQ-FCP-NEXT: vpmovqw %zmm2, %xmm14
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm12[0,1,2,3],zmm8[0,1,2,3]
+; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %ymm18
+; AVX512DQ-FCP-NEXT: vpermd %ymm18, %ymm0, %ymm8
+; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm8, %ymm14
+; AVX512DQ-FCP-NEXT: vpermd %zmm23, %zmm11, %zmm12
+; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm12, %ymm1
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm1, %ymm9, %ymm14
+; AVX512DQ-FCP-NEXT: vpmovqw %zmm27, %xmm1
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7]
; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %ymm19
-; AVX512DQ-FCP-NEXT: vpermd %ymm19, %ymm9, %ymm13
-; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm13, %ymm9
-; AVX512DQ-FCP-NEXT: vpermt2d %ymm15, %ymm7, %ymm9
-; AVX512DQ-FCP-NEXT: vpmovqw %zmm22, %xmm15
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm15[0,1,2,3],ymm9[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm9[0,1,2,3],zmm14[0,1,2,3]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm10, %ymm10
-; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm3
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm10[6,7]
-; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm4, %zmm10
-; AVX512DQ-FCP-NEXT: vpmovqw %zmm10, %xmm10
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm8, %ymm8
-; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm8[6,7]
-; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm2, %zmm8
-; AVX512DQ-FCP-NEXT: vpmovqw %zmm8, %xmm8
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm29 = zmm0[0,1,2,3],zmm3[0,1,2,3]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm11, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm12, %ymm3
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm25, %zmm3
-; AVX512DQ-FCP-NEXT: vpmovqw %zmm3, %xmm3
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm13, %ymm3
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm22, %zmm3
-; AVX512DQ-FCP-NEXT: vpmovqw %zmm3, %xmm3
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm1[0,1,2,3],zmm0[0,1,2,3]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,3,2,3,1,3,5,7]
-; AVX512DQ-FCP-NEXT: vpermd %ymm23, %ymm14, %ymm3
-; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm0
-; AVX512DQ-FCP-NEXT: vpermd %ymm24, %ymm14, %ymm8
-; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm8, %ymm1
-; AVX512DQ-FCP-NEXT: vpermt2d %ymm0, %ymm7, %ymm1
-; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm4, %zmm0
-; AVX512DQ-FCP-NEXT: vpmovqw %zmm0, %xmm0
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpermd %ymm26, %ymm14, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm12
-; AVX512DQ-FCP-NEXT: vpermd %ymm27, %ymm14, %ymm11
-; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm11, %ymm13
-; AVX512DQ-FCP-NEXT: vpermt2d %ymm12, %ymm7, %ymm13
+; AVX512DQ-FCP-NEXT: vpermd %ymm19, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm15
+; AVX512DQ-FCP-NEXT: vpermd %zmm22, %zmm11, %zmm14
+; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm14, %ymm11
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm11, %ymm9, %ymm15
+; AVX512DQ-FCP-NEXT: vpmovqw %zmm25, %xmm11
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm15[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm11[0,1,2,3],zmm1[0,1,2,3]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm13, %ymm1
+; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm4
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm3, %zmm4
+; AVX512DQ-FCP-NEXT: vpmovqw %zmm4, %xmm4
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm4
+; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm5, %ymm5
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
+; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm2, %zmm5
+; AVX512DQ-FCP-NEXT: vpmovqw %zmm5, %xmm5
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm29 = zmm4[0,1,2,3],zmm1[0,1,2,3]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm8, %ymm1
+; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm12, %ymm4
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm27, %zmm4
+; AVX512DQ-FCP-NEXT: vpmovqw %zmm4, %xmm4
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm14, %ymm4
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm25, %zmm4
+; AVX512DQ-FCP-NEXT: vpmovqw %zmm4, %xmm4
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm0[0,1,2,3],zmm1[0,1,2,3]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7]
+; AVX512DQ-FCP-NEXT: vpermd %ymm28, %ymm0, %ymm15
+; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm15, %ymm1
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm28 = [9,11,10,11,9,11,13,15]
+; AVX512DQ-FCP-NEXT: vpermd %zmm26, %zmm28, %zmm4
+; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm5
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm5, %ymm9, %ymm1
+; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm3, %zmm5
+; AVX512DQ-FCP-NEXT: vpmovqw %zmm5, %xmm5
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpermd %ymm17, %ymm0, %ymm10
+; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm10, %ymm8
+; AVX512DQ-FCP-NEXT: vpermd %zmm24, %zmm28, %zmm5
+; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm5, %ymm12
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm12, %ymm9, %ymm8
; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm2, %zmm12
; AVX512DQ-FCP-NEXT: vpmovqw %zmm12, %xmm12
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm12[0,1,2,3],zmm1[0,1,2,3]
-; AVX512DQ-FCP-NEXT: vpermd %ymm28, %ymm14, %ymm12
-; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm12, %ymm1
-; AVX512DQ-FCP-NEXT: vpermd %ymm17, %ymm14, %ymm13
-; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm13, %ymm10
-; AVX512DQ-FCP-NEXT: vpermt2d %ymm1, %ymm7, %ymm10
-; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm25, %zmm1
-; AVX512DQ-FCP-NEXT: vpmovqw %zmm1, %xmm1
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm10[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpermd %ymm18, %ymm14, %ymm1
-; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm15
-; AVX512DQ-FCP-NEXT: vpermd %ymm19, %ymm14, %ymm5
-; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm6
-; AVX512DQ-FCP-NEXT: vpermt2d %ymm15, %ymm7, %ymm6
-; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm22, %zmm7
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm8[0,1,2,3],zmm1[0,1,2,3]
+; AVX512DQ-FCP-NEXT: vpermd %ymm18, %ymm0, %ymm8
+; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm8, %ymm1
+; AVX512DQ-FCP-NEXT: vpermd %zmm23, %zmm28, %zmm14
+; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm14, %ymm13
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm13, %ymm9, %ymm1
+; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm27, %zmm13
+; AVX512DQ-FCP-NEXT: vpmovqw %zmm13, %xmm13
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpermd %ymm19, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm13
+; AVX512DQ-FCP-NEXT: vpermd %zmm22, %zmm28, %zmm6
+; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm6, %ymm7
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm7, %ymm9, %ymm13
+; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm25, %zmm7
; AVX512DQ-FCP-NEXT: vpmovqw %zmm7, %xmm7
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm10[0,1,2,3]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm3
-; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm8, %ymm7
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm3[6,7]
-; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm4, %zmm4
-; AVX512DQ-FCP-NEXT: vpmovqw %zmm4, %xmm4
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm11, %ymm4
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm7[0,1,2,3],zmm1[0,1,2,3]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm15, %ymm7
+; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm4
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm3, %zmm3
+; AVX512DQ-FCP-NEXT: vpmovqw %zmm3, %xmm3
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm4
+; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm5, %ymm5
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm2, %zmm2
; AVX512DQ-FCP-NEXT: vpmovqw %zmm2, %xmm2
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[0,1,2,3]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm8, %ymm3
+; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm14, %ymm4
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm27, %zmm4
+; AVX512DQ-FCP-NEXT: vpmovqw %zmm4, %xmm4
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm6, %ymm4
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm25, %zmm4
+; AVX512DQ-FCP-NEXT: vpmovqw %zmm4, %xmm4
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm3[0,1,2,3]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm12, %ymm2
-; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm13, %ymm3
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm25, %zmm3
-; AVX512DQ-FCP-NEXT: vpmovqw %zmm3, %xmm3
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm5, %ymm3
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm22, %zmm3
-; AVX512DQ-FCP-NEXT: vpmovqw %zmm3, %xmm3
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[0,1,2,3]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 64(%rsi)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, (%rsi)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 64(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 64(%rcx)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, (%rcx)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 64(%r8)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%r8)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 64(%rcx)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, (%rcx)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 64(%r8)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%r8)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll
index 739e6e2369e36..bb502c79f6ff9 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll
@@ -377,85 +377,101 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;
; AVX2-LABEL: load_i16_stride5_vf4:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX2-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[3,1,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,2,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2,3,4,5,6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0],xmm6[1],xmm1[2,3]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
-; AVX2-NEXT: vmovq %xmm3, (%rsi)
-; AVX2-NEXT: vmovq %xmm4, (%rdx)
-; AVX2-NEXT: vmovq %xmm5, (%rcx)
-; AVX2-NEXT: vmovq %xmm6, (%r8)
-; AVX2-NEXT: vmovq %xmm0, (%r9)
+; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0
+; AVX2-NEXT: vmovdqa (%rdi), %ymm1
+; AVX2-NEXT: vmovdqa (%rdi), %xmm2
+; AVX2-NEXT: vmovdqa 16(%rdi), %xmm3
+; AVX2-NEXT: vmovdqa 32(%rdi), %xmm4
+; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[3,1,2,3]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,2,2,3]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7]
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
+; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7
+; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm7 = xmm4[0],xmm2[1,2,3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],xmm3[2],xmm7[3]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4,5,6,7,8,9],ymm0[10],ymm1[11,12,13,14,15]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm2[2],xmm3[3]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3],xmm1[4,5,6,7]
+; AVX2-NEXT: vmovq %xmm5, (%rsi)
+; AVX2-NEXT: vmovq %xmm6, (%rdx)
+; AVX2-NEXT: vmovq %xmm7, (%rcx)
+; AVX2-NEXT: vmovq %xmm0, (%r8)
+; AVX2-NEXT: vmovq %xmm1, (%r9)
+; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX2-FP-LABEL: load_i16_stride5_vf4:
; AVX2-FP: # %bb.0:
-; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[4,5,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2,3,4,5,6,7]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0],xmm6[1],xmm1[2,3]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
-; AVX2-FP-NEXT: vmovq %xmm3, (%rsi)
-; AVX2-FP-NEXT: vmovq %xmm4, (%rdx)
-; AVX2-FP-NEXT: vmovq %xmm5, (%rcx)
-; AVX2-FP-NEXT: vmovq %xmm6, (%r8)
-; AVX2-FP-NEXT: vmovq %xmm0, (%r9)
+; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm0
+; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm1
+; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm2
+; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm3
+; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm4
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[4,5,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm2[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm7
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm4[0],xmm2[1,2,3]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],xmm3[2],xmm7[3]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4,5,6,7,8,9],ymm0[10],ymm1[11,12,13,14,15]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm2[2],xmm3[3]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3],xmm1[4,5,6,7]
+; AVX2-FP-NEXT: vmovq %xmm5, (%rsi)
+; AVX2-FP-NEXT: vmovq %xmm6, (%rdx)
+; AVX2-FP-NEXT: vmovq %xmm7, (%rcx)
+; AVX2-FP-NEXT: vmovq %xmm0, (%r8)
+; AVX2-FP-NEXT: vmovq %xmm1, (%r9)
+; AVX2-FP-NEXT: vzeroupper
; AVX2-FP-NEXT: retq
;
; AVX2-FCP-LABEL: load_i16_stride5_vf4:
; AVX2-FCP: # %bb.0:
-; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[4,5,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2,3,4,5,6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0],xmm6[1],xmm1[2,3]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
-; AVX2-FCP-NEXT: vmovq %xmm3, (%rsi)
-; AVX2-FCP-NEXT: vmovq %xmm4, (%rdx)
-; AVX2-FCP-NEXT: vmovq %xmm5, (%rcx)
-; AVX2-FCP-NEXT: vmovq %xmm6, (%r8)
-; AVX2-FCP-NEXT: vmovq %xmm0, (%r9)
+; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm0
+; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm1
+; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm2
+; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm3
+; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm4
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[4,5,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm2[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0],xmm2[1,2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [4,1,6,7]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4,5,6,7,8,9],ymm0[10],ymm1[11,12,13,14,15]
+; AVX2-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm0
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [4,5,2,7]
+; AVX2-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm1
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3],xmm1[4,5,6,7]
+; AVX2-FCP-NEXT: vmovq %xmm5, (%rsi)
+; AVX2-FCP-NEXT: vmovq %xmm6, (%rdx)
+; AVX2-FCP-NEXT: vmovq %xmm2, (%rcx)
+; AVX2-FCP-NEXT: vmovq %xmm0, (%r8)
+; AVX2-FCP-NEXT: vmovq %xmm1, (%r9)
+; AVX2-FCP-NEXT: vzeroupper
; AVX2-FCP-NEXT: retq
;
; AVX512-LABEL: load_i16_stride5_vf4:
@@ -470,27 +486,30 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
; AVX512-NEXT: vpextrw $6, %xmm0, %eax
; AVX512-NEXT: vpextrw $1, %xmm0, %r10d
-; AVX512-NEXT: vmovd %r10d, %xmm4
-; AVX512-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4
+; AVX512-NEXT: vmovd %r10d, %xmm0
+; AVX512-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
; AVX512-NEXT: vpextrw $3, %xmm1, %eax
-; AVX512-NEXT: vpinsrw $2, %eax, %xmm4, %xmm1
+; AVX512-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm2, %eax
-; AVX512-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
-; AVX512-NEXT: vmovdqa 16(%rdi), %xmm4
-; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm4[2],xmm5[3]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0],xmm6[1],xmm4[2,3]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
+; AVX512-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
+; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm1 = [8,1,6,3]
+; AVX512-NEXT: vpermd (%rdi), %zmm1, %zmm1
+; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vmovdqa (%rdi), %ymm4
+; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],mem[2],ymm4[3,4,5,6,7,8,9],mem[10],ymm4[11,12,13,14,15]
+; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm5
+; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vmovdqa 16(%rdi), %xmm5
+; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2],xmm5[3]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3],xmm5[4,5,6,7]
; AVX512-NEXT: vmovq %xmm3, (%rsi)
-; AVX512-NEXT: vmovq %xmm1, (%rdx)
-; AVX512-NEXT: vmovq %xmm5, (%rcx)
-; AVX512-NEXT: vmovq %xmm6, (%r8)
-; AVX512-NEXT: vmovq %xmm0, (%r9)
+; AVX512-NEXT: vmovq %xmm0, (%rdx)
+; AVX512-NEXT: vmovq %xmm1, (%rcx)
+; AVX512-NEXT: vmovq %xmm4, (%r8)
+; AVX512-NEXT: vmovq %xmm2, (%r9)
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: load_i16_stride5_vf4:
@@ -503,25 +522,28 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FCP-NEXT: vpextrw $7, %xmm1, %eax
; AVX512-FCP-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
; AVX512-FCP-NEXT: vpextrw $3, %xmm1, %eax
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
; AVX512-FCP-NEXT: vmovd %xmm2, %eax
-; AVX512-FCP-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
-; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm4
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm4[2],xmm5[3]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0],xmm6[1],xmm4[2,3]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
+; AVX512-FCP-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [8,1,6,3]
+; AVX512-FCP-NEXT: vpermd (%rdi), %zmm1, %zmm1
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,1,6,7]
+; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm5
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1],mem[2],ymm5[3,4,5,6,7,8,9],mem[10],ymm5[11,12,13,14,15]
+; AVX512-FCP-NEXT: vpermd %ymm6, %ymm4, %ymm4
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,5,2,7]
+; AVX512-FCP-NEXT: vpermd %ymm5, %ymm6, %ymm5
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3],xmm5[4,5,6,7]
; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi)
-; AVX512-FCP-NEXT: vmovq %xmm1, (%rdx)
-; AVX512-FCP-NEXT: vmovq %xmm5, (%rcx)
-; AVX512-FCP-NEXT: vmovq %xmm6, (%r8)
-; AVX512-FCP-NEXT: vmovq %xmm0, (%r9)
+; AVX512-FCP-NEXT: vmovq %xmm0, (%rdx)
+; AVX512-FCP-NEXT: vmovq %xmm1, (%rcx)
+; AVX512-FCP-NEXT: vmovq %xmm4, (%r8)
+; AVX512-FCP-NEXT: vmovq %xmm2, (%r9)
+; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: load_i16_stride5_vf4:
@@ -536,27 +558,30 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
; AVX512DQ-NEXT: vpextrw $6, %xmm0, %eax
; AVX512DQ-NEXT: vpextrw $1, %xmm0, %r10d
-; AVX512DQ-NEXT: vmovd %r10d, %xmm4
-; AVX512DQ-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4
+; AVX512DQ-NEXT: vmovd %r10d, %xmm0
+; AVX512DQ-NEXT: vpinsrw $1, %eax, %xmm0, %xmm0
; AVX512DQ-NEXT: vpextrw $3, %xmm1, %eax
-; AVX512DQ-NEXT: vpinsrw $2, %eax, %xmm4, %xmm1
+; AVX512DQ-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
; AVX512DQ-NEXT: vmovd %xmm2, %eax
-; AVX512DQ-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
-; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm4
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm4[2],xmm5[3]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0],xmm6[1],xmm4[2,3]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
+; AVX512DQ-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = [8,1,6,3]
+; AVX512DQ-NEXT: vpermd (%rdi), %zmm1, %zmm1
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm4
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],mem[2],ymm4[3,4,5,6,7,8,9],mem[10],ymm4[11,12,13,14,15]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm5
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm5
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2],xmm5[3]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3],xmm5[4,5,6,7]
; AVX512DQ-NEXT: vmovq %xmm3, (%rsi)
-; AVX512DQ-NEXT: vmovq %xmm1, (%rdx)
-; AVX512DQ-NEXT: vmovq %xmm5, (%rcx)
-; AVX512DQ-NEXT: vmovq %xmm6, (%r8)
-; AVX512DQ-NEXT: vmovq %xmm0, (%r9)
+; AVX512DQ-NEXT: vmovq %xmm0, (%rdx)
+; AVX512DQ-NEXT: vmovq %xmm1, (%rcx)
+; AVX512DQ-NEXT: vmovq %xmm4, (%r8)
+; AVX512DQ-NEXT: vmovq %xmm2, (%r9)
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: load_i16_stride5_vf4:
@@ -569,124 +594,143 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-FCP-NEXT: vpextrw $7, %xmm1, %eax
; AVX512DQ-FCP-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
; AVX512DQ-FCP-NEXT: vpextrw $3, %xmm1, %eax
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0
; AVX512DQ-FCP-NEXT: vmovd %xmm2, %eax
-; AVX512DQ-FCP-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1
-; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm4
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm4[2],xmm5[3]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0],xmm6[1],xmm4[2,3]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpinsrw $3, %eax, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [8,1,6,3]
+; AVX512DQ-FCP-NEXT: vpermd (%rdi), %zmm1, %zmm1
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [4,1,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm5
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1],mem[2],ymm5[3,4,5,6,7,8,9],mem[10],ymm5[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpermd %ymm6, %ymm4, %ymm4
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,5,2,7]
+; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm6, %ymm5
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3],xmm5[4,5,6,7]
; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%rcx)
-; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%r8)
-; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%r9)
+; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rdx)
+; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%rcx)
+; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%r8)
+; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%r9)
+; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
; AVX512BW-LABEL: load_i16_stride5_vf4:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0]
-; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1
-; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0]
-; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm2
-; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm3
-; AVX512BW-NEXT: vpextrw $7, %xmm3, %eax
-; AVX512BW-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2
-; AVX512BW-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0]
-; AVX512BW-NEXT: vpermw %zmm1, %zmm3, %zmm3
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm1
+; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm2
+; AVX512BW-NEXT: vpextrw $5, %xmm1, %eax
+; AVX512BW-NEXT: vpinsrw $1, %eax, %xmm1, %xmm3
+; AVX512BW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2],xmm3[3,4,5,6,7]
+; AVX512BW-NEXT: vpextrw $7, %xmm2, %eax
+; AVX512BW-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
+; AVX512BW-NEXT: vpextrw $6, %xmm1, %eax
+; AVX512BW-NEXT: vpextrw $1, %xmm1, %r10d
+; AVX512BW-NEXT: vmovd %r10d, %xmm1
+; AVX512BW-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
+; AVX512BW-NEXT: vpextrw $3, %xmm2, %eax
+; AVX512BW-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
+; AVX512BW-NEXT: vpinsrw $3, 32(%rdi), %xmm1, %xmm1
+; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = [2,7,12,17,0,0,0,0]
+; AVX512BW-NEXT: vpermw %zmm0, %zmm2, %zmm2
; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0]
-; AVX512BW-NEXT: vpermw %zmm1, %zmm4, %zmm4
+; AVX512BW-NEXT: vpermw %zmm0, %zmm4, %zmm4
; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0]
-; AVX512BW-NEXT: vpermw %zmm1, %zmm5, %zmm1
-; AVX512BW-NEXT: vmovq %xmm2, (%rsi)
-; AVX512BW-NEXT: vmovq %xmm0, (%rdx)
-; AVX512BW-NEXT: vmovq %xmm3, (%rcx)
+; AVX512BW-NEXT: vpermw %zmm0, %zmm5, %zmm0
+; AVX512BW-NEXT: vmovq %xmm3, (%rsi)
+; AVX512BW-NEXT: vmovq %xmm1, (%rdx)
+; AVX512BW-NEXT: vmovq %xmm2, (%rcx)
; AVX512BW-NEXT: vmovq %xmm4, (%r8)
-; AVX512BW-NEXT: vmovq %xmm1, (%r9)
+; AVX512BW-NEXT: vmovq %xmm0, (%r9)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BW-FCP-LABEL: load_i16_stride5_vf4:
; AVX512BW-FCP: # %bb.0:
-; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0]
-; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
-; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
-; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2
-; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm3
-; AVX512BW-FCP-NEXT: vpextrw $7, %xmm3, %eax
-; AVX512BW-FCP-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2
-; AVX512BW-FCP-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0
-; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3
+; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm1
+; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm2
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2],xmm3[3,4,5,6,7]
+; AVX512BW-FCP-NEXT: vpextrw $7, %xmm2, %eax
+; AVX512BW-FCP-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
+; AVX512BW-FCP-NEXT: vpextrw $3, %xmm2, %eax
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
+; AVX512BW-FCP-NEXT: vpinsrw $3, 32(%rdi), %xmm1, %xmm1
+; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [2,7,12,17,0,0,0,0]
+; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm2, %zmm2
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4
+; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm4, %zmm4
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm1
-; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi)
-; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rdx)
-; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx)
+; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm5, %zmm0
+; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rsi)
+; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rdx)
+; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rcx)
; AVX512BW-FCP-NEXT: vmovq %xmm4, (%r8)
-; AVX512BW-FCP-NEXT: vmovq %xmm1, (%r9)
+; AVX512BW-FCP-NEXT: vmovq %xmm0, (%r9)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: load_i16_stride5_vf4:
; AVX512DQ-BW: # %bb.0:
-; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0]
-; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1
-; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
-; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm2
-; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm3
-; AVX512DQ-BW-NEXT: vpextrw $7, %xmm3, %eax
-; AVX512DQ-BW-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2
-; AVX512DQ-BW-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0
-; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm3, %zmm3
+; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm1
+; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm2
+; AVX512DQ-BW-NEXT: vpextrw $5, %xmm1, %eax
+; AVX512DQ-BW-NEXT: vpinsrw $1, %eax, %xmm1, %xmm3
+; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2],xmm3[3,4,5,6,7]
+; AVX512DQ-BW-NEXT: vpextrw $7, %xmm2, %eax
+; AVX512DQ-BW-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
+; AVX512DQ-BW-NEXT: vpextrw $6, %xmm1, %eax
+; AVX512DQ-BW-NEXT: vpextrw $1, %xmm1, %r10d
+; AVX512DQ-BW-NEXT: vmovd %r10d, %xmm1
+; AVX512DQ-BW-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1
+; AVX512DQ-BW-NEXT: vpextrw $3, %xmm2, %eax
+; AVX512DQ-BW-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
+; AVX512DQ-BW-NEXT: vpinsrw $3, 32(%rdi), %xmm1, %xmm1
+; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = [2,7,12,17,0,0,0,0]
+; AVX512DQ-BW-NEXT: vpermw %zmm0, %zmm2, %zmm2
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm4, %zmm4
+; AVX512DQ-BW-NEXT: vpermw %zmm0, %zmm4, %zmm4
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0]
-; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm5, %zmm1
-; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi)
-; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rdx)
-; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rcx)
+; AVX512DQ-BW-NEXT: vpermw %zmm0, %zmm5, %zmm0
+; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rsi)
+; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rdx)
+; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rcx)
; AVX512DQ-BW-NEXT: vmovq %xmm4, (%r8)
-; AVX512DQ-BW-NEXT: vmovq %xmm1, (%r9)
+; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r9)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: load_i16_stride5_vf4:
; AVX512DQ-BW-FCP: # %bb.0:
-; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm3
-; AVX512DQ-BW-FCP-NEXT: vpextrw $7, %xmm3, %eax
-; AVX512DQ-BW-FCP-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2
-; AVX512DQ-BW-FCP-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0
-; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm2
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2],xmm3[3,4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vpextrw $7, %xmm2, %eax
+; AVX512DQ-BW-FCP-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3
+; AVX512DQ-BW-FCP-NEXT: vpextrw $3, %xmm2, %eax
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1
+; AVX512DQ-BW-FCP-NEXT: vpinsrw $3, 32(%rdi), %xmm1, %xmm1
+; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [2,7,12,17,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm2, %zmm2
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm4, %zmm4
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rdx)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm5, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rsi)
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rdx)
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rcx)
; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%r8)
-; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%r9)
+; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%r9)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%wide.vec = load <20 x i16>, ptr %in.vec, align 64
@@ -984,9 +1028,9 @@ define void @load_i16_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3]
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [6,7,0,1,10,11,4,5,14,15,8,9,2,3,12,13]
; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm4, %xmm7
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15]
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2],xmm8[3]
+; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [4,1,6,3]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15]
+; AVX2-FCP-NEXT: vpermd %ymm9, %ymm8, %ymm8
; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm8, %xmm6
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3]
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,2,3,12,13,6,7,0,1,10,11,4,5,14,15]
@@ -1031,7 +1075,8 @@ define void @load_i16_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0],ymm2[1,2],ymm0[3],ymm2[4],ymm0[5],ymm2[6,7],ymm0[8],ymm2[9,10],ymm0[11],ymm2[12],ymm0[13],ymm2[14,15]
; AVX512-NEXT: vextracti128 $1, %ymm6, %xmm7
; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,10,11,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512-NEXT: vprolq $16, %xmm6, %xmm6
; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[0,1,0,3]
; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,5,6]
; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3]
@@ -1074,12 +1119,12 @@ define void @load_i16_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3,4],xmm7[5,6,7]
; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm7, %xmm5
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [6,7,0,1,10,11,4,5,14,15,8,9,2,3,12,13]
-; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm7
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2],xmm8[3]
-; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm8, %xmm6
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,1,6,3]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15]
+; AVX512-FCP-NEXT: vpermd %ymm7, %ymm6, %ymm6
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,10,11,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vprolq $16, %xmm6, %xmm6
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3]
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,2,3,12,13,6,7,0,1,10,11,4,5,14,15]
; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm3
@@ -1123,7 +1168,8 @@ define void @load_i16_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0],ymm2[1,2],ymm0[3],ymm2[4],ymm0[5],ymm2[6,7],ymm0[8],ymm2[9,10],ymm0[11],ymm2[12],ymm0[13],ymm2[14,15]
; AVX512DQ-NEXT: vextracti128 $1, %ymm6, %xmm7
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,10,11,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vprolq $16, %xmm6, %xmm6
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[0,1,0,3]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,5,6]
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3]
@@ -1166,12 +1212,12 @@ define void @load_i16_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3,4],xmm7[5,6,7]
; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm7, %xmm5
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [6,7,0,1,10,11,4,5,14,15,8,9,2,3,12,13]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm7
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2],xmm8[3]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm8, %xmm6
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,1,6,3]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15]
+; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm6, %ymm6
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,1,10,11,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vprolq $16, %xmm6, %xmm6
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3]
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,2,3,12,13,6,7,0,1,10,11,4,5,14,15]
; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm3
@@ -1513,50 +1559,50 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa 96(%rdi), %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
-; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5,4,7]
-; AVX-NEXT: vmovdqa 112(%rdi), %xmm1
-; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm2[1],xmm1[1]
-; AVX-NEXT: vmovdqa 80(%rdi), %xmm2
-; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,2,2,3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3,4,5,6,7]
+; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
+; AVX-NEXT: vmovdqa 112(%rdi), %xmm2
+; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm2[1]
+; AVX-NEXT: vmovdqa 80(%rdi), %xmm4
+; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3,4,5,6,7]
; AVX-NEXT: vmovdqa 144(%rdi), %xmm8
; AVX-NEXT: vmovdqa 128(%rdi), %xmm7
-; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1],xmm8[2,3],xmm7[4,5,6,7]
-; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,u,u,2,3,12,13,6,7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm3[0,1,2,3,4],xmm4[5,6,7]
-; AVX-NEXT: vmovdqa (%rdi), %xmm3
-; AVX-NEXT: vmovdqa 16(%rdi), %xmm4
+; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1],xmm8[2,3],xmm7[4,5,6,7]
+; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,u,u,2,3,12,13,6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm1[0,1,2,3,4],xmm3[5,6,7]
+; AVX-NEXT: vmovdqa (%rdi), %xmm1
+; AVX-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX-NEXT: vmovdqa 32(%rdi), %xmm5
; AVX-NEXT: vmovdqa 48(%rdi), %xmm6
-; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[0,1,0,3]
-; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm5[4],xmm9[5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm4[3,1,2,3]
+; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[0,1,0,3]
+; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm5[4],xmm10[5,6,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm3[3,1,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[2,1,2,3,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm3[0,2,2,3]
+; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[0,2,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,3,2,3,4,5,6,7]
; AVX-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
-; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4,5,6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3],xmm10[4,5,6,7]
; AVX-NEXT: vmovaps {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535]
-; AVX-NEXT: vandps %ymm11, %ymm9, %ymm12
-; AVX-NEXT: vmovaps 64(%rdi), %xmm9
-; AVX-NEXT: vshufps {{.*#+}} xmm13 = xmm9[0,1,0,1]
+; AVX-NEXT: vandps %ymm11, %ymm10, %ymm12
+; AVX-NEXT: vmovaps 64(%rdi), %xmm10
+; AVX-NEXT: vshufps {{.*#+}} xmm13 = xmm10[0,1,0,1]
; AVX-NEXT: vandnps %ymm13, %ymm11, %ymm13
; AVX-NEXT: vorps %ymm13, %ymm12, %ymm12
-; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm12, %ymm10
-; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
+; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm12, %ymm9
+; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1],xmm0[2,3],xmm2[4,5,6,7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,3,0,4,5,6,7]
; AVX-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,5,5,5]
-; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm2[0,3,2,3]
+; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm4[0,3,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[1,2,2,3,4,5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3,4,5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm7[0,1,2,3],xmm8[4,5],xmm7[6,7]
; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u,u,u,u,4,5,14,15,8,9]
; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm13[5,6,7]
-; AVX-NEXT: vpsllq $48, %xmm9, %xmm13
+; AVX-NEXT: vpsllq $48, %xmm10, %xmm13
; AVX-NEXT: vandnps %ymm13, %ymm11, %ymm13
-; AVX-NEXT: vpsrlq $48, %xmm4, %xmm14
-; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm3[0,3,2,3]
+; AVX-NEXT: vpsrlq $48, %xmm3, %xmm14
+; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm1[0,3,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,2,2,3,4,5,6,7]
; AVX-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
; AVX-NEXT: vpblendw {{.*#+}} xmm15 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7]
@@ -1565,38 +1611,38 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vandps %ymm11, %ymm14, %ymm11
; AVX-NEXT: vorps %ymm13, %ymm11, %ymm11
; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm11, %ymm11
-; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm2[0,1,2,3],xmm0[4,5],xmm2[6,7]
; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,8,9,2,3,12,13,12,13,u,u,u,u]
-; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm2[3,1,2,3]
+; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm4[3,1,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[2,1,2,3,4,5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3,4,5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm8[0,1],xmm7[2,3],xmm8[4,5,6,7]
; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u,u,u,u,6,7,0,1,10,11]
; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm13[5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm3[0,1,1,3]
+; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm1[0,1,1,3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,7,6,7]
-; AVX-NEXT: vpunpckhdq {{.*#+}} xmm13 = xmm13[2],xmm4[2],xmm13[3],xmm4[3]
+; AVX-NEXT: vpunpckhdq {{.*#+}} xmm13 = xmm13[2],xmm3[2],xmm13[3],xmm3[3]
; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm5[0,1],xmm6[2,3],xmm5[4,5,6,7]
; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,2,3,12,13,6,7,u,u,u,u]
; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3,4,5],xmm13[6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm9[0,1,2,0]
+; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm10[0,1,2,0]
; AVX-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,6,5]
; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5],xmm14[6,7]
; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm12
-; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,0,1,10,11,4,5,14,15,u,u,u,u,u,u]
-; AVX-NEXT: vpsrlq $48, %xmm2, %xmm14
+; AVX-NEXT: vpsrlq $48, %xmm4, %xmm14
; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0],xmm13[1,2,3,4,5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1,2,3],xmm7[4,5],xmm8[6,7]
; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13]
; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4],xmm14[5,6,7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm4[0,1],xmm3[2,3],xmm4[4,5,6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm3[0,1],xmm1[2,3],xmm3[4,5,6,7]
; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpblendw {{.*#+}} xmm15 = xmm5[0,1,2,3],xmm6[4,5],xmm5[6,7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[2,2,2,2,4,5,6,7]
; AVX-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,7,4,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3,4,5],xmm14[6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm9[0,1,0,3]
+; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm10[0,1,0,3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,6]
; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5],xmm15[6,7]
; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13
@@ -1606,22 +1652,23 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,0,3,4,5,6,7]
; AVX-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm7[4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[1,1,1,1]
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[0,2,2,3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7]
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm3[4,5],xmm4[6,7]
-; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5],xmm2[6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[0,1,1,3]
-; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7]
-; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX-NEXT: vmovaps %ymm10, (%rsi)
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[2,3,2,3]
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3,4,5,6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm7[4,5,6,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm10[0,1,1,3]
+; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7]
+; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[1,1,1,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[0,2,2,3]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7]
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5],xmm3[6,7]
+; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,2,3,12,13,u,u,u,u,u,u,12,13,14,15]
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3,4,5],xmm1[6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT: vmovaps %ymm9, (%rsi)
; AVX-NEXT: vmovaps %ymm11, (%rdx)
; AVX-NEXT: vmovaps %ymm12, (%rcx)
; AVX-NEXT: vmovaps %ymm13, (%r8)
@@ -1643,11 +1690,11 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1]
; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5],ymm5[6],ymm6[7]
; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23]
-; AVX2-NEXT: vpmovsxbw {{.*#+}} xmm7 = [65535,65535,65535,65535,65535,65535,65535,0]
-; AVX2-NEXT: vpblendvb %ymm7, %ymm4, %ymm5, %ymm5
-; AVX2-NEXT: vmovdqa 144(%rdi), %xmm6
-; AVX2-NEXT: vmovdqa 128(%rdi), %xmm4
-; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm4[0],xmm6[1],xmm4[2,3]
+; AVX2-NEXT: vpmovsxbw {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,65535,65535,0]
+; AVX2-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm5
+; AVX2-NEXT: vmovdqa 128(%rdi), %xmm7
+; AVX2-NEXT: vmovdqa 144(%rdi), %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm7[0],xmm4[1],xmm7[2,3]
; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,2,3,12,13,6,7]
; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0,1,2,3,4],ymm8[5,6,7],ymm5[8,9,10,11,12],ymm8[13,14,15]
@@ -1660,12 +1707,12 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1]
; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5],ymm9[6,7]
; AVX2-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25]
-; AVX2-NEXT: vpblendvb %ymm7, %ymm8, %ymm9, %ymm7
-; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm4[0,1],xmm6[2],xmm4[3]
+; AVX2-NEXT: vpblendvb %ymm6, %ymm8, %ymm9, %ymm6
+; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm7[0,1],xmm4[2],xmm7[3]
; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,4,5,14,15,8,9]
; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm7[0,1,2,3,4],ymm8[5,6,7],ymm7[8,9,10,11,12],ymm8[13,14,15]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm6[0,1,2,3,4],ymm8[5,6,7],ymm6[8,9,10,11,12],ymm8[13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15]
; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm9
; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3,4],xmm8[5,6,7]
@@ -1675,7 +1722,7 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6],ymm9[7]
; AVX2-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm9 = xmm6[0],xmm4[1],xmm6[2,3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm9 = xmm4[0],xmm7[1],xmm4[2,3]
; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,u,6,7,0,1,10,11]
; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm8[0,1,2,3,4],ymm9[5,6,7],ymm8[8,9,10,11,12],ymm9[13,14,15]
@@ -1689,11 +1736,11 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7]
; AVX2-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm10 = xmm6[0,1],xmm4[2],xmm6[3]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13]
-; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm9[0,1,2,3,4],ymm10[5,6,7],ymm9[8,9,10,11,12],ymm10[13,14,15]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm7 = xmm4[0,1],xmm7[2],xmm4[3]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13]
+; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5,6,7],ymm9[8,9,10,11,12],ymm7[13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15]
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7]
@@ -1703,17 +1750,17 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6],ymm1[7]
; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15,24,25,18,19,28,29,22,23,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[3,1,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,2,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7]
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX2-NEXT: vmovdqa 128(%rdi), %xmm1
+; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,0,1,10,11,8,9,10,11,12,13,14,15]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[3,1,2,3]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7]
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vmovdqa %ymm5, (%rsi)
-; AVX2-NEXT: vmovdqa %ymm7, (%rdx)
+; AVX2-NEXT: vmovdqa %ymm6, (%rdx)
; AVX2-NEXT: vmovdqa %ymm8, (%rcx)
-; AVX2-NEXT: vmovdqa %ymm9, (%r8)
+; AVX2-NEXT: vmovdqa %ymm7, (%r8)
; AVX2-NEXT: vmovdqa %ymm0, (%r9)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -1732,15 +1779,15 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5],ymm5[6],ymm6[7]
; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23]
-; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} xmm7 = [65535,65535,65535,65535,65535,65535,65535,0]
-; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm4, %ymm5, %ymm6
-; AVX2-FP-NEXT: vmovdqa 144(%rdi), %xmm4
-; AVX2-FP-NEXT: vmovdqa 128(%rdi), %xmm5
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm8 = xmm5[0],xmm4[1],xmm5[2,3]
+; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,65535,65535,0]
+; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4
+; AVX2-FP-NEXT: vmovdqa 128(%rdi), %xmm7
+; AVX2-FP-NEXT: vmovdqa 144(%rdi), %xmm5
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm8 = xmm7[0],xmm5[1],xmm7[2,3]
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,2,3,12,13,6,7]
; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm6[0,1,2,3,4],ymm8[5,6,7],ymm6[8,9,10,11,12],ymm8[13,14,15]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm4[0,1,2,3,4],ymm8[5,6,7],ymm4[8,9,10,11,12],ymm8[13,14,15]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15]
; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm9
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6],xmm9[7]
@@ -1749,12 +1796,12 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5],ymm9[6,7]
; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25]
-; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm8, %ymm9, %ymm7
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm8 = xmm5[0,1],xmm4[2],xmm5[3]
+; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm8, %ymm9, %ymm6
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm8 = xmm7[0,1],xmm5[2],xmm7[3]
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,4,5,14,15,8,9]
; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm7[0,1,2,3,4],ymm8[5,6,7],ymm7[8,9,10,11,12],ymm8[13,14,15]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm6[0,1,2,3,4],ymm8[5,6,7],ymm6[8,9,10,11,12],ymm8[13,14,15]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15]
; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm9
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3,4],xmm8[5,6,7]
@@ -1764,7 +1811,7 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6],ymm9[7]
; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm9 = xmm4[0],xmm5[1],xmm4[2,3]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm9 = xmm5[0],xmm7[1],xmm5[2,3]
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,u,6,7,0,1,10,11]
; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm8[0,1,2,3,4],ymm9[5,6,7],ymm8[8,9,10,11,12],ymm9[13,14,15]
@@ -1778,11 +1825,11 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7]
; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm10 = xmm4[0,1],xmm5[2],xmm4[3]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13]
-; AVX2-FP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm10 = ymm9[0,1,2,3,4],ymm10[5,6,7],ymm9[8,9,10,11,12],ymm10[13,14,15]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm5[0,1],xmm7[2],xmm5[3]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13]
+; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5,6,7],ymm9[8,9,10,11,12],ymm7[13,14,15]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15]
; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7]
@@ -1792,15 +1839,16 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6],ymm1[7]
; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15,24,25,18,19,28,29,22,23,u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[12,13,14,15,4,5,14,15,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[0,1,2,3,0,1,10,11,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[12,13,14,15,4,5,14,15,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vmovdqa 128(%rdi), %xmm2
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,0,1,10,11,8,9,10,11,12,13,14,15]
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-FP-NEXT: vmovdqa %ymm6, (%rsi)
-; AVX2-FP-NEXT: vmovdqa %ymm7, (%rdx)
+; AVX2-FP-NEXT: vmovdqa %ymm4, (%rsi)
+; AVX2-FP-NEXT: vmovdqa %ymm6, (%rdx)
; AVX2-FP-NEXT: vmovdqa %ymm8, (%rcx)
-; AVX2-FP-NEXT: vmovdqa %ymm9, (%r8)
+; AVX2-FP-NEXT: vmovdqa %ymm7, (%r8)
; AVX2-FP-NEXT: vmovdqa %ymm0, (%r9)
; AVX2-FP-NEXT: vzeroupper
; AVX2-FP-NEXT: retq
@@ -1858,9 +1906,9 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm10, %ymm7
; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5,6,7],ymm9[8,9,10,11,12],ymm7[13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm3[0],ymm4[1,2],ymm3[3],ymm4[4],ymm3[5],ymm4[6,7],ymm3[8],ymm4[9,10],ymm3[11],ymm4[12],ymm3[13],ymm4[14,15]
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3]
+; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,1,6,3]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm3[0],ymm4[1,2],ymm3[3],ymm4[4],ymm3[5],ymm4[6,7],ymm3[8],ymm4[9,10],ymm3[11],ymm4[12],ymm3[13],ymm4[14,15]
+; AVX2-FCP-NEXT: vpermd %ymm10, %ymm9, %ymm9
; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,3,0,0,5,0,2,7]
@@ -1909,27 +1957,27 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1,2,3],xmm5[4,5],xmm6[6,7]
; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[u,u,u,u,u,u]
; AVX512-NEXT: vpor %ymm4, %ymm5, %ymm5
-; AVX512-NEXT: vmovdqa 144(%rdi), %xmm6
-; AVX512-NEXT: vmovdqa 128(%rdi), %xmm4
-; AVX512-NEXT: vpblendd {{.*#+}} xmm7 = xmm4[0],xmm6[1],xmm4[2,3]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,2,3,12,13,6,7]
-; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0,1,2,3,4],ymm7[5,6,7],ymm5[8,9,10,11,12],ymm7[13,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
-; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5],ymm7[6,7]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[8,9,18,19,28,29,22,23,16,17,26,27],zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vmovdqa 128(%rdi), %xmm7
+; AVX512-NEXT: vmovdqa 144(%rdi), %xmm4
+; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm4[1],xmm7[2,3]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u,u,2,3,12,13,6,7]
+; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm8[5],ymm6[6,7]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[8,9,18,19,28,29,22,23,16,17,26,27],zero,zero,zero,zero,zero,zero
; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15]
; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm9
; AVX512-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6],xmm9[7]
; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[2,3,12,13,6,7,0,1,10,11,4,5,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[u,u,u,u,u,u]
-; AVX512-NEXT: vpor %ymm7, %ymm8, %ymm7
-; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm4[0,1],xmm6[2],xmm4[3]
+; AVX512-NEXT: vpor %ymm6, %ymm8, %ymm6
+; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm7[0,1],xmm4[2],xmm7[3]
; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,4,5,14,15,8,9]
; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm7[0,1,2,3,4],ymm8[5,6,7],ymm7[8,9,10,11,12],ymm8[13,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm6[0,1,2,3,4],ymm8[5,6,7],ymm6[8,9,10,11,12],ymm8[13,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15]
; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm9
; AVX512-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3,4],xmm8[5,6,7]
@@ -1939,7 +1987,7 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6],ymm9[7]
; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm9 = xmm6[0],xmm4[1],xmm6[2,3]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm9 = xmm4[0],xmm7[1],xmm4[2,3]
; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,u,6,7,0,1,10,11]
; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
; AVX512-NEXT: vpblendw {{.*#+}} ymm9 = ymm8[0,1,2,3,4],ymm9[5,6,7],ymm8[8,9,10,11,12],ymm9[13,14,15]
@@ -1953,11 +2001,11 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7]
; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm10 = xmm6[0,1],xmm4[2],xmm6[3]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13]
-; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512-NEXT: vpblendw {{.*#+}} ymm10 = ymm9[0,1,2,3,4],ymm10[5,6,7],ymm9[8,9,10,11,12],ymm10[13,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm7 = xmm4[0,1],xmm7[2],xmm4[3]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13]
+; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5,6,7],ymm9[8,9,10,11,12],ymm7[13,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15]
; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7]
@@ -1967,17 +2015,17 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6],ymm1[7]
; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15,24,25,18,19,28,29,22,23,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[3,1,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,2,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7]
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX512-NEXT: vmovdqa 128(%rdi), %xmm1
+; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,0,1,10,11,8,9,10,11,12,13,14,15]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[3,1,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX512-NEXT: vmovdqa %ymm5, (%rsi)
-; AVX512-NEXT: vmovdqa %ymm7, (%rdx)
+; AVX512-NEXT: vmovdqa %ymm6, (%rdx)
; AVX512-NEXT: vmovdqa %ymm8, (%rcx)
-; AVX512-NEXT: vmovdqa %ymm9, (%r8)
+; AVX512-NEXT: vmovdqa %ymm7, (%r8)
; AVX512-NEXT: vmovdqa %ymm0, (%r9)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -2034,9 +2082,9 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm10, %ymm7
; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5,6,7],ymm9[8,9,10,11,12],ymm7[13,14,15]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,1,6,3]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15]
+; AVX512-FCP-NEXT: vpermd %ymm10, %ymm9, %ymm9
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,3,0,0,5,0,2,7]
@@ -2060,14 +2108,13 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,0,2,5,7]
; AVX512-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm1
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,u,u,u,u,24,25,30,31,u,u,u,u]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,1,2,3,4,5,12,14]
-; AVX512-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX512-FCP-NEXT: vmovdqa %ymm5, (%rsi)
; AVX512-FCP-NEXT: vmovdqa %ymm6, (%rdx)
; AVX512-FCP-NEXT: vmovdqa %ymm7, (%rcx)
; AVX512-FCP-NEXT: vmovdqa %ymm8, (%r8)
-; AVX512-FCP-NEXT: vmovdqa %ymm2, (%r9)
+; AVX512-FCP-NEXT: vmovdqa %ymm0, (%r9)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
@@ -2086,27 +2133,27 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1,2,3],xmm5[4,5],xmm6[6,7]
; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[u,u,u,u,u,u]
; AVX512DQ-NEXT: vpor %ymm4, %ymm5, %ymm5
-; AVX512DQ-NEXT: vmovdqa 144(%rdi), %xmm6
-; AVX512DQ-NEXT: vmovdqa 128(%rdi), %xmm4
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm7 = xmm4[0],xmm6[1],xmm4[2,3]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,2,3,12,13,6,7]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0,1,2,3,4],ymm7[5,6,7],ymm5[8,9,10,11,12],ymm7[13,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5],ymm7[6,7]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[8,9,18,19,28,29,22,23,16,17,26,27],zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT: vmovdqa 128(%rdi), %xmm7
+; AVX512DQ-NEXT: vmovdqa 144(%rdi), %xmm4
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm4[1],xmm7[2,3]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u,u,2,3,12,13,6,7]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm8[5],ymm6[6,7]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[8,9,18,19,28,29,22,23,16,17,26,27],zero,zero,zero,zero,zero,zero
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15]
; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm9
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6],xmm9[7]
; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[2,3,12,13,6,7,0,1,10,11,4,5,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpor %ymm7, %ymm8, %ymm7
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm8 = xmm4[0,1],xmm6[2],xmm4[3]
+; AVX512DQ-NEXT: vpor %ymm6, %ymm8, %ymm6
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm8 = xmm7[0,1],xmm4[2],xmm7[3]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,4,5,14,15,8,9]
; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm7[0,1,2,3,4],ymm8[5,6,7],ymm7[8,9,10,11,12],ymm8[13,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm6[0,1,2,3,4],ymm8[5,6,7],ymm6[8,9,10,11,12],ymm8[13,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7]
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15]
; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm9
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3,4],xmm8[5,6,7]
@@ -2116,7 +2163,7 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6],ymm9[7]
; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm9 = xmm6[0],xmm4[1],xmm6[2,3]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm9 = xmm4[0],xmm7[1],xmm4[2,3]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,u,6,7,0,1,10,11]
; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm9 = ymm8[0,1,2,3,4],ymm9[5,6,7],ymm8[8,9,10,11,12],ymm9[13,14,15]
@@ -2130,11 +2177,11 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7]
; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm10 = xmm6[0,1],xmm4[2],xmm6[3]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm10 = ymm9[0,1,2,3,4],ymm10[5,6,7],ymm9[8,9,10,11,12],ymm10[13,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm7 = xmm4[0,1],xmm7[2],xmm4[3]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5,6,7],ymm9[8,9,10,11,12],ymm7[13,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15]
; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7]
@@ -2144,17 +2191,17 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6],ymm1[7]
; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15,24,25,18,19,28,29,22,23,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[3,1,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,2,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7]
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX512DQ-NEXT: vmovdqa 128(%rdi), %xmm1
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,0,1,10,11,8,9,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[3,1,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7]
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX512DQ-NEXT: vmovdqa %ymm5, (%rsi)
-; AVX512DQ-NEXT: vmovdqa %ymm7, (%rdx)
+; AVX512DQ-NEXT: vmovdqa %ymm6, (%rdx)
; AVX512DQ-NEXT: vmovdqa %ymm8, (%rcx)
-; AVX512DQ-NEXT: vmovdqa %ymm9, (%r8)
+; AVX512DQ-NEXT: vmovdqa %ymm7, (%r8)
; AVX512DQ-NEXT: vmovdqa %ymm0, (%r9)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
@@ -2211,9 +2258,9 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm10, %ymm7
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5,6,7],ymm9[8,9,10,11,12],ymm7[13,14,15]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,1,6,3]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15]
+; AVX512DQ-FCP-NEXT: vpermd %ymm10, %ymm9, %ymm9
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,3,0,0,5,0,2,7]
@@ -2237,14 +2284,13 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,0,2,5,7]
; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,u,u,u,u,24,25,30,31,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,1,2,3,4,5,12,14]
-; AVX512DQ-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, (%rsi)
; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, (%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, (%rcx)
; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, (%r8)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, (%r9)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%r9)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
@@ -2887,18 +2933,19 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-LABEL: load_i16_stride5_vf32:
; AVX: # %bb.0:
; AVX-NEXT: subq $424, %rsp # imm = 0x1A8
-; AVX-NEXT: vmovdqa 144(%rdi), %xmm9
-; AVX-NEXT: vmovdqa 128(%rdi), %xmm7
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm9[2,3],xmm7[4,5,6,7]
+; AVX-NEXT: vmovdqa 144(%rdi), %xmm11
+; AVX-NEXT: vmovdqa 128(%rdi), %xmm9
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0,1],xmm11[2,3],xmm9[4,5,6,7]
; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7]
; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovdqa 96(%rdi), %xmm11
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[0,1,1,3]
+; AVX-NEXT: vmovdqa 96(%rdi), %xmm7
+; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[0,1,1,3]
+; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
-; AVX-NEXT: vmovdqa 112(%rdi), %xmm10
-; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm10[1]
-; AVX-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovdqa 112(%rdi), %xmm8
+; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm8[1]
+; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovdqa 80(%rdi), %xmm3
; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
@@ -2907,40 +2954,38 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm0[5,6,7]
; AVX-NEXT: vmovdqa (%rdi), %xmm5
; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovdqa 16(%rdi), %xmm12
-; AVX-NEXT: vmovdqa 32(%rdi), %xmm3
-; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovdqa 48(%rdi), %xmm15
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[0,1,0,3]
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[3,1,2,3]
-; AVX-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovdqa 16(%rdi), %xmm13
+; AVX-NEXT: vmovdqa 32(%rdi), %xmm14
+; AVX-NEXT: vmovdqa 48(%rdi), %xmm10
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[0,1,0,3]
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm14[4],xmm0[5,6,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm13[3,1,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm0[4,5,6,7]
-; AVX-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535]
-; AVX-NEXT: vandps %ymm6, %ymm3, %ymm3
-; AVX-NEXT: vmovaps 64(%rdi), %xmm5
-; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm5[0,1,0,1]
-; AVX-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vandnps %ymm4, %ymm6, %ymm4
+; AVX-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535]
+; AVX-NEXT: vandps %ymm5, %ymm3, %ymm3
+; AVX-NEXT: vmovaps 64(%rdi), %xmm0
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm0[0,1,0,1]
+; AVX-NEXT: vandnps %ymm4, %ymm5, %ymm4
; AVX-NEXT: vorps %ymm4, %ymm3, %ymm3
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovdqa 304(%rdi), %xmm2
-; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovdqa 288(%rdi), %xmm13
-; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3],xmm13[4,5,6,7]
-; AVX-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovdqa 304(%rdi), %xmm12
+; AVX-NEXT: vmovdqa 288(%rdi), %xmm0
+; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm12[2,3],xmm0[4,5,6,7]
+; AVX-NEXT: vmovdqa %xmm12, (%rsp) # 16-byte Spill
; AVX-NEXT: vpshufb %xmm1, %xmm2, %xmm1
; AVX-NEXT: vmovdqa 256(%rdi), %xmm0
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
; AVX-NEXT: vmovdqa 272(%rdi), %xmm0
-; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; AVX-NEXT: vmovdqa 240(%rdi), %xmm0
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -2960,703 +3005,714 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovdqa 208(%rdi), %xmm0
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
-; AVX-NEXT: vmovdqa 192(%rdi), %xmm14
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm14[4],xmm0[5,6,7]
-; AVX-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovdqa 192(%rdi), %xmm15
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm15[4],xmm0[5,6,7]
+; AVX-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
-; AVX-NEXT: vandps %ymm6, %ymm0, %ymm0
+; AVX-NEXT: vandps %ymm5, %ymm0, %ymm0
; AVX-NEXT: vmovaps 224(%rdi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm1[0,1,0,1]
-; AVX-NEXT: vandnps %ymm8, %ymm6, %ymm8
-; AVX-NEXT: vorps %ymm0, %ymm8, %ymm0
+; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm1[0,1,0,1]
+; AVX-NEXT: vandnps %ymm6, %ymm5, %ymm6
+; AVX-NEXT: vorps %ymm6, %ymm0, %ymm0
; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovdqa %xmm11, %xmm6
-; AVX-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0,1],xmm11[2,3],xmm10[4,5,6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1],xmm7[2,3],xmm8[4,5,6,7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,0,4,5,6,7]
; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[0,3,2,3]
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[0,3,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3,4,5,6,7]
-; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1,2,3],xmm9[4,5],xmm7[6,7]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9]
-; AVX-NEXT: vpshufb %xmm8, %xmm3, %xmm3
+; AVX-NEXT: vmovdqa %xmm11, %xmm5
+; AVX-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0,1,2,3],xmm11[4,5],xmm9[6,7]
+; AVX-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9]
+; AVX-NEXT: vpshufb %xmm6, %xmm3, %xmm3
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7]
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,3,2,3]
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,3,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7]
-; AVX-NEXT: vpsrlq $48, %xmm12, %xmm9
+; AVX-NEXT: vmovdqa %xmm13, %xmm4
+; AVX-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpsrlq $48, %xmm13, %xmm9
; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1]
-; AVX-NEXT: vmovdqa %xmm15, %xmm12
-; AVX-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm2[0,1],xmm15[2,3],xmm2[4,5],xmm15[6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm14[0,1],xmm10[2,3],xmm14[4,5],xmm10[6,7]
+; AVX-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,0,1,0,1,0,1,10,11,4,5,14,15,6,7]
; AVX-NEXT: vpshufb %xmm1, %xmm9, %xmm9
; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm9[3,4,5,6,7]
-; AVX-NEXT: vmovaps {{.*#+}} ymm10 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535]
-; AVX-NEXT: vandps %ymm3, %ymm10, %ymm3
-; AVX-NEXT: vpsllq $48, %xmm5, %xmm9
-; AVX-NEXT: vandnps %ymm9, %ymm10, %ymm9
+; AVX-NEXT: vmovaps {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535]
+; AVX-NEXT: vandps %ymm3, %ymm11, %ymm3
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX-NEXT: vpsllq $48, %xmm8, %xmm9
+; AVX-NEXT: vandnps %ymm9, %ymm11, %ymm9
; AVX-NEXT: vorps %ymm3, %ymm9, %ymm3
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2,3],xmm5[4,5],xmm13[6,7]
-; AVX-NEXT: vpshufb %xmm8, %xmm0, %xmm0
-; AVX-NEXT: vmovdqa (%rsp), %xmm15 # 16-byte Reload
-; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm3 # 16-byte Folded Reload
-; AVX-NEXT: # xmm3 = xmm15[0,1],mem[2,3],xmm15[4,5,6,7]
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2,3],xmm12[4,5],xmm13[6,7]
+; AVX-NEXT: vpshufb %xmm6, %xmm0, %xmm0
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm3 # 16-byte Folded Reload
+; AVX-NEXT: # xmm3 = xmm12[0,1],mem[2,3],xmm12[4,5,6,7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,0,4,5,6,7]
; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5]
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[0,3,2,3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,2,2,3,4,5,6,7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3,4,5,6,7]
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm11[0,3,2,3]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,2,2,3,4,5,6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1],xmm3[2,3,4,5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5,6,7]
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm14[0,1],xmm13[2,3],xmm14[4,5],xmm13[6,7]
+; AVX-NEXT: vpblendw $204, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm3 # 16-byte Folded Reload
+; AVX-NEXT: # xmm3 = xmm15[0,1],mem[2,3],xmm15[4,5],mem[6,7]
; AVX-NEXT: vpshufb %xmm1, %xmm3, %xmm1
; AVX-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
; AVX-NEXT: # xmm3 = mem[0,3,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7]
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX-NEXT: vpsrlq $48, %xmm8, %xmm8
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1]
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX-NEXT: vpsrlq $48, %xmm6, %xmm6
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1]
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4,5,6,7]
-; AVX-NEXT: vmovaps {{.*#+}} ymm8 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535]
-; AVX-NEXT: vandps %ymm1, %ymm8, %ymm1
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; AVX-NEXT: vpsllq $48, %xmm14, %xmm3
-; AVX-NEXT: vandnps %ymm3, %ymm8, %ymm3
+; AVX-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535]
+; AVX-NEXT: vandps %ymm6, %ymm1, %ymm1
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; AVX-NEXT: vpsllq $48, %xmm15, %xmm3
+; AVX-NEXT: vandnps %ymm3, %ymm6, %ymm3
; AVX-NEXT: vorps %ymm3, %ymm1, %ymm1
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = mem[0,1,2,3],xmm6[4,5],mem[6,7]
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = mem[0,1,2,3],xmm0[4,5],mem[6,7]
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,u,8,9,2,3,12,13,12,13,12,13,12,13]
; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[3,1,2,3]
+; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[3,1,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3,4,5,6,7]
-; AVX-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm3 # 16-byte Folded Reload
-; AVX-NEXT: # xmm3 = mem[0,1],xmm7[2,3],mem[4,5,6,7]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
-; AVX-NEXT: vpshufb %xmm7, %xmm3, %xmm3
+; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm3 # 16-byte Folded Reload
+; AVX-NEXT: # xmm3 = xmm5[0,1],mem[2,3],xmm5[4,5,6,7]
+; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
+; AVX-NEXT: vpshufb %xmm5, %xmm3, %xmm3
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm12[2,3],xmm2[4,5,6,7]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm8 = [2,3,2,3,2,3,2,3,12,13,6,7,12,13,14,15]
-; AVX-NEXT: vpshufb %xmm8, %xmm3, %xmm3
-; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm4[0,1,1,3]
+; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm14[0,1],xmm10[2,3],xmm14[4,5,6,7]
+; AVX-NEXT: vmovdqa {{.*#+}} xmm7 = [2,3,2,3,2,3,2,3,12,13,6,7,12,13,14,15]
+; AVX-NEXT: vpshufb %xmm7, %xmm3, %xmm3
+; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[0,1,1,3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,7,6,7]
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; AVX-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm9[2],xmm11[2],xmm9[3],xmm11[3]
+; AVX-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm9[2],xmm4[2],xmm9[3],xmm4[3]
; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0,1,2],xmm3[3,4,5],xmm9[6,7]
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
-; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm12[0,1,2,0]
+; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[0,1,2,0]
; AVX-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,6,5]
; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm9[6,7]
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX-NEXT: vmovdqa %xmm5, %xmm9
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1],xmm6[2,3],xmm5[4,5,6,7]
-; AVX-NEXT: vpshufb %xmm7, %xmm0, %xmm4
+; AVX-NEXT: vmovdqa %xmm13, %xmm2
+; AVX-NEXT: vmovdqa (%rsp), %xmm9 # 16-byte Reload
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0,1],xmm13[2,3],xmm9[4,5,6,7]
+; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm6
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX-NEXT: vmovdqa %xmm15, %xmm0
-; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0,1,2,3],xmm5[4,5],xmm15[6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1,2,3],xmm5[4,5],xmm12[6,7]
; AVX-NEXT: vpshufb %xmm1, %xmm3, %xmm1
-; AVX-NEXT: vmovdqa %xmm10, %xmm2
-; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[3,1,2,3]
+; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[3,1,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3,4,5,6,7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm1[0,1,2,3,4],xmm4[5,6,7]
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1],xmm13[2,3],xmm15[4,5,6,7]
-; AVX-NEXT: vpshufb %xmm8, %xmm1, %xmm1
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,1,1,3]
-; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm1[0,1,2,3,4],xmm6[5,6,7]
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0,1],xmm13[2,3],xmm14[4,5,6,7]
+; AVX-NEXT: vpshufb %xmm7, %xmm1, %xmm1
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm8[2],xmm3[3],xmm8[3]
+; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[0,1,1,3]
+; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7]
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm7[2],xmm3[3],xmm7[3]
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4,5],xmm3[6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm14[0,1,2,0]
+; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[0,1,2,0]
; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5]
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm3[6,7]
-; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm1, %ymm1
+; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm0[2,3],xmm5[4,5],xmm0[6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm12[2,3],xmm5[4,5],xmm12[6,7]
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,0,1,10,11,4,5,14,15,14,15,14,15,14,15]
; AVX-NEXT: vpshufb %xmm1, %xmm3, %xmm0
-; AVX-NEXT: vpsrlq $48, %xmm2, %xmm3
+; AVX-NEXT: vpsrlq $48, %xmm11, %xmm3
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3,4,5,6,7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0,1,2,3],xmm6[4,5],xmm9[6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0,1,2,3],xmm2[4,5],xmm9[6,7]
; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13]
; AVX-NEXT: vpshufb %xmm5, %xmm3, %xmm3
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1],xmm4[2,3],xmm8[4,5,6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1],xmm8[2,3],xmm7[4,5,6,7]
; AVX-NEXT: vmovdqa {{.*#+}} xmm6 = [6,7,0,1,10,11,10,11,8,9,10,11,12,13,14,15]
; AVX-NEXT: vpshufb %xmm6, %xmm3, %xmm3
-; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm15[0,1,2,3],xmm13[4,5],xmm15[6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm14[0,1,2,3],xmm13[4,5],xmm14[6,7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[2,2,2,2,4,5,6,7]
; AVX-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,4,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm9[3,4,5],xmm3[6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm14[0,1,0,3]
+; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm15[0,1,0,3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,5,6]
; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm9[6,7]
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1,2,3],xmm9[4,5],xmm8[6,7]
-; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm0
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0,1,2,3],xmm12[4,5],xmm11[6,7]
+; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm7
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7]
; AVX-NEXT: vpshufb %xmm1, %xmm3, %xmm1
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX-NEXT: vpsrlq $48, %xmm2, %xmm3
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX-NEXT: vpsrlq $48, %xmm0, %xmm3
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3,4,5,6,7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm1[0,1,2,3,4],xmm0[5,6,7]
-; AVX-NEXT: vmovdqa %xmm11, %xmm0
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm11[0,1],xmm10[2,3],xmm11[4,5,6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm1[0,1,2,3,4],xmm7[5,6,7]
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1],xmm7[2,3],xmm4[4,5,6,7]
; AVX-NEXT: vpshufb %xmm6, %xmm1, %xmm1
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3],xmm11[4,5],xmm6[6,7]
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0,1,2,3],xmm6[4,5],xmm9[6,7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7]
; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,4,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3,4,5],xmm1[6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[0,1,0,3]
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[0,1,0,3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,6]
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm3[6,7]
-; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm7
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[3,1,2,3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[0,2,2,3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7]
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[0,3,2,3]
-; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
-; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1,2,3,4,5,6,7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1,2,3],xmm10[4,5],xmm0[6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[1,1,1,1]
-; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm11[0,2,2,3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,0,3,4,5,6,7]
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm8 = [8,9,2,3,12,13,12,13,8,9,12,13,12,13,14,15]
-; AVX-NEXT: vpshufb %xmm8, %xmm3, %xmm3
+; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm1, %ymm10
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,3,2,3]
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[3],xmm1[4,5,6,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3,4,5,6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm7[4,5],xmm4[6,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[1,1,1,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7]
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
+; AVX-NEXT: vmovdqa {{.*#+}} xmm9 = [8,9,2,3,12,13,12,13,8,9,12,13,12,13,14,15]
+; AVX-NEXT: vpshufb %xmm9, %xmm3, %xmm3
; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3,4,5],xmm3[6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm12[0,1,1,3]
+; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[0,1,1,3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm5[6,7]
-; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
-; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm5
+; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3,4,5,6,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm11[3,1,2,3]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,1,4,5,6,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm12[0,2,2,3]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7]
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm5[4,5,6,7]
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm6
+; AVX-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; AVX-NEXT: # xmm1 = mem[0,3,2,3]
+; AVX-NEXT: vpblendw $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
+; AVX-NEXT: # xmm1 = xmm1[0,1,2],mem[3],xmm1[4,5,6,7]
+; AVX-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
+; AVX-NEXT: # xmm3 = mem[2,3,2,3]
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3,4,5,6,7]
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
+; AVX-NEXT: # xmm3 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7]
+; AVX-NEXT: vpshufb %xmm9, %xmm3, %xmm3
+; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[1,1,1,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm13[0,2,2,3]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7]
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
+; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5],xmm3[6,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[0,1,1,3]
+; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7]
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm3
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
+; AVX-NEXT: vpshufd $231, (%rsp), %xmm3 # 16-byte Folded Reload
; AVX-NEXT: # xmm3 = mem[3,1,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,1,4,5,6,7]
-; AVX-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
-; AVX-NEXT: # xmm5 = mem[0,2,2,3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7]
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
-; AVX-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
-; AVX-NEXT: # xmm5 = mem[0,3,2,3]
-; AVX-NEXT: vpblendw $8, (%rsp), %xmm5, %xmm5 # 16-byte Folded Reload
-; AVX-NEXT: # xmm5 = xmm5[0,1,2],mem[3],xmm5[4,5,6,7]
-; AVX-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
-; AVX-NEXT: # xmm9 = mem[2,3,2,3]
-; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm9[0],xmm5[1,2,3,4,5,6,7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7]
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload
-; AVX-NEXT: # xmm5 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7]
-; AVX-NEXT: vpshufb %xmm8, %xmm5, %xmm5
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[1,1,1,1]
-; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[0,2,2,3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7]
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1]
-; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3,4,5],xmm5[6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[0,1,1,3]
-; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm4[6,7]
-; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm3, 32(%rsi)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm3, (%rsi)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm3, 32(%rdx)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm3, (%rdx)
+; AVX-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
+; AVX-NEXT: # xmm4 = mem[0,2,2,3]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7]
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm2, 32(%rsi)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm2, (%rsi)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm2, 32(%rdx)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm2, (%rdx)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm0, 32(%rcx)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm0, (%rcx)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm0, 32(%r8)
-; AVX-NEXT: vmovaps %ymm7, (%r8)
-; AVX-NEXT: vmovaps %ymm2, 32(%r9)
-; AVX-NEXT: vmovaps %ymm1, (%r9)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm2, (%rcx)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm2, 32(%r8)
+; AVX-NEXT: vmovaps %ymm10, (%r8)
+; AVX-NEXT: vmovaps %ymm1, 32(%r9)
+; AVX-NEXT: vmovaps %ymm6, (%r9)
; AVX-NEXT: addq $424, %rsp # imm = 0x1A8
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX2-LABEL: load_i16_stride5_vf32:
; AVX2: # %bb.0:
-; AVX2-NEXT: subq $264, %rsp # imm = 0x108
+; AVX2-NEXT: subq $232, %rsp
; AVX2-NEXT: vmovdqa (%rdi), %ymm1
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm2
-; AVX2-NEXT: vmovdqa 64(%rdi), %ymm3
-; AVX2-NEXT: vmovdqa 96(%rdi), %ymm15
-; AVX2-NEXT: vmovdqa 192(%rdi), %ymm4
-; AVX2-NEXT: vmovdqa 160(%rdi), %ymm5
-; AVX2-NEXT: vmovdqa 224(%rdi), %ymm7
+; AVX2-NEXT: vmovdqa 64(%rdi), %ymm15
+; AVX2-NEXT: vmovdqa 96(%rdi), %ymm7
+; AVX2-NEXT: vmovdqa 192(%rdi), %ymm3
+; AVX2-NEXT: vmovdqa 160(%rdi), %ymm4
+; AVX2-NEXT: vmovdqa 224(%rdi), %ymm5
; AVX2-NEXT: vmovdqa 256(%rdi), %ymm6
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm6[1,2],ymm7[3],ymm6[4],ymm7[5],ymm6[6,7],ymm7[8],ymm6[9,10],ymm7[11],ymm6[12],ymm7[13],ymm6[14,15]
-; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm0[2,3,0,1]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5],ymm0[6],ymm8[7]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23]
-; AVX2-NEXT: vpshufb %ymm10, %ymm0, %ymm0
-; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm6[1,2],ymm5[3],ymm6[4],ymm5[5],ymm6[6,7],ymm5[8],ymm6[9,10],ymm5[11],ymm6[12],ymm5[13],ymm6[14,15]
+; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm9[5],ymm0[6],ymm9[7]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23]
+; AVX2-NEXT: vpshufb %ymm11, %ymm0, %ymm0
+; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13],ymm3[14],ymm4[15]
+; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm10
+; AVX2-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm10[1,2,3],xmm9[4,5],xmm10[6,7]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7]
+; AVX2-NEXT: vpshufb %xmm12, %xmm9, %xmm9
+; AVX2-NEXT: vpmovsxbw {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,65535,65535,0]
+; AVX2-NEXT: vpblendvb %ymm10, %ymm9, %ymm0, %ymm9
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0],ymm7[1,2],ymm15[3],ymm7[4],ymm15[5],ymm7[6,7],ymm15[8],ymm7[9,10],ymm15[11],ymm7[12],ymm15[13],ymm7[14,15]
+; AVX2-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vpermq {{.*#+}} ymm13 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5],ymm0[6],ymm13[7]
+; AVX2-NEXT: vpshufb %ymm11, %ymm0, %ymm0
+; AVX2-NEXT: vpblendw {{.*#+}} ymm11 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15]
+; AVX2-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill
+; AVX2-NEXT: vextracti128 $1, %ymm11, %xmm13
+; AVX2-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm13[1,2,3],xmm11[4,5],xmm13[6,7]
+; AVX2-NEXT: vpshufb %xmm12, %xmm11, %xmm11
+; AVX2-NEXT: vpblendvb %ymm10, %ymm11, %ymm0, %ymm11
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15]
+; AVX2-NEXT: vmovdqa %ymm5, %ymm8
; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm9
-; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2,3],xmm8[4,5],xmm9[6,7]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7]
-; AVX2-NEXT: vpshufb %xmm11, %xmm8, %xmm8
-; AVX2-NEXT: vpmovsxbw {{.*#+}} xmm9 = [65535,65535,65535,65535,65535,65535,65535,0]
-; AVX2-NEXT: vpblendvb %ymm9, %ymm8, %ymm0, %ymm8
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm15[1,2],ymm3[3],ymm15[4],ymm3[5],ymm15[6,7],ymm3[8],ymm15[9,10],ymm3[11],ymm15[12],ymm3[13],ymm15[14,15]
; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm0[2,3,0,1]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm12[5],ymm0[6],ymm12[7]
-; AVX2-NEXT: vpshufb %ymm10, %ymm0, %ymm0
-; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15]
-; AVX2-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill
-; AVX2-NEXT: vextracti128 $1, %ymm10, %xmm12
-; AVX2-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm12[1,2,3],xmm10[4,5],xmm12[6,7]
-; AVX2-NEXT: vpshufb %xmm11, %xmm10, %xmm10
-; AVX2-NEXT: vpblendvb %ymm9, %ymm10, %ymm0, %ymm12
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5],ymm7[6],ymm6[7,8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13],ymm7[14],ymm6[15]
-; AVX2-NEXT: vpermq {{.*#+}} ymm10 = ymm0[2,3,0,1]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5],ymm0[6,7]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25]
-; AVX2-NEXT: vpshufb %ymm10, %ymm0, %ymm0
-; AVX2-NEXT: vpblendw {{.*#+}} ymm11 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15]
-; AVX2-NEXT: vextracti128 $1, %ymm11, %xmm13
-; AVX2-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm13[2,3],xmm11[4,5,6],xmm13[7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm12[5],ymm0[6,7]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25]
+; AVX2-NEXT: vpshufb %ymm12, %ymm0, %ymm0
+; AVX2-NEXT: vpblendw {{.*#+}} ymm13 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15]
+; AVX2-NEXT: vextracti128 $1, %ymm13, %xmm14
+; AVX2-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm14[2,3],xmm13[4,5,6],xmm14[7]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm14 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11]
-; AVX2-NEXT: vpshufb %xmm14, %xmm11, %xmm11
-; AVX2-NEXT: vpblendvb %ymm9, %ymm11, %ymm0, %ymm0
-; AVX2-NEXT: vpblendw {{.*#+}} ymm11 = ymm15[0],ymm3[1],ymm15[2,3],ymm3[4],ymm15[5],ymm3[6],ymm15[7,8],ymm3[9],ymm15[10,11],ymm3[12],ymm15[13],ymm3[14],ymm15[15]
-; AVX2-NEXT: vmovdqa %ymm15, %ymm5
-; AVX2-NEXT: vpermq {{.*#+}} ymm13 = ymm11[2,3,0,1]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm13[5],ymm11[6,7]
-; AVX2-NEXT: vpshufb %ymm10, %ymm11, %ymm10
-; AVX2-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
-; AVX2-NEXT: vmovdqa %ymm2, %ymm15
-; AVX2-NEXT: vextracti128 $1, %ymm11, %xmm13
-; AVX2-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm13[2,3],xmm11[4,5,6],xmm13[7]
-; AVX2-NEXT: vmovdqa 304(%rdi), %xmm13
-; AVX2-NEXT: vpshufb %xmm14, %xmm11, %xmm11
-; AVX2-NEXT: vmovdqa 288(%rdi), %xmm14
-; AVX2-NEXT: vpblendvb %ymm9, %ymm11, %ymm10, %ymm9
-; AVX2-NEXT: vpblendd {{.*#+}} xmm10 = xmm14[0],xmm13[1],xmm14[2,3]
+; AVX2-NEXT: vpshufb %xmm14, %xmm13, %xmm13
+; AVX2-NEXT: vpblendvb %ymm10, %ymm13, %ymm0, %ymm13
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm15[1],ymm7[2,3],ymm15[4],ymm7[5],ymm15[6],ymm7[7,8],ymm15[9],ymm7[10,11],ymm15[12],ymm7[13],ymm15[14],ymm7[15]
+; AVX2-NEXT: vpermq {{.*#+}} ymm15 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5],ymm0[6,7]
+; AVX2-NEXT: vpshufb %ymm12, %ymm0, %ymm15
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
+; AVX2-NEXT: vmovdqa %ymm1, %ymm5
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm12
+; AVX2-NEXT: vpblendw {{.*#+}} xmm12 = xmm0[0,1],xmm12[2,3],xmm0[4,5,6],xmm12[7]
+; AVX2-NEXT: vmovdqa 288(%rdi), %xmm0
+; AVX2-NEXT: vpshufb %xmm14, %xmm12, %xmm14
+; AVX2-NEXT: vmovdqa 304(%rdi), %xmm12
+; AVX2-NEXT: vpblendvb %ymm10, %ymm14, %ymm15, %ymm14
+; AVX2-NEXT: vpblendd {{.*#+}} xmm10 = xmm0[0],xmm12[1],xmm0[2,3]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7]
; AVX2-NEXT: vpshufb %xmm1, %xmm10, %xmm10
; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm8[0,1,2,3,4],ymm10[5,6,7],ymm8[8,9,10,11,12],ymm10[13,14,15]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm10[4,5,6,7]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm9[0,1,2,3,4],ymm10[5,6,7],ymm9[8,9,10,11,12],ymm10[13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm10[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovdqa 144(%rdi), %xmm11
-; AVX2-NEXT: vmovdqa 128(%rdi), %xmm10
-; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0],xmm11[1],xmm10[2,3]
-; AVX2-NEXT: vpshufb %xmm1, %xmm8, %xmm1
+; AVX2-NEXT: vmovdqa 128(%rdi), %xmm15
+; AVX2-NEXT: vmovdqa 144(%rdi), %xmm10
+; AVX2-NEXT: vpblendd {{.*#+}} xmm9 = xmm15[0],xmm10[1],xmm15[2,3]
+; AVX2-NEXT: vpshufb %xmm1, %xmm9, %xmm1
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0,1,2,3,4],ymm1[5,6,7],ymm12[8,9,10,11,12],ymm1[13,14,15]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0,1,2,3,4],ymm1[5,6,7],ymm11[8,9,10,11,12],ymm1[13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm13[2],xmm14[3]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9]
-; AVX2-NEXT: vpshufb %xmm8, %xmm1, %xmm1
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm11[2],xmm10[3]
-; AVX2-NEXT: vpshufb %xmm8, %xmm0, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2,3,4],ymm0[5,6,7],ymm9[8,9,10,11,12],ymm0[13,14,15]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13],ymm6[14],ymm7[15]
-; AVX2-NEXT: vmovdqa %ymm6, %ymm9
-; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7]
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1],ymm2[2],ymm4[3],ymm2[4],ymm4[5,6],ymm2[7],ymm4[8,9],ymm2[10],ymm4[11],ymm2[12],ymm4[13,14],ymm2[15]
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm8
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[3,4],xmm1[5,6,7]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
-; AVX2-NEXT: vpshufb %ymm8, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm13[0],xmm14[1],xmm13[2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
-; AVX2-NEXT: vpshufb %xmm12, %xmm1, %xmm1
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5],ymm5[6],ymm3[7,8],ymm5[9],ymm3[10,11],ymm5[12],ymm3[13],ymm5[14],ymm3[15]
-; AVX2-NEXT: vmovdqa %ymm5, %ymm6
-; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7]
-; AVX2-NEXT: vpshufb %ymm8, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload
-; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0,1],ymm5[2],ymm15[3],ymm5[4],ymm15[5,6],ymm5[7],ymm15[8,9],ymm5[10],ymm15[11],ymm5[12],ymm15[13,14],ymm5[15]
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm8
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[3,4],xmm1[5,6,7]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0],xmm10[1],xmm11[2,3]
-; AVX2-NEXT: vpshufb %xmm12, %xmm1, %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0,1],xmm12[2],xmm0[3]
+; AVX2-NEXT: vmovdqa %xmm0, %xmm2
+; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9]
+; AVX2-NEXT: vpshufb %xmm9, %xmm1, %xmm1
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm7[1],ymm9[2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10],ymm7[11],ymm9[12,13],ymm7[14],ymm9[15]
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4],ymm2[5],ymm4[6,7],ymm2[8],ymm4[9,10],ymm2[11],ymm4[12],ymm2[13],ymm4[14,15]
-; AVX2-NEXT: vmovdqa %ymm4, %ymm7
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm8
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm8[0],xmm1[1],xmm8[2],xmm1[3]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
-; AVX2-NEXT: vpshufb %ymm12, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = [6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
-; AVX2-NEXT: vpshufb %xmm8, %xmm1, %xmm1
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm13[0,1],xmm14[2],xmm13[3]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13]
-; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm1
+; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0,1,2,3,4],ymm1[5,6,7],ymm13[8,9,10,11,12],ymm1[13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm15[0,1],xmm10[2],xmm15[3]
+; AVX2-NEXT: vpshufb %xmm9, %xmm1, %xmm1
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm3[1],ymm6[2],ymm3[3],ymm6[4,5],ymm3[6],ymm6[7,8],ymm3[9],ymm6[10],ymm3[11],ymm6[12,13],ymm3[14],ymm6[15]
-; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm0[2,3,0,1]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4],ymm0[5],ymm9[6],ymm0[7]
-; AVX2-NEXT: vpshufb %ymm12, %ymm0, %ymm0
-; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm5[0],ymm15[1,2],ymm5[3],ymm15[4],ymm5[5],ymm15[6,7],ymm5[8],ymm15[9,10],ymm5[11],ymm15[12],ymm5[13],ymm15[14,15]
-; AVX2-NEXT: vmovdqa %ymm5, %ymm1
-; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm12
-; AVX2-NEXT: vpblendd {{.*#+}} xmm9 = xmm12[0],xmm9[1],xmm12[2],xmm9[3]
-; AVX2-NEXT: vpshufb %xmm8, %xmm9, %xmm8
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm11[0,1],xmm10[2],xmm11[3]
-; AVX2-NEXT: vpshufb %xmm4, %xmm8, %xmm2
+; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0,1,2,3,4],ymm1[5,6,7],ymm14[8,9,10,11,12],ymm1[13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm6[1],ymm8[2,3],ymm6[4],ymm8[5],ymm6[6],ymm8[7,8],ymm6[9],ymm8[10,11],ymm6[12],ymm8[13],ymm6[14],ymm8[15]
+; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm1[2,3,0,1]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm9[6],ymm1[7]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm3[0,1],ymm4[2],ymm3[3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8,9],ymm4[10],ymm3[11],ymm4[12],ymm3[13,14],ymm4[15]
+; AVX2-NEXT: vmovdqa %ymm4, %ymm0
+; AVX2-NEXT: vmovdqa %ymm3, %ymm7
+; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm14
+; AVX2-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm14[3,4],xmm9[5,6,7]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
+; AVX2-NEXT: vpshufb %ymm14, %ymm1, %ymm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
+; AVX2-NEXT: vpshufb %xmm3, %xmm9, %xmm9
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2],ymm1[3,4,5,6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm9 = xmm12[0],xmm2[1],xmm12[2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
+; AVX2-NEXT: vpshufb %xmm2, %xmm9, %xmm9
+; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0,1,2,3,4],ymm9[5,6,7],ymm1[8,9,10,11,12],ymm9[13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7]
+; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm8[0],ymm13[1],ymm8[2,3],ymm13[4],ymm8[5],ymm13[6],ymm8[7,8],ymm13[9],ymm8[10,11],ymm13[12],ymm8[13],ymm13[14],ymm8[15]
+; AVX2-NEXT: vpermq {{.*#+}} ymm11 = ymm9[2,3,0,1]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm11[6],ymm9[7]
+; AVX2-NEXT: vpshufb %ymm14, %ymm9, %ymm9
+; AVX2-NEXT: vmovdqa %ymm5, %ymm4
+; AVX2-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload
+; AVX2-NEXT: vpblendw {{.*#+}} ymm11 = ymm1[0,1],ymm5[2],ymm1[3],ymm5[4],ymm1[5,6],ymm5[7],ymm1[8,9],ymm5[10],ymm1[11],ymm5[12],ymm1[13,14],ymm5[15]
+; AVX2-NEXT: vextracti128 $1, %ymm11, %xmm14
+; AVX2-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm14[3,4],xmm11[5,6,7]
+; AVX2-NEXT: vpshufb %xmm3, %xmm11, %xmm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm9[3,4,5,6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm15[1],xmm10[2,3]
+; AVX2-NEXT: vpshufb %xmm2, %xmm9, %xmm2
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2,3,4],ymm2[5,6,7],ymm0[8,9,10,11,12],ymm2[13,14,15]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm2 = ymm2[0,1],mem[2],ymm2[3],mem[4],ymm2[5,6],mem[7],ymm2[8,9],mem[10],ymm2[11],mem[12],ymm2[13,14],mem[15]
-; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4],ymm2[5,6],ymm6[7]
-; AVX2-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm4 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm4 = ymm7[0],mem[1],ymm7[2,3],mem[4],ymm7[5],mem[6],ymm7[7,8],mem[9],ymm7[10,11],mem[12],ymm7[13],mem[14],ymm7[15]
-; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
-; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7]
-; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15]
-; AVX2-NEXT: # ymm5 = mem[0,1,0,1]
-; AVX2-NEXT: vpshufb %ymm5, %ymm2, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
-; AVX2-NEXT: vpshufb %xmm6, %xmm4, %xmm4
-; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[3,1,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,2,1,4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm14[0,2,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,0,3,4,5,6,7]
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
-; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
-; AVX2-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm3 = mem[0,1],ymm3[2],mem[3],ymm3[4],mem[5,6],ymm3[7],mem[8,9],ymm3[10],mem[11],ymm3[12],mem[13,14],ymm3[15]
-; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6],ymm4[7]
-; AVX2-NEXT: vpshufb %ymm5, %ymm3, %ymm3
-; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0],ymm1[1],ymm15[2,3],ymm1[4],ymm15[5],ymm1[6],ymm15[7,8],ymm1[9],ymm15[10,11],ymm1[12],ymm15[13],ymm1[14],ymm15[15]
-; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
-; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7]
-; AVX2-NEXT: vpshufb %xmm6, %xmm4, %xmm4
+; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7],ymm3[8,9,10,11,12],ymm2[13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10],ymm5[11],ymm6[12,13],ymm5[14],ymm6[15]
+; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm7[1,2],ymm0[3],ymm7[4],ymm0[5],ymm7[6,7],ymm0[8],ymm7[9,10],ymm0[11],ymm7[12],ymm0[13],ymm7[14,15]
+; AVX2-NEXT: vmovdqa %ymm7, %ymm9
+; AVX2-NEXT: vmovdqa %ymm0, %ymm7
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm11
+; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm11[0],xmm3[1],xmm11[2],xmm3[3]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
+; AVX2-NEXT: vpshufb %ymm11, %ymm2, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm14 = [6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
+; AVX2-NEXT: vpshufb %xmm14, %xmm3, %xmm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-NEXT: vpblendd $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm0 # 16-byte Folded Reload
+; AVX2-NEXT: # xmm0 = xmm12[0,1],mem[2],xmm12[3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13]
+; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7],ymm2[8,9,10,11,12],ymm0[13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm13[0],ymm8[1],ymm13[2],ymm8[3],ymm13[4,5],ymm8[6],ymm13[7,8],ymm8[9],ymm13[10],ymm8[11],ymm13[12,13],ymm8[14],ymm13[15]
+; AVX2-NEXT: vpermq {{.*#+}} ymm13 = ymm2[2,3,0,1]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm13[4],ymm2[5],ymm13[6],ymm2[7]
+; AVX2-NEXT: vpshufb %ymm11, %ymm2, %ymm2
+; AVX2-NEXT: vpblendw {{.*#+}} ymm11 = ymm4[0],ymm1[1,2],ymm4[3],ymm1[4],ymm4[5],ymm1[6,7],ymm4[8],ymm1[9,10],ymm4[11],ymm1[12],ymm4[13],ymm1[14,15]
+; AVX2-NEXT: vmovdqa %ymm4, %ymm0
+; AVX2-NEXT: vextracti128 $1, %ymm11, %xmm13
+; AVX2-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0],xmm11[1],xmm13[2],xmm11[3]
+; AVX2-NEXT: vpshufb %xmm14, %xmm11, %xmm11
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm11 = xmm10[0,1],xmm15[2],xmm10[3]
+; AVX2-NEXT: vpshufb %xmm3, %xmm11, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2,3,4],ymm3[5,6,7],ymm2[8,9,10,11,12],ymm3[13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm15 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0,1],ymm5[2],ymm6[3],ymm5[4],ymm6[5,6],ymm5[7],ymm6[8,9],ymm5[10],ymm6[11],ymm5[12],ymm6[13,14],ymm5[15]
+; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6],ymm3[7]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0],ymm7[1],ymm9[2,3],ymm7[4],ymm9[5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10,11],ymm7[12],ymm9[13],ymm7[14],ymm9[15]
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15]
+; AVX2-NEXT: # ymm4 = mem[0,1,0,1]
+; AVX2-NEXT: vpshufb %ymm4, %ymm2, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
+; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-NEXT: vmovdqa 288(%rdi), %xmm3
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,10,11,8,9,10,11,12,13,14,15]
+; AVX2-NEXT: vpshufb %xmm6, %xmm3, %xmm3
+; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm12[3,1,2,3]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,2,1,4,5,6,7]
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1]
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
+; AVX2-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm3 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm3 = mem[0,1],ymm8[2],mem[3],ymm8[4],mem[5,6],ymm8[7],mem[8,9],ymm8[10],mem[11],ymm8[12],mem[13,14],ymm8[15]
+; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4],ymm3[5,6],ymm7[7]
+; AVX2-NEXT: vpshufb %ymm4, %ymm3, %ymm3
+; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm7
+; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3,4],xmm7[5,6,7]
+; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4
; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[3,1,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,2,1,4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm10[0,2,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7]
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; AVX2-NEXT: vmovdqa 128(%rdi), %xmm4
+; AVX2-NEXT: vpshufb %xmm6, %xmm4, %xmm4
+; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm10[3,1,2,3]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,1,4,5,6,7]
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm4, 32(%rsi)
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm1, (%rsi)
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm1, 32(%rdx)
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm1, (%rdx)
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm1, 32(%rcx)
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm1, (%rcx)
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm1, 32(%r8)
-; AVX2-NEXT: vmovdqa %ymm0, (%r8)
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, 32(%rsi)
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, (%rsi)
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, 32(%rdx)
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, (%rdx)
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, 32(%rcx)
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, (%rcx)
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, 32(%r8)
+; AVX2-NEXT: vmovdqa %ymm15, (%r8)
; AVX2-NEXT: vmovdqa %ymm2, 32(%r9)
; AVX2-NEXT: vmovdqa %ymm3, (%r9)
-; AVX2-NEXT: addq $264, %rsp # imm = 0x108
+; AVX2-NEXT: addq $232, %rsp
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX2-FP-LABEL: load_i16_stride5_vf32:
; AVX2-FP: # %bb.0:
-; AVX2-FP-NEXT: subq $264, %rsp # imm = 0x108
-; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm13
+; AVX2-FP-NEXT: subq $280, %rsp # imm = 0x118
+; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm5
; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm6
; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm7
-; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm10
-; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm14
-; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm3
-; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm4
-; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm5
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15]
-; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6],ymm1[7]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23]
-; AVX2-FP-NEXT: vpshufb %ymm1, %ymm0, %ymm0
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0],ymm14[1],ymm3[2,3],ymm14[4],ymm3[5],ymm14[6],ymm3[7,8],ymm14[9],ymm3[10,11],ymm14[12],ymm3[13],ymm14[14],ymm3[15]
+; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm9
+; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm1
+; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm2
+; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm3
+; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm4
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm4[1,2],ymm3[3],ymm4[4],ymm3[5],ymm4[6,7],ymm3[8],ymm4[9,10],ymm3[11],ymm4[12],ymm3[13],ymm4[14,15]
; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm9
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2,3],xmm8[4,5],xmm9[6,7]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7]
-; AVX2-FP-NEXT: vpshufb %xmm9, %xmm8, %xmm8
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm0[2,3,0,1]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5],ymm0[6],ymm8[7]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23]
+; AVX2-FP-NEXT: vpshufb %ymm8, %ymm0, %ymm0
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm10 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13],ymm1[14],ymm2[15]
+; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vextracti128 $1, %ymm10, %xmm11
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1,2,3],xmm10[4,5],xmm11[6,7]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7]
+; AVX2-FP-NEXT: vpshufb %xmm12, %xmm10, %xmm10
; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} xmm11 = [65535,65535,65535,65535,65535,65535,65535,0]
-; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm8, %ymm0, %ymm0
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm7[0],ymm10[1,2],ymm7[3],ymm10[4],ymm7[5],ymm10[6,7],ymm7[8],ymm10[9,10],ymm7[11],ymm10[12],ymm7[13],ymm10[14,15]
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm8[2,3,0,1]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm12[5],ymm8[6],ymm12[7]
-; AVX2-FP-NEXT: vpshufb %ymm1, %ymm8, %ymm1
+; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm10, %ymm0, %ymm10
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm9[1,2],ymm7[3],ymm9[4],ymm7[5],ymm9[6,7],ymm7[8],ymm9[9,10],ymm7[11],ymm9[12],ymm7[13],ymm9[14,15]
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm0[2,3,0,1]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5],ymm0[6],ymm13[7]
+; AVX2-FP-NEXT: vpshufb %ymm8, %ymm0, %ymm0
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5],ymm6[6],ymm5[7,8],ymm6[9],ymm5[10,11],ymm6[12],ymm5[13],ymm6[14],ymm5[15]
; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm13[0],ymm6[1],ymm13[2,3],ymm6[4],ymm13[5],ymm6[6],ymm13[7,8],ymm6[9],ymm13[10,11],ymm6[12],ymm13[13],ymm6[14],ymm13[15]
-; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm12
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm12[1,2,3],xmm8[4,5],xmm12[6,7]
-; AVX2-FP-NEXT: vpshufb %xmm9, %xmm8, %xmm8
-; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm8, %ymm1, %ymm12
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15]
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm1[2,3,0,1]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5],ymm1[6,7]
+; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm13
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm13[1,2,3],xmm8[4,5],xmm13[6,7]
+; AVX2-FP-NEXT: vpshufb %xmm12, %xmm8, %xmm8
+; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm8, %ymm0, %ymm12
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13],ymm3[14],ymm4[15]
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm0[2,3,0,1]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5],ymm0[6,7]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25]
-; AVX2-FP-NEXT: vpshufb %ymm8, %ymm1, %ymm1
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm14[0],ymm3[1],ymm14[2],ymm3[3],ymm14[4,5],ymm3[6],ymm14[7,8],ymm3[9],ymm14[10],ymm3[11],ymm14[12,13],ymm3[14],ymm14[15]
-; AVX2-FP-NEXT: vmovdqa %ymm14, %ymm5
-; AVX2-FP-NEXT: vextracti128 $1, %ymm9, %xmm14
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm14[2,3],xmm9[4,5,6],xmm14[7]
+; AVX2-FP-NEXT: vpshufb %ymm8, %ymm0, %ymm0
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm13 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm13, %xmm14
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm14[2,3],xmm13[4,5,6],xmm14[7]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm14 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11]
-; AVX2-FP-NEXT: vpshufb %xmm14, %xmm9, %xmm9
-; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm9, %ymm1, %ymm1
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm7[1],ymm10[2,3],ymm7[4],ymm10[5],ymm7[6],ymm10[7,8],ymm7[9],ymm10[10,11],ymm7[12],ymm10[13],ymm7[14],ymm10[15]
-; AVX2-FP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm15 = ymm9[2,3,0,1]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm15[5],ymm9[6,7]
-; AVX2-FP-NEXT: vpshufb %ymm8, %ymm9, %ymm15
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm6[0],ymm13[1],ymm6[2],ymm13[3],ymm6[4,5],ymm13[6],ymm6[7,8],ymm13[9],ymm6[10],ymm13[11],ymm6[12,13],ymm13[14],ymm6[15]
-; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm9
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm9 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6],xmm9[7]
-; AVX2-FP-NEXT: vmovdqa 304(%rdi), %xmm8
-; AVX2-FP-NEXT: vpshufb %xmm14, %xmm9, %xmm14
-; AVX2-FP-NEXT: vmovdqa 288(%rdi), %xmm9
+; AVX2-FP-NEXT: vpshufb %xmm14, %xmm13, %xmm13
+; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm13, %ymm0, %ymm13
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm7[1],ymm9[2,3],ymm7[4],ymm9[5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10,11],ymm7[12],ymm9[13],ymm7[14],ymm9[15]
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm15 = ymm0[2,3,0,1]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5],ymm0[6,7]
+; AVX2-FP-NEXT: vpshufb %ymm8, %ymm0, %ymm15
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10],ymm5[11],ymm6[12,13],ymm5[14],ymm6[15]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm8
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0,1],xmm8[2,3],xmm0[4,5,6],xmm8[7]
+; AVX2-FP-NEXT: vmovdqa 288(%rdi), %xmm0
+; AVX2-FP-NEXT: vpshufb %xmm14, %xmm8, %xmm14
+; AVX2-FP-NEXT: vmovdqa 304(%rdi), %xmm3
; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm14, %ymm15, %ymm11
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm14 = xmm9[0],xmm8[1],xmm9[2,3]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7]
-; AVX2-FP-NEXT: vpshufb %xmm2, %xmm14, %xmm14
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm14 = xmm0[0],xmm3[1],xmm0[2,3]
+; AVX2-FP-NEXT: vmovdqa %xmm0, %xmm8
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7]
+; AVX2-FP-NEXT: vpshufb %xmm1, %xmm14, %xmm14
; AVX2-FP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0,1,2,3,4],ymm14[5,6,7],ymm0[8,9,10,11,12],ymm14[13,14,15]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm14 = ymm10[0,1,2,3,4],ymm14[5,6,7],ymm10[8,9,10,11,12],ymm14[13,14,15]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm14[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vmovdqa 144(%rdi), %xmm6
; AVX2-FP-NEXT: vmovdqa 128(%rdi), %xmm15
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0],xmm6[1],xmm15[2,3]
-; AVX2-FP-NEXT: vpshufb %xmm2, %xmm14, %xmm2
-; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0,1,2,3,4],ymm2[5,6,7],ymm12[8,9,10,11,12],ymm2[13,14,15]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-FP-NEXT: vmovdqa 144(%rdi), %xmm14
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm10 = xmm15[0],xmm14[1],xmm15[2,3]
+; AVX2-FP-NEXT: vpshufb %xmm1, %xmm10, %xmm1
+; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0,1,2,3,4],ymm1[5,6,7],ymm12[8,9,10,11,12],ymm1[13,14,15]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm8[2],xmm9[3]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9]
-; AVX2-FP-NEXT: vpshufb %xmm12, %xmm2, %xmm2
-; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm8[0,1],xmm3[2],xmm8[3]
+; AVX2-FP-NEXT: vmovdqa %xmm3, %xmm6
+; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9]
+; AVX2-FP-NEXT: vpshufb %xmm10, %xmm1, %xmm1
+; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0,1,2,3,4],ymm1[5,6,7],ymm13[8,9,10,11,12],ymm1[13,14,15]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm15[0,1],xmm6[2],xmm15[3]
-; AVX2-FP-NEXT: vpshufb %xmm12, %xmm1, %xmm1
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm15[0,1],xmm14[2],xmm15[3]
+; AVX2-FP-NEXT: vpshufb %xmm10, %xmm1, %xmm1
; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0,1,2,3,4],ymm1[5,6,7],ymm11[8,9,10,11,12],ymm1[13,14,15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13],ymm3[14],ymm4[15]
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7]
-; AVX2-FP-NEXT: vmovdqu %ymm5, (%rsp) # 32-byte Spill
-; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm0[2],ymm5[3],ymm0[4],ymm5[5,6],ymm0[7],ymm5[8,9],ymm0[10],ymm5[11],ymm0[12],ymm5[13,14],ymm0[15]
-; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm11
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm11[3,4],xmm2[5,6,7]
+; AVX2-FP-NEXT: vmovdqa %ymm4, %ymm0
+; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13],ymm4[14],ymm3[15]
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm1[2,3,0,1]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm10[6],ymm1[7]
+; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm10 = ymm4[0,1],ymm2[2],ymm4[3],ymm2[4],ymm4[5,6],ymm2[7],ymm4[8,9],ymm2[10],ymm4[11],ymm2[12],ymm4[13,14],ymm2[15]
+; AVX2-FP-NEXT: vmovdqa %ymm2, %ymm5
+; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vextracti128 $1, %ymm10, %xmm11
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm11[3,4],xmm10[5,6,7]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
; AVX2-FP-NEXT: vpshufb %ymm11, %ymm1, %ymm1
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm12 = [4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
-; AVX2-FP-NEXT: vpshufb %xmm12, %xmm2, %xmm2
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm8[0],xmm9[1],xmm8[2,3]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
-; AVX2-FP-NEXT: vpshufb %xmm14, %xmm2, %xmm2
-; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm10[1],ymm7[2,3],ymm10[4],ymm7[5],ymm10[6],ymm7[7,8],ymm10[9],ymm7[10,11],ymm10[12],ymm7[13],ymm10[14],ymm7[15]
-; AVX2-FP-NEXT: vmovdqa %ymm7, %ymm10
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7]
-; AVX2-FP-NEXT: vpshufb %ymm11, %ymm1, %ymm1
-; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1],ymm13[2],ymm7[3],ymm13[4],ymm7[5,6],ymm13[7],ymm7[8,9],ymm13[10],ymm7[11],ymm13[12],ymm7[13,14],ymm13[15]
-; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm11
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm11[3,4],xmm2[5,6,7]
-; AVX2-FP-NEXT: vpshufb %xmm12, %xmm2, %xmm2
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0],xmm15[1],xmm6[2,3]
-; AVX2-FP-NEXT: vpshufb %xmm14, %xmm2, %xmm2
-; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
+; AVX2-FP-NEXT: vpshufb %xmm13, %xmm10, %xmm10
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2],ymm1[3,4,5,6,7]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm10 = xmm6[0],xmm8[1],xmm6[2,3]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
+; AVX2-FP-NEXT: vpshufb %xmm2, %xmm10, %xmm10
+; AVX2-FP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0,1,2,3,4],ymm10[5,6,7],ymm1[8,9,10,11,12],ymm10[13,14,15]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15]
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm5[1,2],ymm0[3],ymm5[4],ymm0[5],ymm5[6,7],ymm0[8],ymm5[9,10],ymm0[11],ymm5[12],ymm0[13],ymm5[14,15]
-; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm11
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm11[0],xmm2[1],xmm11[2],xmm2[3]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
-; AVX2-FP-NEXT: vpshufb %ymm11, %ymm1, %ymm1
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm12 = [6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
-; AVX2-FP-NEXT: vpshufb %xmm12, %xmm2, %xmm2
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm8[0,1],xmm9[2],xmm8[3]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13]
-; AVX2-FP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm10 = ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5],ymm9[6],ymm7[7,8],ymm9[9],ymm7[10,11],ymm9[12],ymm7[13],ymm9[14],ymm7[15]
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm12[6],ymm10[7]
+; AVX2-FP-NEXT: vpshufb %ymm11, %ymm10, %ymm10
+; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm11 = ymm6[0,1],ymm1[2],ymm6[3],ymm1[4],ymm6[5,6],ymm1[7],ymm6[8,9],ymm1[10],ymm6[11],ymm1[12],ymm6[13,14],ymm1[15]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm11, %xmm12
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm12[3,4],xmm11[5,6,7]
+; AVX2-FP-NEXT: vpshufb %xmm13, %xmm11, %xmm11
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3,4,5,6,7]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm11 = xmm14[0],xmm15[1],xmm14[2,3]
+; AVX2-FP-NEXT: vpshufb %xmm2, %xmm11, %xmm2
; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-FP-NEXT: vmovdqa %ymm10, %ymm5
-; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm5[1],ymm10[2],ymm5[3],ymm10[4,5],ymm5[6],ymm10[7,8],ymm5[9],ymm10[10],ymm5[11],ymm10[12,13],ymm5[14],ymm10[15]
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm14 = ymm1[2,3,0,1]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4],ymm1[5],ymm14[6],ymm1[7]
-; AVX2-FP-NEXT: vpshufb %ymm11, %ymm1, %ymm1
-; AVX2-FP-NEXT: vmovdqa %ymm7, %ymm4
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0],ymm7[1,2],ymm13[3],ymm7[4],ymm13[5],ymm7[6,7],ymm13[8],ymm7[9,10],ymm13[11],ymm7[12],ymm13[13],ymm7[14,15]
-; AVX2-FP-NEXT: vextracti128 $1, %ymm11, %xmm14
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm11 = xmm14[0],xmm11[1],xmm14[2],xmm11[3]
-; AVX2-FP-NEXT: vpshufb %xmm12, %xmm11, %xmm11
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm11 = xmm6[0,1],xmm15[2],xmm6[3]
-; AVX2-FP-NEXT: vpshufb %xmm3, %xmm11, %xmm3
-; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2,3,4],ymm3[5,6,7],ymm1[8,9,10,11,12],ymm3[13,14,15]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm11 = [12,13,14,15,4,5,14,15,8,9,10,11,12,13,14,15]
-; AVX2-FP-NEXT: vpshufb %xmm11, %xmm6, %xmm3
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,0,1,10,11,8,9,10,11,12,13,14,15]
-; AVX2-FP-NEXT: vpshufb %xmm0, %xmm15, %xmm12
-; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm12[0],xmm3[0],xmm12[1],xmm3[1]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm6 = ymm10[0,1],ymm5[2],ymm10[3],ymm5[4],ymm10[5,6],ymm5[7],ymm10[8,9],ymm5[10],ymm10[11],ymm5[12],ymm10[13,14],ymm5[15]
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4],ymm6[5,6],ymm7[7]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm13[1],ymm4[2,3],ymm13[4],ymm4[5],ymm13[6],ymm4[7,8],ymm13[9],ymm4[10,11],ymm13[12],ymm4[13],ymm13[14],ymm4[15]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0,1,2,3,4],ymm2[5,6,7],ymm10[8,9,10,11,12],ymm2[13,14,15]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7,8],ymm3[9],ymm0[10],ymm3[11],ymm0[12,13],ymm3[14],ymm0[15]
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm2[2,3,0,1]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4],ymm2[5],ymm10[6],ymm2[7]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm10 = ymm5[0],ymm4[1,2],ymm5[3],ymm4[4],ymm5[5],ymm4[6,7],ymm5[8],ymm4[9,10],ymm5[11],ymm4[12],ymm5[13],ymm4[14,15]
+; AVX2-FP-NEXT: vmovdqa %ymm4, %ymm11
+; AVX2-FP-NEXT: vextracti128 $1, %ymm10, %xmm12
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0],xmm10[1],xmm12[2],xmm10[3]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
+; AVX2-FP-NEXT: vpshufb %ymm12, %ymm2, %ymm2
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm13 = [6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
+; AVX2-FP-NEXT: vpshufb %xmm13, %xmm10, %xmm10
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm8[2],xmm10[3]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13]
+; AVX2-FP-NEXT: vpshufb %xmm5, %xmm0, %xmm0
+; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7],ymm2[8,9,10,11,12],ymm0[13,14,15]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm7[1],ymm9[2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10],ymm7[11],ymm9[12,13],ymm7[14],ymm9[15]
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7]
+; AVX2-FP-NEXT: vpshufb %ymm12, %ymm0, %ymm0
+; AVX2-FP-NEXT: vmovdqa %ymm1, %ymm3
+; AVX2-FP-NEXT: vmovdqa %ymm6, %ymm4
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm6[1,2],ymm1[3],ymm6[4],ymm1[5],ymm6[6,7],ymm1[8],ymm6[9,10],ymm1[11],ymm6[12],ymm1[13],ymm6[14,15]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm12
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm12[0],xmm2[1],xmm12[2],xmm2[3]
+; AVX2-FP-NEXT: vpshufb %xmm13, %xmm2, %xmm2
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1],xmm15[2],xmm14[3]
+; AVX2-FP-NEXT: vpshufb %xmm5, %xmm2, %xmm1
+; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [12,13,14,15,4,5,14,15,8,9,10,11,12,13,14,15]
+; AVX2-FP-NEXT: vpshufb %xmm0, %xmm14, %xmm1
+; AVX2-FP-NEXT: vmovdqa 128(%rdi), %xmm2
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,0,1,10,11,8,9,10,11,12,13,14,15]
+; AVX2-FP-NEXT: vpshufb %xmm14, %xmm2, %xmm2
+; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0,1],ymm7[2],ymm9[3],ymm7[4],ymm9[5,6],ymm7[7],ymm9[8,9],ymm7[10],ymm9[11],ymm7[12],ymm9[13,14],ymm7[15]
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4],ymm2[5,6],ymm6[7]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13],ymm3[14],ymm4[15]
; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm5
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7]
; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15]
; AVX2-FP-NEXT: # ymm5 = mem[0,1,0,1]
-; AVX2-FP-NEXT: vpshufb %ymm5, %ymm6, %ymm6
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
-; AVX2-FP-NEXT: vpshufb %xmm7, %xmm4, %xmm4
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4,5,6,7]
-; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FP-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
-; AVX2-FP-NEXT: # ymm4 = mem[0,1],ymm4[2],mem[3],ymm4[4],mem[5,6],ymm4[7],mem[8,9],ymm4[10],mem[11],ymm4[12],mem[13,14],ymm4[15]
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4],ymm4[5,6],ymm6[7]
-; AVX2-FP-NEXT: vpshufb %ymm5, %ymm4, %ymm4
-; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload
-; AVX2-FP-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
-; AVX2-FP-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3],mem[4],ymm5[5],mem[6],ymm5[7,8],mem[9],ymm5[10,11],mem[12],ymm5[13],mem[14],ymm5[15]
-; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm6
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3,4],xmm6[5,6,7]
-; AVX2-FP-NEXT: vpshufb %xmm7, %xmm5, %xmm5
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7]
-; AVX2-FP-NEXT: vpshufb %xmm11, %xmm8, %xmm5
-; AVX2-FP-NEXT: vpshufb %xmm0, %xmm9, %xmm0
-; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
+; AVX2-FP-NEXT: vpshufb %ymm5, %ymm2, %ymm2
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
+; AVX2-FP-NEXT: vpshufb %xmm6, %xmm4, %xmm4
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FP-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-FP-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3],ymm2[4],mem[5,6],ymm2[7],mem[8,9],ymm2[10],mem[11],ymm2[12],mem[13,14],ymm2[15]
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6],ymm3[7]
+; AVX2-FP-NEXT: vpshufb %ymm5, %ymm2, %ymm2
+; AVX2-FP-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload
+; AVX2-FP-NEXT: # ymm3 = ymm11[0],mem[1],ymm11[2,3],mem[4],ymm11[5],mem[6],ymm11[7,8],mem[9],ymm11[10,11],mem[12],ymm11[13],mem[14],ymm11[15]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7]
+; AVX2-FP-NEXT: vpshufb %xmm6, %xmm3, %xmm3
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-FP-NEXT: vpshufb %xmm0, %xmm10, %xmm0
+; AVX2-FP-NEXT: vmovdqa 288(%rdi), %xmm3
+; AVX2-FP-NEXT: vpshufb %xmm14, %xmm3, %xmm3
+; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
-; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rsi)
-; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FP-NEXT: vmovaps %ymm4, (%rsi)
-; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rdx)
-; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FP-NEXT: vmovaps %ymm4, (%rdx)
-; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rcx)
-; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FP-NEXT: vmovaps %ymm4, (%rcx)
-; AVX2-FP-NEXT: vmovdqa %ymm2, 32(%r8)
-; AVX2-FP-NEXT: vmovdqa %ymm1, (%r8)
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
+; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FP-NEXT: vmovaps %ymm2, 32(%rsi)
+; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FP-NEXT: vmovaps %ymm2, (%rsi)
+; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FP-NEXT: vmovaps %ymm2, 32(%rdx)
+; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FP-NEXT: vmovaps %ymm2, (%rdx)
+; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FP-NEXT: vmovaps %ymm2, 32(%rcx)
+; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FP-NEXT: vmovaps %ymm2, (%rcx)
+; AVX2-FP-NEXT: vmovdqa %ymm8, 32(%r8)
+; AVX2-FP-NEXT: vmovdqa %ymm15, (%r8)
; AVX2-FP-NEXT: vmovdqa %ymm0, 32(%r9)
-; AVX2-FP-NEXT: vmovdqa %ymm3, (%r9)
-; AVX2-FP-NEXT: addq $264, %rsp # imm = 0x108
+; AVX2-FP-NEXT: vmovdqa %ymm1, (%r9)
+; AVX2-FP-NEXT: addq $280, %rsp # imm = 0x118
; AVX2-FP-NEXT: vzeroupper
; AVX2-FP-NEXT: retq
;
; AVX2-FCP-LABEL: load_i16_stride5_vf32:
; AVX2-FCP: # %bb.0:
-; AVX2-FCP-NEXT: subq $296, %rsp # imm = 0x128
+; AVX2-FCP-NEXT: subq $264, %rsp # imm = 0x108
; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm15
; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm1
-; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm3
-; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm14
+; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm2
+; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm3
; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm4
; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm5
; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm6
; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm7
; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm6[0],ymm7[1,2],ymm6[3],ymm7[4],ymm6[5],ymm7[6,7],ymm6[8],ymm7[9,10],ymm6[11],ymm7[12],ymm6[13],ymm7[14,15]
-; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [1,3,0,2,4,6,1,3]
; AVX2-FCP-NEXT: vpermd %ymm8, %ymm10, %ymm8
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,6,7,8,9,14,15,4,5,14,15,4,5,2,3,16,17,22,23,24,25,30,31,20,21,30,31,20,21,18,19]
@@ -3668,14 +3724,14 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm9, %xmm13
; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} xmm9 = [65535,65535,65535,65535,65535,65535,65535,0]
; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm13, %ymm8, %ymm8
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm3[1],ymm14[2,3],ymm3[4],ymm14[5],ymm3[6],ymm14[7,8],ymm3[9],ymm14[10,11],ymm3[12],ymm14[13],ymm3[14],ymm14[15]
-; AVX2-FCP-NEXT: vmovdqa %ymm14, %ymm0
-; AVX2-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15]
+; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm0
+; AVX2-FCP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill
; AVX2-FCP-NEXT: vextracti128 $1, %ymm13, %xmm14
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1,2,3],xmm13[4,5],xmm14[6,7]
; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm13, %xmm12
; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm15[0],ymm1[1,2],ymm15[3],ymm1[4],ymm15[5],ymm1[6,7],ymm15[8],ymm1[9,10],ymm15[11],ymm1[12],ymm15[13],ymm1[14,15]
-; AVX2-FCP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa %ymm15, %ymm2
; AVX2-FCP-NEXT: vpermd %ymm13, %ymm10, %ymm10
; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm10
@@ -3686,13 +3742,14 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [2,3,4,5,10,11,0,1,14,15,2,3,12,13,0,1,18,19,20,21,26,27,16,17,30,31,18,19,28,29,16,17]
; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm10, %ymm10
; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15]
-; AVX2-FCP-NEXT: vmovdqa %ymm4, %ymm7
+; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vextracti128 $1, %ymm13, %xmm15
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3],xmm13[4,5,6],xmm15[7]
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11]
; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm13, %xmm13
; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm13, %ymm10, %ymm13
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4,5],ymm0[6],ymm3[7,8],ymm0[9],ymm3[10],ymm0[11],ymm3[12,13],ymm0[14],ymm3[15]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7,8],ymm3[9],ymm0[10],ymm3[11],ymm0[12,13],ymm3[14],ymm0[15]
; AVX2-FCP-NEXT: vextracti128 $1, %ymm10, %xmm0
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3],xmm10[4,5,6],xmm0[7]
; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm0, %xmm0
@@ -3704,14 +3761,15 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [0,3,1,3,0,3,5,7]
; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm0, %ymm10, %ymm0
; AVX2-FCP-NEXT: vpermd %ymm12, %ymm14, %ymm9
-; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27]
-; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm9, %ymm9
+; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27]
+; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm9, %ymm9
; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm8[0,1,2,3,4],ymm9[5,6,7],ymm8[8,9,10,11,12],ymm9[13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm9[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm10
; AVX2-FCP-NEXT: vpermd %ymm10, %ymm14, %ymm8
-; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm8, %ymm8
+; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm8, %ymm8
+; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm9
; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0,1,2,3,4],ymm8[5,6,7],ymm11[8,9,10,11,12],ymm8[13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm8[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -3728,14 +3786,10 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15]
-; AVX2-FCP-NEXT: vmovdqa %ymm5, %ymm9
-; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm8
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm8[3,4],xmm0[5,6,7]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15]
+; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5],ymm7[6],ymm6[7,8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13],ymm7[14],ymm6[15]
; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [0,2,0,0,5,7,2,4]
; AVX2-FCP-NEXT: vpermd %ymm8, %ymm14, %ymm8
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23]
@@ -3746,65 +3800,64 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [1,4,6,0,1,4,6,0]
; AVX2-FCP-NEXT: # ymm8 = mem[0,1,0,1]
; AVX2-FCP-NEXT: vpermd %ymm12, %ymm8, %ymm11
-; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm11, %ymm11
+; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm11, %ymm11
+; AVX2-FCP-NEXT: vmovdqa %ymm9, %ymm5
; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm0[0,1,2,3,4],ymm11[5,6,7],ymm0[8,9,10,11,12],ymm11[13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa %ymm3, %ymm6
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm3[0,1],ymm0[2],ymm3[3],ymm0[4],ymm3[5,6],ymm0[7],ymm3[8,9],ymm0[10],ymm3[11],ymm0[12],ymm3[13,14],ymm0[15]
+; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm4 # 32-byte Reload
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm4[0,1],ymm3[2],ymm4[3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8,9],ymm3[10],ymm4[11],ymm3[12],ymm4[13,14],ymm3[15]
+; AVX2-FCP-NEXT: vmovdqa %ymm3, %ymm9
; AVX2-FCP-NEXT: vextracti128 $1, %ymm11, %xmm13
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm13[3,4],xmm11[5,6,7]
; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm11, %xmm1
-; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm15[0],ymm3[1],ymm15[2,3],ymm3[4],ymm15[5],ymm3[6],ymm15[7,8],ymm3[9],ymm15[10,11],ymm3[12],ymm15[13],ymm3[14],ymm15[15]
; AVX2-FCP-NEXT: vpermd %ymm11, %ymm14, %ymm11
; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm11, %ymm2
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
; AVX2-FCP-NEXT: vpermd %ymm10, %ymm8, %ymm2
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27]
+; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm2
; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm9[1,2],ymm7[3],ymm9[4],ymm7[5],ymm9[6,7],ymm7[8],ymm9[9,10],ymm7[11],ymm9[12],ymm7[13],ymm9[14,15]
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15]
-; AVX2-FCP-NEXT: vmovdqa %ymm4, %ymm7
-; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,3,0,0,5,0,2,7]
-; AVX2-FCP-NEXT: vpermd %ymm2, %ymm11, %ymm2
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21]
-; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm2, %ymm2
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10],ymm6[11],ymm7[12,13],ymm6[14],ymm7[15]
+; AVX2-FCP-NEXT: vmovdqa %ymm6, %ymm11
+; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,3,0,0,5,0,2,7]
+; AVX2-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21]
+; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm13 = [4,1,6,3]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0],ymm6[1,2],ymm5[3],ymm6[4],ymm5[5],ymm6[6,7],ymm5[8],ymm6[9,10],ymm5[11],ymm6[12],ymm5[13],ymm6[14,15]
+; AVX2-FCP-NEXT: vpermd %ymm8, %ymm13, %ymm8
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
-; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm1, %xmm1
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,4,7,0,2,4,7,0]
-; AVX2-FCP-NEXT: # ymm2 = mem[0,1,0,1]
-; AVX2-FCP-NEXT: vpermd %ymm12, %ymm2, %ymm8
-; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25]
-; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm8, %ymm8
+; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm8, %xmm8
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2],ymm1[3,4,5,6,7]
+; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [2,4,7,0,2,4,7,0]
+; AVX2-FCP-NEXT: # ymm0 = mem[0,1,0,1]
+; AVX2-FCP-NEXT: vpermd %ymm12, %ymm0, %ymm8
+; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm7 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25]
+; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm8, %ymm8
; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm1[0,1,2,3,4],ymm8[5,6,7],ymm1[8,9,10,11,12],ymm8[13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm6[1,2],ymm0[3],ymm6[4],ymm0[5],ymm6[6,7],ymm0[8],ymm6[9,10],ymm0[11],ymm6[12],ymm0[13],ymm6[14,15]
-; AVX2-FCP-NEXT: vmovdqa %ymm6, %ymm8
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm0, %xmm0
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm15[1],ymm3[2],ymm15[3],ymm3[4,5],ymm15[6],ymm3[7,8],ymm15[9],ymm3[10],ymm15[11],ymm3[12,13],ymm15[14],ymm3[15]
-; AVX2-FCP-NEXT: vmovdqa %ymm3, %ymm9
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0],ymm4[1,2],ymm9[3],ymm4[4],ymm9[5],ymm4[6,7],ymm9[8],ymm4[9,10],ymm9[11],ymm4[12],ymm9[13],ymm4[14,15]
+; AVX2-FCP-NEXT: vmovdqa %ymm4, %ymm8
+; AVX2-FCP-NEXT: vpermd %ymm1, %ymm13, %ymm1
+; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm1, %xmm1
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm3[0],ymm15[1],ymm3[2],ymm15[3],ymm3[4,5],ymm15[6],ymm3[7,8],ymm15[9],ymm3[10],ymm15[11],ymm3[12,13],ymm15[14],ymm3[15]
; AVX2-FCP-NEXT: vmovdqa %ymm15, %ymm14
-; AVX2-FCP-NEXT: vpermd %ymm1, %ymm11, %ymm1
-; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm1, %ymm1
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FCP-NEXT: vpermd %ymm10, %ymm2, %ymm1
-; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1],ymm5[2],ymm7[3],ymm5[4],ymm7[5,6],ymm5[7],ymm7[8,9],ymm5[10],ymm7[11],ymm5[12],ymm7[13,14],ymm5[15]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5],ymm1[6],mem[7,8],ymm1[9],mem[10,11],ymm1[12],mem[13],ymm1[14],mem[15]
+; AVX2-FCP-NEXT: vpermd %ymm13, %ymm2, %ymm2
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-FCP-NEXT: vpermd %ymm10, %ymm0, %ymm0
+; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm0, %ymm0
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm0 = mem[0,1],ymm11[2],mem[3],ymm11[4],mem[5,6],ymm11[7],mem[8,9],ymm11[10],mem[11],ymm11[12],mem[13,14],ymm11[15]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15]
; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7]
; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,3,0,0,6,0,3,5]
@@ -3820,12 +3873,12 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm7 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31]
; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm6, %ymm6
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7]
-; AVX2-FCP-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm3 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm3 = ymm8[0],mem[1],ymm8[2,3],mem[4],ymm8[5],mem[6],ymm8[7,8],mem[9],ymm8[10,11],mem[12],ymm8[13],mem[14],ymm8[15]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5],ymm9[6],ymm8[7,8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13],ymm9[14],ymm8[15]
; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm6
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3,4],xmm6[5,6,7]
; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm3
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm9[0,1],ymm14[2],ymm9[3],ymm14[4],ymm9[5,6],ymm14[7],ymm9[8,9],ymm14[10],ymm9[11],ymm14[12],ymm9[13,14],ymm14[15]
+; AVX2-FCP-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm5 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm5 = mem[0,1],ymm14[2],mem[3],ymm14[4],mem[5,6],ymm14[7],mem[8,9],ymm14[10],mem[11],ymm14[12],mem[13,14],ymm14[15]
; AVX2-FCP-NEXT: vpermd %ymm5, %ymm2, %ymm2
; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7]
@@ -3849,338 +3902,345 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FCP-NEXT: vmovaps %ymm2, (%r8)
; AVX2-FCP-NEXT: vmovdqa %ymm1, 32(%r9)
; AVX2-FCP-NEXT: vmovdqa %ymm0, (%r9)
-; AVX2-FCP-NEXT: addq $296, %rsp # imm = 0x128
+; AVX2-FCP-NEXT: addq $264, %rsp # imm = 0x108
; AVX2-FCP-NEXT: vzeroupper
; AVX2-FCP-NEXT: retq
;
; AVX512-LABEL: load_i16_stride5_vf32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa 256(%rdi), %ymm0
-; AVX512-NEXT: vmovdqa 288(%rdi), %ymm1
-; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15]
+; AVX512-NEXT: vmovdqa 192(%rdi), %ymm0
+; AVX512-NEXT: vmovdqa 224(%rdi), %ymm1
+; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,4,5,14,15,8,9,2,3,12,13,6,7]
-; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm4
-; AVX512-NEXT: vmovdqa 192(%rdi), %ymm3
-; AVX512-NEXT: vmovdqa 224(%rdi), %ymm9
-; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm9[0],ymm3[1],ymm9[2,3],ymm3[4],ymm9[5],ymm3[6],ymm9[7,8],ymm3[9],ymm9[10,11],ymm3[12],ymm9[13],ymm3[14],ymm9[15]
-; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm6
-; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3],ymm5[4,5,6,7]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,8,9,2,3,12,13,6,7,16,17,26,27,26,27,30,31,24,25,18,19,28,29,22,23]
-; AVX512-NEXT: vpshufb %ymm7, %ymm5, %ymm5
-; AVX512-NEXT: vmovdqa64 176(%rdi), %xmm20
-; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm20[3,1,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7]
-; AVX512-NEXT: vmovdqa 160(%rdi), %xmm6
-; AVX512-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[0,2,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,3,2,3,4,5,6,7]
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm10[0],xmm8[0],xmm10[1],xmm8[1]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3,4,5,6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm14 = ymm5[0,1,2,3,4],ymm4[5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5,6,7]
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,8,9,2,3,12,13,6,7,16,17,26,27,26,27,30,31,24,25,18,19,28,29,22,23]
+; AVX512-NEXT: vpshufb %ymm3, %ymm2, %ymm4
+; AVX512-NEXT: vmovdqa 176(%rdi), %xmm2
+; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[3,1,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[2,1,2,3,4,5,6,7]
+; AVX512-NEXT: vmovdqa 160(%rdi), %xmm5
+; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[0,2,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,3,2,3,4,5,6,7]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3,4,5,6,7]
+; AVX512-NEXT: vmovdqa 128(%rdi), %xmm6
+; AVX512-NEXT: vmovdqa 144(%rdi), %xmm7
+; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm6[0],xmm7[1],xmm6[2,3]
+; AVX512-NEXT: vpshufb %xmm3, %xmm8, %xmm3
+; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm10
; AVX512-NEXT: vmovdqa (%rdi), %ymm8
-; AVX512-NEXT: vmovdqa 32(%rdi), %ymm10
-; AVX512-NEXT: vmovdqa 64(%rdi), %ymm4
-; AVX512-NEXT: vmovdqa 96(%rdi), %ymm5
-; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15]
+; AVX512-NEXT: vmovdqa 32(%rdi), %ymm9
+; AVX512-NEXT: vmovdqa 64(%rdi), %ymm3
+; AVX512-NEXT: vmovdqa 96(%rdi), %ymm4
+; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm3[0],ymm4[1,2],ymm3[3],ymm4[4],ymm3[5],ymm4[6,7],ymm3[8],ymm4[9,10],ymm3[11],ymm4[12],ymm3[13],ymm4[14,15]
; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1]
; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm12[5],ymm11[6],ymm12[7]
; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[6,7,16,17,26,27,20,21,30,31,24,25],zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vpblendw {{.*#+}} ymm12 = ymm8[0],ymm10[1],ymm8[2,3],ymm10[4],ymm8[5],ymm10[6],ymm8[7,8],ymm10[9],ymm8[10,11],ymm10[12],ymm8[13],ymm10[14],ymm8[15]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm12 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5],ymm9[6],ymm8[7,8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13],ymm9[14],ymm8[15]
; AVX512-NEXT: vextracti128 $1, %ymm12, %xmm13
; AVX512-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1,2,3],xmm12[4,5],xmm13[6,7]
; AVX512-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[u,u,u,u,u,u]
-; AVX512-NEXT: vpor %ymm11, %ymm12, %ymm15
-; AVX512-NEXT: vmovdqa 144(%rdi), %xmm11
-; AVX512-NEXT: vmovdqa 128(%rdi), %xmm12
-; AVX512-NEXT: vpblendd {{.*#+}} xmm13 = xmm12[0],xmm11[1],xmm12[2,3]
-; AVX512-NEXT: vpshufb %xmm7, %xmm13, %xmm7
-; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm18 & (zmm7 ^ zmm15))
-; AVX512-NEXT: vinserti64x4 $1, %ymm14, %zmm7, %zmm16
-; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0],ymm9[1],ymm3[2,3],ymm9[4],ymm3[5],ymm9[6],ymm3[7,8],ymm9[9],ymm3[10,11],ymm9[12],ymm3[13],ymm9[14],ymm3[15]
-; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm14
-; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm14[1],ymm7[2],ymm14[3],ymm7[4,5,6,7]
+; AVX512-NEXT: vpor %ymm11, %ymm12, %ymm11
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = zmm10 ^ (zmm16 & (zmm11 ^ zmm10))
+; AVX512-NEXT: vextracti64x4 $1, %zmm11, %ymm13
+; AVX512-NEXT: vmovdqa 256(%rdi), %ymm10
+; AVX512-NEXT: vmovdqa 288(%rdi), %ymm12
+; AVX512-NEXT: vpblendw {{.*#+}} ymm15 = ymm12[0,1],ymm10[2],ymm12[3],ymm10[4],ymm12[5,6],ymm10[7],ymm12[8,9],ymm10[10],ymm12[11],ymm10[12],ymm12[13,14],ymm10[15]
+; AVX512-NEXT: vextracti128 $1, %ymm15, %xmm14
+; AVX512-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3,4],xmm15[5,6,7]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,4,5,14,15,8,9,2,3,12,13,6,7]
+; AVX512-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5,6,7]
+; AVX512-NEXT: vinserti64x4 $1, %ymm13, %zmm11, %zmm17
+; AVX512-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
+; AVX512-NEXT: vextracti128 $1, %ymm13, %xmm14
+; AVX512-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4,5,6,7]
; AVX512-NEXT: vmovdqa {{.*#+}} ymm14 = [2,3,12,13,0,1,0,1,10,11,4,5,14,15,8,9,18,19,28,29,16,17,16,17,26,27,20,21,30,31,24,25]
-; AVX512-NEXT: vpshufb %ymm14, %ymm7, %ymm7
-; AVX512-NEXT: vpsrlq $48, %xmm20, %xmm15
-; AVX512-NEXT: vpshufd {{.*#+}} xmm13 = xmm6[0,3,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[1,2,2,3,4,5,6,7]
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm7[3,4,5,6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm13[0,1,2,3],ymm7[4,5,6,7]
-; AVX512-NEXT: vmovdqa %ymm0, %ymm2
-; AVX512-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15]
-; AVX512-NEXT: vextracti128 $1, %ymm13, %xmm15
-; AVX512-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0],xmm13[1],xmm15[2],xmm13[3]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,6,7,0,1,10,11,4,5,14,15,8,9]
-; AVX512-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm13[5,6,7]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm13 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15]
-; AVX512-NEXT: vpermq {{.*#+}} ymm15 = ymm13[2,3,0,1]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm15 = ymm10[0],ymm8[1],ymm10[2],ymm8[3],ymm10[4,5],ymm8[6],ymm10[7,8],ymm8[9],ymm10[10],ymm8[11],ymm10[12,13],ymm8[14],ymm10[15]
-; AVX512-NEXT: vextracti128 $1, %ymm15, %xmm0
-; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3],xmm15[4,5,6],xmm0[7]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[8,9,18,19,28,29,22,23,16,17,26,27],zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,12,13,6,7,0,1,10,11,4,5,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[u,u,u,u,u,u]
-; AVX512-NEXT: vpor %ymm0, %ymm13, %ymm0
-; AVX512-NEXT: vpblendd {{.*#+}} xmm13 = xmm12[0,1],xmm11[2],xmm12[3]
+; AVX512-NEXT: vpshufb %ymm14, %ymm13, %ymm13
+; AVX512-NEXT: vpsrlq $48, %xmm2, %xmm15
+; AVX512-NEXT: vpshufd {{.*#+}} xmm11 = xmm5[0,3,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,2,2,3,4,5,6,7]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm11[0],xmm15[0],xmm11[1],xmm15[1]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm13[3,4,5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm13 = xmm6[0,1],xmm7[2],xmm6[3]
; AVX512-NEXT: vpshufb %xmm14, %xmm13, %xmm13
; AVX512-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (zmm18 & (zmm13 ^ zmm0))
-; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm13, %zmm19
-; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm3[1],ymm9[2],ymm3[3],ymm9[4,5],ymm3[6],ymm9[7,8],ymm3[9],ymm9[10],ymm3[11],ymm9[12,13],ymm3[14],ymm9[15]
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm7
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4,5,6,7]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [4,5,14,15,12,13,2,3,12,13,6,7,0,1,10,11,20,21,30,31,28,29,18,19,28,29,22,23,16,17,26,27]
-; AVX512-NEXT: vpshufb %ymm7, %ymm0, %ymm0
-; AVX512-NEXT: vmovdqa64 %ymm7, %ymm21
-; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,1,1,3]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,7]
-; AVX512-NEXT: vmovdqa64 %xmm20, %xmm15
-; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm7[2],xmm20[2],xmm7[3],xmm20[3]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm0[3,4,5,6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15]
-; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm13
-; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm13[0,1,2],xmm7[3,4],xmm13[5,6,7]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,8,9,2,3,12,13,6,7,0,1,10,11]
-; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1,2,3,4],ymm7[5,6,7]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1],ymm3[2],ymm9[3],ymm3[4],ymm9[5,6],ymm3[7],ymm9[8,9],ymm3[10],ymm9[11],ymm3[12],ymm9[13,14],ymm3[15]
-; AVX512-NEXT: vmovdqa64 %ymm9, %ymm20
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm13
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2],ymm0[3,4,5,6,7]
+; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm11
+; AVX512-NEXT: vpblendw {{.*#+}} ymm13 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13],ymm3[14],ymm4[15]
+; AVX512-NEXT: vpermq {{.*#+}} ymm14 = ymm13[2,3,0,1]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5],ymm13[6,7]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[8,9,18,19,28,29,22,23,16,17,26,27],zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vpblendw {{.*#+}} ymm14 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15]
+; AVX512-NEXT: vextracti128 $1, %ymm14, %xmm15
+; AVX512-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1],xmm15[2,3],xmm14[4,5,6],xmm15[7]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[2,3,12,13,6,7,0,1,10,11,4,5,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u]
+; AVX512-NEXT: vpor %ymm13, %ymm14, %ymm13
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm13 = zmm11 ^ (zmm16 & (zmm13 ^ zmm11))
+; AVX512-NEXT: vextracti64x4 $1, %zmm13, %ymm11
+; AVX512-NEXT: vpblendw {{.*#+}} ymm14 = ymm10[0],ymm12[1,2],ymm10[3],ymm12[4],ymm10[5],ymm12[6,7],ymm10[8],ymm12[9,10],ymm10[11],ymm12[12],ymm10[13],ymm12[14,15]
+; AVX512-NEXT: vextracti128 $1, %ymm14, %xmm15
+; AVX512-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2],xmm14[3]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,6,7,0,1,10,11,4,5,14,15,8,9]
+; AVX512-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5,6,7]
+; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm18
+; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
+; AVX512-NEXT: vextracti128 $1, %ymm11, %xmm14
+; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm14[1],ymm11[2,3,4,5,6,7]
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm14 = [4,5,14,15,12,13,2,3,12,13,6,7,0,1,10,11,20,21,30,31,28,29,18,19,28,29,22,23,16,17,26,27]
+; AVX512-NEXT: vpshufb %ymm14, %ymm11, %ymm11
+; AVX512-NEXT: vpshufd {{.*#+}} xmm15 = xmm5[0,1,1,3]
+; AVX512-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,6,7]
+; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm15[2],xmm2[2],xmm15[3],xmm2[3]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1,2],xmm11[3,4,5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm15[0,1,2,3],ymm11[4,5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm15 = xmm7[0],xmm6[1],xmm7[2,3]
+; AVX512-NEXT: vpshufb %xmm14, %xmm15, %xmm14
+; AVX512-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm14, %zmm11
+; AVX512-NEXT: vpblendw {{.*#+}} ymm14 = ymm9[0,1],ymm8[2],ymm9[3],ymm8[4],ymm9[5,6],ymm8[7],ymm9[8,9],ymm8[10],ymm9[11],ymm8[12],ymm9[13,14],ymm8[15]
+; AVX512-NEXT: vextracti128 $1, %ymm14, %xmm15
+; AVX512-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3,4],xmm14[5,6,7]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm15 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13],ymm4[14],ymm3[15]
+; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm15[2,3,0,1]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5],ymm13[6],ymm15[7]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2],ymm13[3,4,5,6,7]
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm13 = zmm11 ^ (zmm16 & (zmm13 ^ zmm11))
+; AVX512-NEXT: vextracti64x4 $1, %zmm13, %ymm11
+; AVX512-NEXT: vpblendw {{.*#+}} ymm14 = ymm12[0],ymm10[1],ymm12[2,3],ymm10[4],ymm12[5],ymm10[6],ymm12[7,8],ymm10[9],ymm12[10,11],ymm10[12],ymm12[13],ymm10[14],ymm12[15]
+; AVX512-NEXT: vextracti128 $1, %ymm14, %xmm15
+; AVX512-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3,4],xmm15[5,6,7]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,8,9,2,3,12,13,6,7,0,1,10,11]
+; AVX512-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5,6,7]
+; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm16
+; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15]
+; AVX512-NEXT: vextracti128 $1, %ymm11, %xmm13
+; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm13[2],ymm11[3,4,5,6,7]
; AVX512-NEXT: vmovdqa {{.*#+}} ymm13 = [6,7,6,7,8,9,4,5,14,15,8,9,2,3,12,13,22,23,22,23,24,25,20,21,30,31,24,25,18,19,28,29]
-; AVX512-NEXT: vpshufb %ymm13, %ymm0, %ymm0
-; AVX512-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0],xmm6[1],xmm15[2,3]
-; AVX512-NEXT: vmovdqa64 %xmm15, %xmm22
-; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm0[3,4,5,6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm14 = xmm11[0,1],xmm12[2],xmm11[3]
-; AVX512-NEXT: vpshufb %xmm13, %xmm14, %xmm13
+; AVX512-NEXT: vpshufb %ymm13, %ymm11, %ymm11
+; AVX512-NEXT: vpblendd {{.*#+}} xmm15 = xmm2[0],xmm5[1],xmm2[2,3]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1,2],xmm11[3,4,5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm15[0,1,2,3],ymm11[4,5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm15 = xmm7[0,1],xmm6[2],xmm7[3]
+; AVX512-NEXT: vpshufb %xmm13, %xmm15, %xmm13
; AVX512-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm13, %zmm0
-; AVX512-NEXT: vpblendw {{.*#+}} ymm13 = ymm8[0],ymm10[1,2],ymm8[3],ymm10[4],ymm8[5],ymm10[6,7],ymm8[8],ymm10[9,10],ymm8[11],ymm10[12],ymm8[13],ymm10[14,15]
-; AVX512-NEXT: vextracti128 $1, %ymm13, %xmm14
-; AVX512-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2],xmm13[3]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm14 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15]
-; AVX512-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4],ymm14[5],ymm15[6],ymm14[7]
+; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm11
+; AVX512-NEXT: vpblendw {{.*#+}} ymm13 = ymm8[0],ymm9[1,2],ymm8[3],ymm9[4],ymm8[5],ymm9[6,7],ymm8[8],ymm9[9,10],ymm8[11],ymm9[12],ymm8[13],ymm9[14,15]
+; AVX512-NEXT: vextracti128 $1, %ymm13, %xmm15
+; AVX512-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0],xmm13[1],xmm15[2],xmm13[3]
; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm15 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10],ymm3[11],ymm4[12,13],ymm3[14],ymm4[15]
+; AVX512-NEXT: vpermq {{.*#+}} ymm14 = ymm15[2,3,0,1]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4],ymm15[5],ymm14[6],ymm15[7]
; AVX512-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
; AVX512-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm14 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13],ymm1[14],ymm2[15]
-; AVX512-NEXT: vmovdqa %ymm2, %ymm9
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm13 = zmm11 ^ (mem & (zmm13 ^ zmm11))
+; AVX512-NEXT: vextracti64x4 $1, %zmm13, %ymm11
+; AVX512-NEXT: vpblendw {{.*#+}} ymm14 = ymm10[0],ymm12[1],ymm10[2,3],ymm12[4],ymm10[5],ymm12[6],ymm10[7,8],ymm12[9],ymm10[10,11],ymm12[12],ymm10[13],ymm12[14],ymm10[15]
; AVX512-NEXT: vextracti128 $1, %ymm14, %xmm15
; AVX512-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2,3],xmm14[4,5],xmm15[6,7]
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm13 = zmm0 ^ (mem & (zmm13 ^ zmm0))
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm14[u,u,0,1,10,11,4,5,14,15,8,9,2,3,12,13]
-; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT: vextracti64x4 $1, %zmm13, %ymm14
-; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0],ymm0[1,2,3,4,5,6,7],ymm14[8],ymm0[9,10,11,12,13,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm13, %zmm17
-; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[3,1,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[0,2,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,0,3,4,5,6,7]
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm13 = ymm10[0,1],ymm8[2],ymm10[3],ymm8[4],ymm10[5,6],ymm8[7],ymm10[8,9],ymm8[10],ymm10[11],ymm8[12],ymm10[13,14],ymm8[15]
-; AVX512-NEXT: vextracti128 $1, %ymm13, %xmm14
-; AVX512-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3,4],xmm13[5,6,7]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm14 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15]
-; AVX512-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6],ymm14[7]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3]
-; AVX512-NEXT: vmovdqa64 %ymm21, %ymm2
-; AVX512-NEXT: vpshufb %xmm2, %xmm11, %xmm11
-; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm18 & (zmm11 ^ zmm13))
-; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm11, %zmm7
-; AVX512-NEXT: vmovdqa64 %ymm20, %ymm2
-; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4],ymm3[5],ymm2[6,7],ymm3[8],ymm2[9,10],ymm3[11],ymm2[12],ymm3[13],ymm2[14,15]
-; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3,4,5,6,7]
-; AVX512-NEXT: vmovdqa64 %xmm22, %xmm3
-; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm6[2],xmm3[3]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[8,9,14,15,4,5,6,7,0,1,10,11,4,5,14,15,24,25,30,31,20,21,22,23,16,17,26,27,20,21,30,31]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3,4,5,6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0],ymm8[1],ymm10[2,3],ymm8[4],ymm10[5],ymm8[6],ymm10[7,8],ymm8[9],ymm10[10,11],ymm8[12],ymm10[13],ymm8[14],ymm10[15]
-; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15]
-; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6],ymm4[7]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15,24,25,18,19,28,29,22,23,20,21,22,23,20,21,30,31]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,1,10,11,4,5,14,15,8,9,2,3,12,13]
+; AVX512-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512-NEXT: vpblendw {{.*#+}} ymm14 = ymm11[0],ymm14[1,2,3,4,5,6,7],ymm11[8],ymm14[9,10,11,12,13,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7]
+; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm11
+; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15]
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3,4,5,6,7]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,14,15,4,5,6,7,0,1,10,11,4,5,14,15,24,25,30,31,20,21,22,23,16,17,26,27,20,21,30,31]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm5[2],xmm2[3]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4,5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[0,1,2,3,0,1,10,11,8,9,10,11,12,13,14,15]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[3,1,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13],ymm8[14],ymm9[15]
+; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1],ymm3[2],ymm4[3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8,9],ymm3[10],ymm4[11],ymm3[12],ymm4[13,14],ymm3[15]
+; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6],ymm3[7]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15,24,25,18,19,28,29,22,23,20,21,22,23,20,21,30,31]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
; AVX512-NEXT: movb $7, %al
; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k1}
-; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2],ymm9[3],ymm1[4,5],ymm9[6],ymm1[7,8],ymm9[9],ymm1[10],ymm9[11],ymm1[12,13],ymm9[14],ymm1[15]
-; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6],xmm2[7]
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,2,3,12,13,6,7,0,1,10,11,4,5,14,15]
-; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 {%k1}
+; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7,8],ymm10[9],ymm12[10],ymm10[11],ymm12[12,13],ymm10[14],ymm12[15]
+; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6],xmm3[7]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,2,3,12,13,6,7,0,1,10,11,4,5,14,15]
+; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-NEXT: vmovdqa64 %zmm16, (%rsi)
-; AVX512-NEXT: vmovdqa64 %zmm19, (%rdx)
-; AVX512-NEXT: vmovdqa64 %zmm7, (%rcx)
-; AVX512-NEXT: vmovdqa64 %zmm17, (%r8)
+; AVX512-NEXT: vmovdqa64 %zmm17, (%rsi)
+; AVX512-NEXT: vmovdqa64 %zmm18, (%rdx)
+; AVX512-NEXT: vmovdqa64 %zmm16, (%rcx)
+; AVX512-NEXT: vmovdqa64 %zmm11, (%r8)
; AVX512-NEXT: vmovdqa64 %zmm0, (%r9)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: load_i16_stride5_vf32:
; AVX512-FCP: # %bb.0:
-; AVX512-FCP-NEXT: vmovdqa 176(%rdi), %xmm2
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[4,5,14,15,4,5,6,7,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %xmm3
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[0,1,10,11,8,9,10,11,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm4
-; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %ymm5
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15]
+; AVX512-FCP-NEXT: vmovdqa 176(%rdi), %xmm0
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[4,5,14,15,4,5,6,7,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %xmm1
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[0,1,10,11,8,9,10,11,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm2
+; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %ymm3
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15]
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [2,4,7,1,4,6,0,0]
-; AVX512-FCP-NEXT: vpermd %ymm1, %ymm6, %ymm1
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,8,9,14,15,0,1,6,7,16,17,22,23,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpermd %ymm5, %ymm6, %ymm5
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,8,9,14,15,0,1,6,7,16,17,22,23,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [8,9,3,2,4,0,0,0]
-; AVX512-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm6
-; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %ymm0
-; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %ymm1
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3,4],xmm7[5,6,7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,4,5,14,15,8,9,2,3,12,13,6,7]
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7]
+; AVX512-FCP-NEXT: vpermi2d %ymm4, %ymm5, %ymm6
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,3,1,0,0,3,5,0]
+; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm4
+; AVX512-FCP-NEXT: vpermd %ymm4, %ymm5, %ymm5
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27]
+; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm5, %ymm5
+; AVX512-FCP-NEXT: vmovdqa64 %ymm7, %ymm20
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
+; AVX512-FCP-NEXT: vpandnq %zmm5, %zmm13, %zmm7
; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm10
; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm11
; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm8
; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm9
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm9[1,2],ymm8[3],ymm9[4],ymm8[5],ymm9[6,7],ymm8[8],ymm9[9,10],ymm8[11],ymm9[12],ymm8[13],ymm9[14,15]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [1,0,0,0,4,6,1,3]
-; AVX512-FCP-NEXT: vpermd %ymm7, %ymm12, %ymm7
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,3,16,17,22,23,24,25,30,31,20,21],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7,8],ymm11[9],ymm10[10,11],ymm11[12],ymm10[13],ymm11[14],ymm10[15]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1,2,3],xmm12[4,5],xmm13[6,7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %ymm7, %ymm12, %ymm12
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,3,1,0,0,3,5,0]
-; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm7
-; AVX512-FCP-NEXT: vpermd %ymm7, %ymm13, %ymm13
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27]
-; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm13, %ymm13
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (zmm18 & (zmm13 ^ zmm12))
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm13, %zmm16
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0],ymm10[1],ymm11[2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10],ymm10[11],ymm11[12,13],ymm10[14],ymm11[15]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3],xmm12[4,5,6],xmm13[7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm12, %ymm12
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13],ymm8[14],ymm9[15]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0],ymm9[1,2],ymm8[3],ymm9[4],ymm8[5],ymm9[6,7],ymm8[8],ymm9[9,10],ymm8[11],ymm9[12],ymm8[13],ymm9[14,15]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,0,0,0,4,6,1,3]
+; AVX512-FCP-NEXT: vpermd %ymm5, %ymm6, %ymm5
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[2,3,16,17,22,23,24,25,30,31,20,21],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7,8],ymm11[9],ymm10[10,11],ymm11[12],ymm10[13],ymm11[14],ymm10[15]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm12
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm12[1,2,3],xmm6[4,5],xmm12[6,7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %ymm5, %ymm6, %ymm12
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = (zmm12 & zmm13) | zmm7
+; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %ymm5
+; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %ymm6
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm6[0,1],ymm5[2],ymm6[3],ymm5[4],ymm6[5,6],ymm5[7],ymm6[8,9],ymm5[10],ymm6[11],ymm5[12],ymm6[13,14],ymm5[15]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm15, %xmm14
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3,4],xmm15[5,6,7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,4,5,14,15,8,9,2,3,12,13,6,7]
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [8,9,10,11,12,21,22,23]
+; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm18, %zmm7
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm12, %zmm16
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm11[0],ymm10[1],ymm11[2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10],ymm10[11],ymm11[12,13],ymm10[14],ymm11[15]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm12
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm12[2,3],xmm7[4,5,6],xmm12[7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm7, %ymm7
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13],ymm8[14],ymm9[15]
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [2,0,0,0,4,7,1,6]
-; AVX512-FCP-NEXT: vpermd %ymm13, %ymm17, %ymm13
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[0,1,18,19,20,21,26,27,16,17,30,31],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT: vpor %ymm13, %ymm12, %ymm12
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [1,3,2,0,1,3,6,0]
-; AVX512-FCP-NEXT: vpermd %ymm7, %ymm13, %ymm15
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25]
-; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm15, %ymm15
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (zmm18 & (zmm15 ^ zmm12))
-; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm6
-; AVX512-FCP-NEXT: vpsrlq $48, %xmm2, %xmm12
-; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15]
+; AVX512-FCP-NEXT: vpermd %ymm14, %ymm17, %ymm14
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[0,1,18,19,20,21,26,27,16,17,30,31],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT: vpor %ymm7, %ymm14, %ymm14
+; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm7
+; AVX512-FCP-NEXT: vpsrlq $48, %xmm0, %xmm12
+; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15]
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [0,2,5,7,4,7,0,0]
; AVX512-FCP-NEXT: vpermd %ymm12, %ymm17, %ymm12
; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[2,3,4,5,4,5,0,1,6,7,8,9,14,15,4,5,18,19,20,21,20,21,16,17,22,23,24,25,30,31,20,21]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm12[3,4,5,6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm12[3,4,5,6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm7[0,1,2,3],ymm12[4,5,6,7]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,3,2,0,1,3,6,0]
+; AVX512-FCP-NEXT: vpermd %ymm4, %ymm7, %ymm15
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25]
+; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm15, %ymm15
+; AVX512-FCP-NEXT: vmovdqa64 %ymm7, %ymm21
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm15, %zmm12
+; AVX512-FCP-NEXT: vpandnq %zmm12, %zmm13, %zmm12
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = (zmm14 & zmm13) | zmm12
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm5[0],ymm6[1,2],ymm5[3],ymm6[4],ymm5[5],ymm6[6,7],ymm5[8],ymm6[9,10],ymm5[11],ymm6[12],ymm5[13],ymm6[14,15]
; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm17 = [1,4,6,3,1,4,6,3]
; AVX512-FCP-NEXT: # ymm17 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vpermd %ymm15, %ymm17, %ymm15
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,16,17,30,31,24,25]
+; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm18, %zmm12
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm14, %zmm19
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm11[0,1],ymm10[2],ymm11[3],ymm10[4],ymm11[5,6],ymm10[7],ymm11[8,9],ymm10[10],ymm11[11],ymm10[12],ymm11[13,14],ymm10[15]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3,4],xmm14[5,6,7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5],ymm9[6],ymm8[7,8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13],ymm9[14],ymm8[15]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [0,2,0,0,5,7,2,4]
+; AVX512-FCP-NEXT: vpermd %ymm15, %ymm17, %ymm15
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm1[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u]
+; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm15[2],xmm0[2],xmm15[3],xmm0[3]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [0,3,5,2,5,7,0,0]
; AVX512-FCP-NEXT: vpermd %ymm12, %ymm17, %ymm12
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,16,17,30,31,24,25]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm12[5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm15, %zmm17
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm15
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm15[0,1,2],xmm6[3,4],xmm15[5,6,7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,8,9,2,3,12,13,6,7,0,1,10,11]
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm3[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u]
-; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm15[2],xmm2[2],xmm15[3],xmm2[3]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [0,3,5,2,5,7,0,0]
-; AVX512-FCP-NEXT: vpermd %ymm12, %ymm19, %ymm12
; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,6,7,2,3,2,3,4,5,10,11,0,1,14,15,16,17,22,23,18,19,18,19,20,21,26,27,16,17,30,31]
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1,2],xmm12[3,4,5,6,7]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3,4],ymm6[5,6,7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0,1],ymm10[2],ymm11[3],ymm10[4],ymm11[5,6],ymm10[7],ymm11[8,9],ymm10[10],ymm11[11],ymm10[12],ymm11[13,14],ymm10[15]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm15
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm15[3,4],xmm12[5,6,7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5],ymm9[6],ymm8[7,8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13],ymm9[14],ymm8[15]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [0,2,0,0,5,7,2,4]
-; AVX512-FCP-NEXT: vpermd %ymm15, %ymm19, %ymm15
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm15[3,4,5,6,7]
; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [1,4,6,0,1,4,6,0]
; AVX512-FCP-NEXT: # ymm15 = mem[0,1,0,1]
-; AVX512-FCP-NEXT: vpermd %ymm7, %ymm15, %ymm15
-; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm15, %ymm14
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm18 & (zmm14 ^ zmm12))
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm14, %zmm14
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm10[0],ymm11[1,2],ymm10[3],ymm11[4],ymm10[5],ymm11[6,7],ymm10[8],ymm11[9,10],ymm10[11],ymm11[12],ymm10[13],ymm11[14,15]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm12
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm12[0],xmm6[1],xmm12[2],xmm6[3]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15]
+; AVX512-FCP-NEXT: vpermd %ymm4, %ymm15, %ymm15
+; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm7
+; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm15, %ymm15
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm15, %zmm12
+; AVX512-FCP-NEXT: vpandnq %zmm12, %zmm13, %zmm12
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = (zmm14 & zmm13) | zmm12
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm15
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3,4],xmm15[5,6,7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,8,9,2,3,12,13,6,7,0,1,10,11]
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm18, %zmm12
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm14, %zmm13
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm12 = [4,1,6,3]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm10[0],ymm11[1,2],ymm10[3],ymm11[4],ymm10[5],ymm11[6,7],ymm10[8],ymm11[9,10],ymm10[11],ymm11[12],ymm10[13],ymm11[14,15]
+; AVX512-FCP-NEXT: vpermd %ymm14, %ymm12, %ymm12
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15]
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,3,0,0,5,0,2,7]
-; AVX512-FCP-NEXT: vpermd %ymm12, %ymm15, %ymm12
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm12[3,4,5,6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm2[0],xmm3[1],xmm2[2,3]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [1,3,6,0,5,0,0,0]
-; AVX512-FCP-NEXT: vpermd %ymm15, %ymm18, %ymm15
+; AVX512-FCP-NEXT: vpermd %ymm14, %ymm15, %ymm14
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm14[3,4,5,6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [1,3,6,0,5,0,0,0]
+; AVX512-FCP-NEXT: vpermd %ymm15, %ymm17, %ymm15
; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[2,3,2,3,4,5,0,1,6,7,8,9,14,15,4,5,18,19,18,19,20,21,16,17,22,23,24,25,30,31,20,21]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm15[3,4,5,6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4,5,6,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3,4,5,6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7]
; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [2,4,7,0,2,4,7,0]
; AVX512-FCP-NEXT: # ymm15 = mem[0,1,0,1]
-; AVX512-FCP-NEXT: vpermd %ymm7, %ymm15, %ymm15
-; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm15, %ymm13
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm12
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm6))
-; AVX512-FCP-NEXT: vextracti64x4 $1, %zmm12, %ymm6
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm15
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm15[1,2,3],xmm13[4,5],xmm15[6,7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,0,1,10,11,4,5,14,15,8,9,2,3,12,13]
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm6[0],ymm13[1,2,3,4,5,6,7],ymm6[8],ymm13[9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm13[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm12, %zmm6
+; AVX512-FCP-NEXT: vpermd %ymm4, %ymm15, %ymm15
+; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm7
+; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm15, %ymm7
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm7, %zmm7
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm12))
+; AVX512-FCP-NEXT: vextracti64x4 $1, %zmm7, %ymm12
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5],ymm6[6],ymm5[7,8],ymm6[9],ymm5[10,11],ymm6[12],ymm5[13],ymm6[14],ymm5[15]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2,3],xmm14[4,5],xmm15[6,7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,1,10,11,4,5,14,15,8,9,2,3,12,13]
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm12[0],ymm14[1,2,3,4,5,6,7],ymm12[8],ymm14[9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm14[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm7, %zmm7
; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15]
; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm11
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3,4],xmm11[5,6,7]
@@ -4190,366 +4250,373 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm8
; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,16,17,22,23,16,17,22,23]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3,4,5,6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,4,6,3,6,0,0,0]
-; AVX512-FCP-NEXT: vpermd %ymm3, %ymm4, %ymm3
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,2,3,4,5,10,11,0,1,14,15,16,17,18,19,20,21,18,19,20,21,26,27,16,17,30,31]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4,5,6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,2,1,3,0,2,5,7]
-; AVX512-FCP-NEXT: vpermd %ymm7, %ymm3, %ymm3
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,4,6,3,6,0,0,0]
+; AVX512-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,2,3,4,5,10,11,0,1,14,15,16,17,18,19,20,21,18,19,20,21,26,27,16,17,30,31]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5,6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,1,3,0,2,5,7]
+; AVX512-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm1
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512-FCP-NEXT: movb $7, %al
; AVX512-FCP-NEXT: kmovw %eax, %k1
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm2 {%k1}
-; AVX512-FCP-NEXT: vextracti64x4 $1, %zmm2, %ymm3
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,2,3,12,13,6,7,0,1,10,11,4,5,14,15]
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm0 {%k1}
+; AVX512-FCP-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10],ymm5[11],ymm6[12,13],ymm5[14],ymm6[15]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6],xmm3[7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,2,3,12,13,6,7,0,1,10,11,4,5,14,15]
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512-FCP-NEXT: vmovdqa64 %zmm16, (%rsi)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm17, (%rdx)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm14, (%rcx)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm6, (%r8)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm19, (%rdx)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm13, (%rcx)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm7, (%r8)
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%r9)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: load_i16_stride5_vf32:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vmovdqa 256(%rdi), %ymm0
-; AVX512DQ-NEXT: vmovdqa 288(%rdi), %ymm1
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15]
+; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm0
+; AVX512DQ-NEXT: vmovdqa 224(%rdi), %ymm1
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,4,5,14,15,8,9,2,3,12,13,6,7]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm4
-; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm3
-; AVX512DQ-NEXT: vmovdqa 224(%rdi), %ymm9
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm9[0],ymm3[1],ymm9[2,3],ymm3[4],ymm9[5],ymm3[6],ymm9[7,8],ymm3[9],ymm9[10,11],ymm3[12],ymm9[13],ymm3[14],ymm9[15]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm6
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3],ymm5[4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,8,9,2,3,12,13,6,7,16,17,26,27,26,27,30,31,24,25,18,19,28,29,22,23]
-; AVX512DQ-NEXT: vpshufb %ymm7, %ymm5, %ymm5
-; AVX512DQ-NEXT: vmovdqa64 176(%rdi), %xmm20
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm20[3,1,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa 160(%rdi), %xmm6
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[0,2,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,3,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm10[0],xmm8[0],xmm10[1],xmm8[1]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3,4,5,6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm14 = ymm5[0,1,2,3,4],ymm4[5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4,5,6,7]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,8,9,2,3,12,13,6,7,16,17,26,27,26,27,30,31,24,25,18,19,28,29,22,23]
+; AVX512DQ-NEXT: vpshufb %ymm3, %ymm2, %ymm4
+; AVX512DQ-NEXT: vmovdqa 176(%rdi), %xmm2
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[3,1,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[2,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vmovdqa 160(%rdi), %xmm5
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[0,2,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,3,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3,4,5,6,7]
+; AVX512DQ-NEXT: vmovdqa 128(%rdi), %xmm6
+; AVX512DQ-NEXT: vmovdqa 144(%rdi), %xmm7
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm8 = xmm6[0],xmm7[1],xmm6[2,3]
+; AVX512DQ-NEXT: vpshufb %xmm3, %xmm8, %xmm3
+; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm10
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm8
-; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm10
-; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm4
-; AVX512DQ-NEXT: vmovdqa 96(%rdi), %ymm5
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15]
+; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm9
+; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm3
+; AVX512DQ-NEXT: vmovdqa 96(%rdi), %ymm4
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm3[0],ymm4[1,2],ymm3[3],ymm4[4],ymm3[5],ymm4[6,7],ymm3[8],ymm4[9,10],ymm3[11],ymm4[12],ymm3[13],ymm4[14,15]
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm12[5],ymm11[6],ymm12[7]
; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[6,7,16,17,26,27,20,21,30,31,24,25],zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm12 = ymm8[0],ymm10[1],ymm8[2,3],ymm10[4],ymm8[5],ymm10[6],ymm8[7,8],ymm10[9],ymm8[10,11],ymm10[12],ymm8[13],ymm10[14],ymm8[15]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm12 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5],ymm9[6],ymm8[7,8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13],ymm9[14],ymm8[15]
; AVX512DQ-NEXT: vextracti128 $1, %ymm12, %xmm13
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1,2,3],xmm12[4,5],xmm13[6,7]
; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpor %ymm11, %ymm12, %ymm15
-; AVX512DQ-NEXT: vmovdqa 144(%rdi), %xmm11
-; AVX512DQ-NEXT: vmovdqa 128(%rdi), %xmm12
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm13 = xmm12[0],xmm11[1],xmm12[2,3]
-; AVX512DQ-NEXT: vpshufb %xmm7, %xmm13, %xmm7
-; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm18 & (zmm7 ^ zmm15))
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm14, %zmm7, %zmm16
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0],ymm9[1],ymm3[2,3],ymm9[4],ymm3[5],ymm9[6],ymm3[7,8],ymm9[9],ymm3[10,11],ymm9[12],ymm3[13],ymm9[14],ymm3[15]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm14
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm14[1],ymm7[2],ymm14[3],ymm7[4,5,6,7]
+; AVX512DQ-NEXT: vpor %ymm11, %ymm12, %ymm11
+; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm11 = zmm10 ^ (zmm16 & (zmm11 ^ zmm10))
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm11, %ymm13
+; AVX512DQ-NEXT: vmovdqa 256(%rdi), %ymm10
+; AVX512DQ-NEXT: vmovdqa 288(%rdi), %ymm12
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm15 = ymm12[0,1],ymm10[2],ymm12[3],ymm10[4],ymm12[5,6],ymm10[7],ymm12[8,9],ymm10[10],ymm12[11],ymm10[12],ymm12[13,14],ymm10[15]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm15, %xmm14
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3,4],xmm15[5,6,7]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,4,5,14,15,8,9,2,3,12,13,6,7]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5,6,7]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm13, %zmm11, %zmm17
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm13, %xmm14
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4,5,6,7]
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm14 = [2,3,12,13,0,1,0,1,10,11,4,5,14,15,8,9,18,19,28,29,16,17,16,17,26,27,20,21,30,31,24,25]
-; AVX512DQ-NEXT: vpshufb %ymm14, %ymm7, %ymm7
-; AVX512DQ-NEXT: vpsrlq $48, %xmm20, %xmm15
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm13 = xmm6[0,3,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[1,2,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm7[3,4,5,6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm13[0,1,2,3],ymm7[4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm2
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm13, %xmm15
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0],xmm13[1],xmm15[2],xmm13[3]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,6,7,0,1,10,11,4,5,14,15,8,9]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm13[5,6,7]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm13 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm13[2,3,0,1]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm15 = ymm10[0],ymm8[1],ymm10[2],ymm8[3],ymm10[4,5],ymm8[6],ymm10[7,8],ymm8[9],ymm10[10],ymm8[11],ymm10[12,13],ymm8[14],ymm10[15]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm15, %xmm0
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3],xmm15[4,5,6],xmm0[7]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[8,9,18,19,28,29,22,23,16,17,26,27],zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,12,13,6,7,0,1,10,11,4,5,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpor %ymm0, %ymm13, %ymm0
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm13 = xmm12[0,1],xmm11[2],xmm12[3]
+; AVX512DQ-NEXT: vpshufb %ymm14, %ymm13, %ymm13
+; AVX512DQ-NEXT: vpsrlq $48, %xmm2, %xmm15
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm11 = xmm5[0,3,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,2,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm11[0],xmm15[0],xmm11[1],xmm15[1]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm13[3,4,5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm13 = xmm6[0,1],xmm7[2],xmm6[3]
; AVX512DQ-NEXT: vpshufb %xmm14, %xmm13, %xmm13
; AVX512DQ-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (zmm18 & (zmm13 ^ zmm0))
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm13, %zmm19
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm3[1],ymm9[2],ymm3[3],ymm9[4,5],ymm3[6],ymm9[7,8],ymm3[9],ymm9[10],ymm3[11],ymm9[12,13],ymm3[14],ymm9[15]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm7
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [4,5,14,15,12,13,2,3,12,13,6,7,0,1,10,11,20,21,30,31,28,29,18,19,28,29,22,23,16,17,26,27]
-; AVX512DQ-NEXT: vpshufb %ymm7, %ymm0, %ymm0
-; AVX512DQ-NEXT: vmovdqa64 %ymm7, %ymm21
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,1,1,3]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,7]
-; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm15
-; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm7[2],xmm20[2],xmm7[3],xmm20[3]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm0[3,4,5,6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm13
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm13[0,1,2],xmm7[3,4],xmm13[5,6,7]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,8,9,2,3,12,13,6,7,0,1,10,11]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1,2,3,4],ymm7[5,6,7]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1],ymm3[2],ymm9[3],ymm3[4],ymm9[5,6],ymm3[7],ymm9[8,9],ymm3[10],ymm9[11],ymm3[12],ymm9[13,14],ymm3[15]
-; AVX512DQ-NEXT: vmovdqa64 %ymm9, %ymm20
-; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm13
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2],ymm0[3,4,5,6,7]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm11
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm13 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13],ymm3[14],ymm4[15]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm14 = ymm13[2,3,0,1]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5],ymm13[6,7]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[8,9,18,19,28,29,22,23,16,17,26,27],zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm14 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm14, %xmm15
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1],xmm15[2,3],xmm14[4,5,6],xmm15[7]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[2,3,12,13,6,7,0,1,10,11,4,5,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpor %ymm13, %ymm14, %ymm13
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm13 = zmm11 ^ (zmm16 & (zmm13 ^ zmm11))
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm13, %ymm11
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm14 = ymm10[0],ymm12[1,2],ymm10[3],ymm12[4],ymm10[5],ymm12[6,7],ymm10[8],ymm12[9,10],ymm10[11],ymm12[12],ymm10[13],ymm12[14,15]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm14, %xmm15
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2],xmm14[3]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,6,7,0,1,10,11,4,5,14,15,8,9]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5,6,7]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm18
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm11, %xmm14
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm14[1],ymm11[2,3,4,5,6,7]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm14 = [4,5,14,15,12,13,2,3,12,13,6,7,0,1,10,11,20,21,30,31,28,29,18,19,28,29,22,23,16,17,26,27]
+; AVX512DQ-NEXT: vpshufb %ymm14, %ymm11, %ymm11
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm15 = xmm5[0,1,1,3]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,6,7]
+; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm15[2],xmm2[2],xmm15[3],xmm2[3]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1,2],xmm11[3,4,5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm15[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm15 = xmm7[0],xmm6[1],xmm7[2,3]
+; AVX512DQ-NEXT: vpshufb %xmm14, %xmm15, %xmm14
+; AVX512DQ-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm14, %zmm11
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm14 = ymm9[0,1],ymm8[2],ymm9[3],ymm8[4],ymm9[5,6],ymm8[7],ymm9[8,9],ymm8[10],ymm9[11],ymm8[12],ymm9[13,14],ymm8[15]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm14, %xmm15
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3,4],xmm14[5,6,7]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm15 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13],ymm4[14],ymm3[15]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm13 = ymm15[2,3,0,1]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5],ymm13[6],ymm15[7]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2],ymm13[3,4,5,6,7]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm13 = zmm11 ^ (zmm16 & (zmm13 ^ zmm11))
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm13, %ymm11
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm14 = ymm12[0],ymm10[1],ymm12[2,3],ymm10[4],ymm12[5],ymm10[6],ymm12[7,8],ymm10[9],ymm12[10,11],ymm10[12],ymm12[13],ymm10[14],ymm12[15]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm14, %xmm15
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3,4],xmm15[5,6,7]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,8,9,2,3,12,13,6,7,0,1,10,11]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5,6,7]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm16
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm11, %xmm13
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm13[2],ymm11[3,4,5,6,7]
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm13 = [6,7,6,7,8,9,4,5,14,15,8,9,2,3,12,13,22,23,22,23,24,25,20,21,30,31,24,25,18,19,28,29]
-; AVX512DQ-NEXT: vpshufb %ymm13, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0],xmm6[1],xmm15[2,3]
-; AVX512DQ-NEXT: vmovdqa64 %xmm15, %xmm22
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm0[3,4,5,6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm14 = xmm11[0,1],xmm12[2],xmm11[3]
-; AVX512DQ-NEXT: vpshufb %xmm13, %xmm14, %xmm13
+; AVX512DQ-NEXT: vpshufb %ymm13, %ymm11, %ymm11
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm15 = xmm2[0],xmm5[1],xmm2[2,3]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1,2],xmm11[3,4,5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm15[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm15 = xmm7[0,1],xmm6[2],xmm7[3]
+; AVX512DQ-NEXT: vpshufb %xmm13, %xmm15, %xmm13
; AVX512DQ-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm13, %zmm0
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm13 = ymm8[0],ymm10[1,2],ymm8[3],ymm10[4],ymm8[5],ymm10[6,7],ymm8[8],ymm10[9,10],ymm8[11],ymm10[12],ymm8[13],ymm10[14,15]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm13, %xmm14
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2],xmm13[3]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm14 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4],ymm14[5],ymm15[6],ymm14[7]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm11
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm13 = ymm8[0],ymm9[1,2],ymm8[3],ymm9[4],ymm8[5],ymm9[6,7],ymm8[8],ymm9[9,10],ymm8[11],ymm9[12],ymm8[13],ymm9[14,15]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm13, %xmm15
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0],xmm13[1],xmm15[2],xmm13[3]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm15 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10],ymm3[11],ymm4[12,13],ymm3[14],ymm4[15]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm14 = ymm15[2,3,0,1]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4],ymm15[5],ymm14[6],ymm15[7]
; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm14 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13],ymm1[14],ymm2[15]
-; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm9
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm13 = zmm11 ^ (mem & (zmm13 ^ zmm11))
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm13, %ymm11
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm14 = ymm10[0],ymm12[1],ymm10[2,3],ymm12[4],ymm10[5],ymm12[6],ymm10[7,8],ymm12[9],ymm10[10,11],ymm12[12],ymm10[13],ymm12[14],ymm10[15]
; AVX512DQ-NEXT: vextracti128 $1, %ymm14, %xmm15
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2,3],xmm14[4,5],xmm15[6,7]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm13 = zmm0 ^ (mem & (zmm13 ^ zmm0))
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm14[u,u,0,1,10,11,4,5,14,15,8,9,2,3,12,13]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm13, %ymm14
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0],ymm0[1,2,3,4,5,6,7],ymm14[8],ymm0[9,10,11,12,13,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm13, %zmm17
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[3,1,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[0,2,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,0,3,4,5,6,7]
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm13 = ymm10[0,1],ymm8[2],ymm10[3],ymm8[4],ymm10[5,6],ymm8[7],ymm10[8,9],ymm8[10],ymm10[11],ymm8[12],ymm10[13,14],ymm8[15]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm13, %xmm14
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3,4],xmm13[5,6,7]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm14 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6],ymm14[7]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3]
-; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm2
-; AVX512DQ-NEXT: vpshufb %xmm2, %xmm11, %xmm11
-; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm18 & (zmm11 ^ zmm13))
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm11, %zmm7
-; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm2
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4],ymm3[5],ymm2[6,7],ymm3[8],ymm2[9,10],ymm3[11],ymm2[12],ymm3[13],ymm2[14,15]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3,4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm3
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm6[2],xmm3[3]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[8,9,14,15,4,5,6,7,0,1,10,11,4,5,14,15,24,25,30,31,20,21,22,23,16,17,26,27,20,21,30,31]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3,4,5,6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0],ymm8[1],ymm10[2,3],ymm8[4],ymm10[5],ymm8[6],ymm10[7,8],ymm8[9],ymm10[10,11],ymm8[12],ymm10[13],ymm8[14],ymm10[15]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6],ymm4[7]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15,24,25,18,19,28,29,22,23,20,21,22,23,20,21,30,31]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,1,10,11,4,5,14,15,8,9,2,3,12,13]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm14 = ymm11[0],ymm14[1,2,3,4,5,6,7],ymm11[8],ymm14[9,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm11
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3,4,5,6,7]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,9,14,15,4,5,6,7,0,1,10,11,4,5,14,15,24,25,30,31,20,21,22,23,16,17,26,27,20,21,30,31]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm5[2],xmm2[3]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4,5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[0,1,2,3,0,1,10,11,8,9,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[3,1,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7]
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13],ymm8[14],ymm9[15]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1],ymm3[2],ymm4[3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8,9],ymm3[10],ymm4[11],ymm3[12],ymm4[13,14],ymm3[15]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6],ymm3[7]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15,24,25,18,19,28,29,22,23,20,21,22,23,20,21,30,31]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
; AVX512DQ-NEXT: movb $7, %al
; AVX512DQ-NEXT: kmovw %eax, %k1
-; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k1}
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2],ymm9[3],ymm1[4,5],ymm9[6],ymm1[7,8],ymm9[9],ymm1[10],ymm9[11],ymm1[12,13],ymm9[14],ymm1[15]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6],xmm2[7]
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,2,3,12,13,6,7,0,1,10,11,4,5,14,15]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 {%k1}
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7,8],ymm10[9],ymm12[10],ymm10[11],ymm12[12,13],ymm10[14],ymm12[15]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6],xmm3[7]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,2,3,12,13,6,7,0,1,10,11,4,5,14,15]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vmovdqa64 %zmm16, (%rsi)
-; AVX512DQ-NEXT: vmovdqa64 %zmm19, (%rdx)
-; AVX512DQ-NEXT: vmovdqa64 %zmm7, (%rcx)
-; AVX512DQ-NEXT: vmovdqa64 %zmm17, (%r8)
+; AVX512DQ-NEXT: vmovdqa64 %zmm17, (%rsi)
+; AVX512DQ-NEXT: vmovdqa64 %zmm18, (%rdx)
+; AVX512DQ-NEXT: vmovdqa64 %zmm16, (%rcx)
+; AVX512DQ-NEXT: vmovdqa64 %zmm11, (%r8)
; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%r9)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: load_i16_stride5_vf32:
; AVX512DQ-FCP: # %bb.0:
-; AVX512DQ-FCP-NEXT: vmovdqa 176(%rdi), %xmm2
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[4,5,14,15,4,5,6,7,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %xmm3
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[0,1,10,11,8,9,10,11,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm4
-; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %ymm5
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15]
+; AVX512DQ-FCP-NEXT: vmovdqa 176(%rdi), %xmm0
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[4,5,14,15,4,5,6,7,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %xmm1
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[0,1,10,11,8,9,10,11,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm2
+; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %ymm3
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15]
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [2,4,7,1,4,6,0,0]
-; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm6, %ymm1
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,8,9,14,15,0,1,6,7,16,17,22,23,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm6, %ymm5
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,8,9,14,15,0,1,6,7,16,17,22,23,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [8,9,3,2,4,0,0,0]
-; AVX512DQ-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm6
-; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa 288(%rdi), %ymm1
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3,4],xmm7[5,6,7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,4,5,14,15,8,9,2,3,12,13,6,7]
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7]
+; AVX512DQ-FCP-NEXT: vpermi2d %ymm4, %ymm5, %ymm6
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,3,1,0,0,3,5,0]
+; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm4
+; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm5, %ymm5
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm5, %ymm5
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm7, %ymm20
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5
+; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT: vpandnq %zmm5, %zmm13, %zmm7
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm10
; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm11
; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm8
; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm9
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm9[1,2],ymm8[3],ymm9[4],ymm8[5],ymm9[6,7],ymm8[8],ymm9[9,10],ymm8[11],ymm9[12],ymm8[13],ymm9[14,15]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [1,0,0,0,4,6,1,3]
-; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm12, %ymm7
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,3,16,17,22,23,24,25,30,31,20,21],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7,8],ymm11[9],ymm10[10,11],ymm11[12],ymm10[13],ymm11[14],ymm10[15]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1,2,3],xmm12[4,5],xmm13[6,7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %ymm7, %ymm12, %ymm12
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,3,1,0,0,3,5,0]
-; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm7
-; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm13, %ymm13
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm13, %ymm13
-; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (zmm18 & (zmm13 ^ zmm12))
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm13, %zmm16
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0],ymm10[1],ymm11[2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10],ymm10[11],ymm11[12,13],ymm10[14],ymm11[15]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3],xmm12[4,5,6],xmm13[7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm12, %ymm12
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13],ymm8[14],ymm9[15]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0],ymm9[1,2],ymm8[3],ymm9[4],ymm8[5],ymm9[6,7],ymm8[8],ymm9[9,10],ymm8[11],ymm9[12],ymm8[13],ymm9[14,15]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,0,0,0,4,6,1,3]
+; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm6, %ymm5
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[2,3,16,17,22,23,24,25,30,31,20,21],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7,8],ymm11[9],ymm10[10,11],ymm11[12],ymm10[13],ymm11[14],ymm10[15]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm12
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm12[1,2,3],xmm6[4,5],xmm12[6,7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %ymm5, %ymm6, %ymm12
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = (zmm12 & zmm13) | zmm7
+; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %ymm5
+; AVX512DQ-FCP-NEXT: vmovdqa 288(%rdi), %ymm6
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm6[0,1],ymm5[2],ymm6[3],ymm5[4],ymm6[5,6],ymm5[7],ymm6[8,9],ymm5[10],ymm6[11],ymm5[12],ymm6[13,14],ymm5[15]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm15, %xmm14
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3,4],xmm15[5,6,7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,4,5,14,15,8,9,2,3,12,13,6,7]
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [8,9,10,11,12,21,22,23]
+; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm18, %zmm7
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm12, %zmm16
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm11[0],ymm10[1],ymm11[2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10],ymm10[11],ymm11[12,13],ymm10[14],ymm11[15]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm12
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm12[2,3],xmm7[4,5,6],xmm12[7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm7, %ymm7
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13],ymm8[14],ymm9[15]
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [2,0,0,0,4,7,1,6]
-; AVX512DQ-FCP-NEXT: vpermd %ymm13, %ymm17, %ymm13
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[0,1,18,19,20,21,26,27,16,17,30,31],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpor %ymm13, %ymm12, %ymm12
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [1,3,2,0,1,3,6,0]
-; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm13, %ymm15
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm15, %ymm15
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (zmm18 & (zmm15 ^ zmm12))
-; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm6
-; AVX512DQ-FCP-NEXT: vpsrlq $48, %xmm2, %xmm12
-; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15]
+; AVX512DQ-FCP-NEXT: vpermd %ymm14, %ymm17, %ymm14
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[0,1,18,19,20,21,26,27,16,17,30,31],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vpor %ymm7, %ymm14, %ymm14
+; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm7
+; AVX512DQ-FCP-NEXT: vpsrlq $48, %xmm0, %xmm12
+; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15]
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [0,2,5,7,4,7,0,0]
; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm17, %ymm12
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[2,3,4,5,4,5,0,1,6,7,8,9,14,15,4,5,18,19,20,21,20,21,16,17,22,23,24,25,30,31,20,21]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm12[3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm12[3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm7[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,3,2,0,1,3,6,0]
+; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm7, %ymm15
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm15, %ymm15
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm7, %ymm21
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm15, %zmm12
+; AVX512DQ-FCP-NEXT: vpandnq %zmm12, %zmm13, %zmm12
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = (zmm14 & zmm13) | zmm12
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm5[0],ymm6[1,2],ymm5[3],ymm6[4],ymm5[5],ymm6[6,7],ymm5[8],ymm6[9,10],ymm5[11],ymm6[12],ymm5[13],ymm6[14,15]
; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm17 = [1,4,6,3,1,4,6,3]
; AVX512DQ-FCP-NEXT: # ymm17 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vpermd %ymm15, %ymm17, %ymm15
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,16,17,30,31,24,25]
+; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm18, %zmm12
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm14, %zmm19
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm11[0,1],ymm10[2],ymm11[3],ymm10[4],ymm11[5,6],ymm10[7],ymm11[8,9],ymm10[10],ymm11[11],ymm10[12],ymm11[13,14],ymm10[15]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3,4],xmm14[5,6,7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5],ymm9[6],ymm8[7,8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13],ymm9[14],ymm8[15]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [0,2,0,0,5,7,2,4]
+; AVX512DQ-FCP-NEXT: vpermd %ymm15, %ymm17, %ymm15
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm1[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm15[2],xmm0[2],xmm15[3],xmm0[3]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [0,3,5,2,5,7,0,0]
; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm17, %ymm12
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,16,17,30,31,24,25]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm12[5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm15, %zmm17
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm15
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm15[0,1,2],xmm6[3,4],xmm15[5,6,7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,8,9,2,3,12,13,6,7,0,1,10,11]
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm3[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm15[2],xmm2[2],xmm15[3],xmm2[3]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [0,3,5,2,5,7,0,0]
-; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm19, %ymm12
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,6,7,2,3,2,3,4,5,10,11,0,1,14,15,16,17,22,23,18,19,18,19,20,21,26,27,16,17,30,31]
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1,2],xmm12[3,4,5,6,7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3,4],ymm6[5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0,1],ymm10[2],ymm11[3],ymm10[4],ymm11[5,6],ymm10[7],ymm11[8,9],ymm10[10],ymm11[11],ymm10[12],ymm11[13,14],ymm10[15]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm15
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm15[3,4],xmm12[5,6,7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5],ymm9[6],ymm8[7,8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13],ymm9[14],ymm8[15]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [0,2,0,0,5,7,2,4]
-; AVX512DQ-FCP-NEXT: vpermd %ymm15, %ymm19, %ymm15
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm15[3,4,5,6,7]
; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [1,4,6,0,1,4,6,0]
; AVX512DQ-FCP-NEXT: # ymm15 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm15, %ymm15
-; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm15, %ymm14
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm18 & (zmm14 ^ zmm12))
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm14, %zmm14
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm10[0],ymm11[1,2],ymm10[3],ymm11[4],ymm10[5],ymm11[6,7],ymm10[8],ymm11[9,10],ymm10[11],ymm11[12],ymm10[13],ymm11[14,15]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm12
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm12[0],xmm6[1],xmm12[2],xmm6[3]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15]
+; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm15, %ymm15
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm7
+; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm15, %ymm15
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm15, %zmm12
+; AVX512DQ-FCP-NEXT: vpandnq %zmm12, %zmm13, %zmm12
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = (zmm14 & zmm13) | zmm12
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm15
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3,4],xmm15[5,6,7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,8,9,2,3,12,13,6,7,0,1,10,11]
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm18, %zmm12
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm14, %zmm13
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm12 = [4,1,6,3]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm10[0],ymm11[1,2],ymm10[3],ymm11[4],ymm10[5],ymm11[6,7],ymm10[8],ymm11[9,10],ymm10[11],ymm11[12],ymm10[13],ymm11[14,15]
+; AVX512DQ-FCP-NEXT: vpermd %ymm14, %ymm12, %ymm12
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15]
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,3,0,0,5,0,2,7]
-; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm15, %ymm12
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm12[3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm2[0],xmm3[1],xmm2[2,3]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [1,3,6,0,5,0,0,0]
-; AVX512DQ-FCP-NEXT: vpermd %ymm15, %ymm18, %ymm15
+; AVX512DQ-FCP-NEXT: vpermd %ymm14, %ymm15, %ymm14
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm14[3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [1,3,6,0,5,0,0,0]
+; AVX512DQ-FCP-NEXT: vpermd %ymm15, %ymm17, %ymm15
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[2,3,2,3,4,5,0,1,6,7,8,9,14,15,4,5,18,19,18,19,20,21,16,17,22,23,24,25,30,31,20,21]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm15[3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7]
; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [2,4,7,0,2,4,7,0]
; AVX512DQ-FCP-NEXT: # ymm15 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm15, %ymm15
-; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm15, %ymm13
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm12
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm6))
-; AVX512DQ-FCP-NEXT: vextracti64x4 $1, %zmm12, %ymm6
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm15
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm15[1,2,3],xmm13[4,5],xmm15[6,7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,0,1,10,11,4,5,14,15,8,9,2,3,12,13]
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm6[0],ymm13[1,2,3,4,5,6,7],ymm6[8],ymm13[9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm13[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm12, %zmm6
+; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm15, %ymm15
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm7
+; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm15, %ymm7
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm7, %zmm7
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm12))
+; AVX512DQ-FCP-NEXT: vextracti64x4 $1, %zmm7, %ymm12
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5],ymm6[6],ymm5[7,8],ymm6[9],ymm5[10,11],ymm6[12],ymm5[13],ymm6[14],ymm5[15]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2,3],xmm14[4,5],xmm15[6,7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,1,10,11,4,5,14,15,8,9,2,3,12,13]
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm12[0],ymm14[1,2,3,4,5,6,7],ymm12[8],ymm14[9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm14[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm7, %zmm7
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15]
; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm11
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3,4],xmm11[5,6,7]
@@ -4559,34 +4626,34 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm8
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,16,17,22,23,16,17,22,23]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,4,6,3,6,0,0,0]
-; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm4, %ymm3
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,2,3,4,5,10,11,0,1,14,15,16,17,18,19,20,21,18,19,20,21,26,27,16,17,30,31]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,2,1,3,0,2,5,7]
-; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm3, %ymm3
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,4,6,3,6,0,0,0]
+; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,2,3,4,5,10,11,0,1,14,15,16,17,18,19,20,21,18,19,20,21,26,27,16,17,30,31]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,1,3,0,2,5,7]
+; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512DQ-FCP-NEXT: movb $7, %al
; AVX512DQ-FCP-NEXT: kmovw %eax, %k1
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm2 {%k1}
-; AVX512DQ-FCP-NEXT: vextracti64x4 $1, %zmm2, %ymm3
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,2,3,12,13,6,7,0,1,10,11,4,5,14,15]
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm0 {%k1}
+; AVX512DQ-FCP-NEXT: vextracti64x4 $1, %zmm0, %ymm1
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10],ymm5[11],ymm6[12,13],ymm5[14],ymm6[15]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6],xmm3[7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,2,3,12,13,6,7,0,1,10,11,4,5,14,15]
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, (%rcx)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, (%r8)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, (%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, (%rcx)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, (%r8)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%r9)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
@@ -5863,13 +5930,13 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7]
; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovdqa 256(%rdi), %xmm2
-; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,1,3]
+; AVX-NEXT: vmovdqa 256(%rdi), %xmm14
+; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[0,1,1,3]
+; AVX-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
-; AVX-NEXT: vmovdqa 272(%rdi), %xmm15
-; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm15[1]
-; AVX-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovdqa 272(%rdi), %xmm13
+; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm13[1]
+; AVX-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovdqa 240(%rdi), %xmm3
; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
@@ -5883,7 +5950,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5,6,7]
; AVX-NEXT: vmovdqa 176(%rdi), %xmm3
-; AVX-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill
+; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
; AVX-NEXT: vmovdqa 160(%rdi), %xmm4
@@ -5909,44 +5976,44 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm12[1]
; AVX-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovdqa 560(%rdi), %xmm0
-; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,2,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7]
; AVX-NEXT: vmovdqa 624(%rdi), %xmm3
; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovdqa 608(%rdi), %xmm0
-; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm3[2,3],xmm0[4,5,6,7]
+; AVX-NEXT: vmovdqa 608(%rdi), %xmm9
+; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0,1],xmm3[2,3],xmm9[4,5,6,7]
+; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpshufb %xmm1, %xmm3, %xmm3
; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7]
; AVX-NEXT: vmovdqa 496(%rdi), %xmm0
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,1,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
-; AVX-NEXT: vmovdqa 480(%rdi), %xmm9
-; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[0,2,2,3]
-; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovdqa 480(%rdi), %xmm10
+; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm10[0,2,2,3]
+; AVX-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; AVX-NEXT: vmovdqa 528(%rdi), %xmm0
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,0,3]
-; AVX-NEXT: vmovdqa 512(%rdi), %xmm13
-; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm13[4],xmm4[5,6,7]
-; AVX-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovdqa 512(%rdi), %xmm7
+; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm7[4],xmm4[5,6,7]
+; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
; AVX-NEXT: vandps %ymm5, %ymm3, %ymm3
-; AVX-NEXT: vmovaps 544(%rdi), %xmm11
-; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm11[0,1,0,1]
-; AVX-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovaps 544(%rdi), %xmm8
+; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm8[0,1,0,1]
+; AVX-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vandnps %ymm4, %ymm5, %ymm4
; AVX-NEXT: vorps %ymm4, %ymm3, %ymm3
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovdqa 96(%rdi), %xmm10
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[0,1,1,3]
-; AVX-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovdqa 96(%rdi), %xmm0
+; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
; AVX-NEXT: vmovdqa 112(%rdi), %xmm0
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -5956,20 +6023,20 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,2,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7]
-; AVX-NEXT: vmovdqa 144(%rdi), %xmm7
-; AVX-NEXT: vmovdqa 128(%rdi), %xmm6
-; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1],xmm7[2,3],xmm6[4,5,6,7]
-; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovdqa 144(%rdi), %xmm3
+; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovdqa 128(%rdi), %xmm0
+; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm3[2,3],xmm0[4,5,6,7]
; AVX-NEXT: vpshufb %xmm1, %xmm3, %xmm3
; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7]
; AVX-NEXT: vmovdqa 16(%rdi), %xmm0
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,1,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,2,2,3]
+; AVX-NEXT: vmovdqa (%rdi), %xmm6
+; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[0,2,2,3]
+; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; AVX-NEXT: vmovdqa 32(%rdi), %xmm0
@@ -5987,11 +6054,11 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vorps %ymm4, %ymm3, %ymm3
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovdqa 464(%rdi), %xmm8
+; AVX-NEXT: vmovdqa 464(%rdi), %xmm2
+; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovdqa 448(%rdi), %xmm0
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm8[2,3],xmm0[4,5,6,7]
-; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7]
; AVX-NEXT: vpshufb %xmm1, %xmm2, %xmm1
; AVX-NEXT: vmovdqa 416(%rdi), %xmm0
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -6025,13 +6092,12 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vandps %ymm5, %ymm2, %ymm2
; AVX-NEXT: vmovaps 384(%rdi), %xmm0
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm0[0,1,0,1]
-; AVX-NEXT: vandnps %ymm3, %ymm5, %ymm3
-; AVX-NEXT: vorps %ymm3, %ymm2, %ymm2
+; AVX-NEXT: vshufps {{.*#+}} xmm15 = xmm0[0,1,0,1]
+; AVX-NEXT: vandnps %ymm15, %ymm5, %ymm15
+; AVX-NEXT: vorps %ymm2, %ymm15, %ymm2
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm1 # 16-byte Folded Reload
-; AVX-NEXT: # xmm1 = xmm15[0,1],mem[2,3],xmm15[4,5,6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0,1],xmm14[2,3],xmm13[4,5,6,7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,0,4,5,6,7]
; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
; AVX-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
@@ -6039,99 +6105,101 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3,4,5,6,7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
-; AVX-NEXT: # xmm3 = mem[0,1,2,3],xmm0[4,5],mem[6,7]
+; AVX-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload
+; AVX-NEXT: # xmm15 = mem[0,1,2,3],xmm0[4,5],mem[6,7]
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9]
-; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm3
-; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3,4],xmm3[5,6,7]
+; AVX-NEXT: vpshufb %xmm2, %xmm15, %xmm15
+; AVX-NEXT: vpblendw {{.*#+}} xmm15 = xmm1[0,1,2,3,4],xmm15[5,6,7]
; AVX-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX-NEXT: # xmm1 = mem[0,3,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,2,2,3,4,5,6,7]
-; AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; AVX-NEXT: vpsrlq $48, %xmm0, %xmm15
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm1[0],xmm15[0],xmm1[1],xmm15[1]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT: vpblendw $204, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5],mem[6,7]
+; AVX-NEXT: vpsrlq $48, %xmm0, %xmm14
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm1[0],xmm14[0],xmm1[1],xmm14[1]
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX-NEXT: vpblendw $204, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm13[0,1],mem[2,3],xmm13[4,5],mem[6,7]
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,0,1,0,1,0,1,10,11,4,5,14,15,6,7]
; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3,4,5,6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1,2],xmm0[3,4,5,6,7]
; AVX-NEXT: vandps %ymm5, %ymm0, %ymm0
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX-NEXT: vpsllq $48, %xmm4, %xmm15
-; AVX-NEXT: vandnps %ymm15, %ymm5, %ymm15
-; AVX-NEXT: vorps %ymm0, %ymm15, %ymm0
-; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX-NEXT: vpsllq $48, %xmm3, %xmm14
+; AVX-NEXT: vandnps %ymm14, %ymm5, %ymm14
+; AVX-NEXT: vorps %ymm0, %ymm14, %ymm0
+; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm12[0,1],mem[2,3],xmm12[4,5,6,7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,0,4,5,6,7]
; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
-; AVX-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
-; AVX-NEXT: # xmm3 = mem[0,3,2,3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3,4,5,6,7]
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; AVX-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm3 # 16-byte Folded Reload
-; AVX-NEXT: # xmm3 = mem[0,1,2,3],xmm14[4,5],mem[6,7]
-; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm3
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[0,3,2,3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7]
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX-NEXT: vpsrlq $48, %xmm4, %xmm15
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1]
+; AVX-NEXT: vpshufd $236, (%rsp), %xmm14 # 16-byte Folded Reload
+; AVX-NEXT: # xmm14 = mem[0,3,2,3]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[1,2,2,3,4,5,6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1],xmm0[2,3,4,5,6,7]
+; AVX-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm14 # 16-byte Folded Reload
+; AVX-NEXT: # xmm14 = xmm9[0,1,2,3],mem[4,5],xmm9[6,7]
+; AVX-NEXT: vpshufb %xmm2, %xmm14, %xmm14
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm14[5,6,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm10[0,3,2,3]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[1,2,2,3,4,5,6,7]
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; AVX-NEXT: vpsrlq $48, %xmm12, %xmm15
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX-NEXT: vpblendw {{.*#+}} xmm15 = xmm13[0,1],xmm9[2,3],xmm13[4,5],xmm9[6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm15 = xmm7[0,1],xmm9[2,3],xmm7[4,5],xmm9[6,7]
; AVX-NEXT: vpshufb %xmm1, %xmm15, %xmm15
-; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm15[3,4,5,6,7]
-; AVX-NEXT: vandps %ymm5, %ymm3, %ymm3
-; AVX-NEXT: vpsllq $48, %xmm11, %xmm15
+; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3,4,5,6,7]
+; AVX-NEXT: vandps %ymm5, %ymm14, %ymm14
+; AVX-NEXT: vpsllq $48, %xmm8, %xmm15
; AVX-NEXT: vandnps %ymm15, %ymm5, %ymm15
-; AVX-NEXT: vorps %ymm3, %ymm15, %ymm3
-; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX-NEXT: vorps %ymm15, %ymm14, %ymm14
+; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm14, %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1],xmm10[2,3],xmm13[4,5,6,7]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,0,4,5,6,7]
-; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
-; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[0,3,2,3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3,4,5,6,7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3],xmm7[4,5],xmm6[6,7]
-; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm3
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[0,3,2,3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; AVX-NEXT: vpsrlq $48, %xmm10, %xmm15
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1]
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0,1],xmm11[2,3],xmm10[4,5,6,7]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,0,4,5,6,7]
+; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm8[0,3,2,3]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[1,2,2,3,4,5,6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1],xmm0[2,3,4,5,6,7]
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm14 # 16-byte Folded Reload
+; AVX-NEXT: # xmm14 = xmm3[0,1,2,3],mem[4,5],xmm3[6,7]
+; AVX-NEXT: vpshufb %xmm2, %xmm14, %xmm14
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm14[5,6,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm6[0,3,2,3]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[1,2,2,3,4,5,6,7]
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX-NEXT: vpsrlq $48, %xmm7, %xmm15
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX-NEXT: vpblendw {{.*#+}} xmm15 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7]
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX-NEXT: vpblendw {{.*#+}} xmm15 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
; AVX-NEXT: vpshufb %xmm1, %xmm15, %xmm15
-; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm15[3,4,5,6,7]
-; AVX-NEXT: vandps %ymm5, %ymm3, %ymm3
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX-NEXT: vpsllq $48, %xmm7, %xmm15
+; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3,4,5,6,7]
+; AVX-NEXT: vandps %ymm5, %ymm14, %ymm14
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX-NEXT: vpsllq $48, %xmm6, %xmm15
; AVX-NEXT: vandnps %ymm15, %ymm5, %ymm15
-; AVX-NEXT: vorps %ymm3, %ymm15, %ymm3
-; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX-NEXT: vorps %ymm15, %ymm14, %ymm14
+; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm14, %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = mem[0,1,2,3],xmm8[4,5],mem[6,7]
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7]
; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm2 # 16-byte Folded Reload
-; AVX-NEXT: # xmm2 = mem[0,1],xmm8[2,3],mem[4,5,6,7]
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
+; AVX-NEXT: # xmm2 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,0,4,5,6,7]
; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
-; AVX-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
-; AVX-NEXT: # xmm3 = mem[0,3,2,3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7]
+; AVX-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
+; AVX-NEXT: # xmm14 = mem[0,3,2,3]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[1,2,2,3,4,5,6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3,4,5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4],xmm0[5,6,7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX-NEXT: vpblendw $204, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
@@ -6140,9 +6208,9 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; AVX-NEXT: # xmm2 = mem[0,3,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7]
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX-NEXT: vpsrlq $48, %xmm3, %xmm3
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; AVX-NEXT: vpsrlq $48, %xmm14, %xmm14
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1]
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5,6,7]
; AVX-NEXT: vandps %ymm5, %ymm1, %ymm1
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
@@ -6161,98 +6229,98 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3,4,5,6,7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload
-; AVX-NEXT: # xmm3 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7]
+; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload
+; AVX-NEXT: # xmm5 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7]
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
-; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm3
-; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3,4],xmm3[5,6,7]
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload
-; AVX-NEXT: # xmm5 = mem[0,1],xmm1[2,3],mem[4,5,6,7]
+; AVX-NEXT: vpshufb %xmm2, %xmm5, %xmm5
+; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0,1,2,3,4],xmm5[5,6,7]
+; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm14 # 16-byte Folded Reload
+; AVX-NEXT: # xmm14 = xmm13[0,1],mem[2,3],xmm13[4,5,6,7]
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,2,3,2,3,2,3,12,13,6,7,12,13,14,15]
-; AVX-NEXT: vpshufb %xmm1, %xmm5, %xmm5
+; AVX-NEXT: vpshufb %xmm1, %xmm14, %xmm14
; AVX-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
; AVX-NEXT: # xmm15 = mem[0,1,1,3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,6,7]
-; AVX-NEXT: vpunpckhdq (%rsp), %xmm15, %xmm15 # 16-byte Folded Reload
+; AVX-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
; AVX-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3]
-; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0,1,2],xmm5[3,4,5],xmm15[6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3,4,5],xmm15[6,7]
; AVX-NEXT: vpshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
; AVX-NEXT: # xmm15 = mem[0,1,2,0]
; AVX-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,6,5]
-; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm15[6,7]
-; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
-; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
-; AVX-NEXT: # xmm3 = mem[0,1,2,3],xmm3[4,5],mem[6,7]
-; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm3
-; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
-; AVX-NEXT: # xmm5 = mem[3,1,2,3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3,4,5,6,7]
-; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm5 # 16-byte Folded Reload
-; AVX-NEXT: # xmm5 = xmm14[0,1],mem[2,3],xmm14[4,5,6,7]
-; AVX-NEXT: vpshufb %xmm2, %xmm5, %xmm5
-; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7]
-; AVX-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm5 # 16-byte Folded Reload
-; AVX-NEXT: # xmm5 = mem[0,1],xmm9[2,3],mem[4,5,6,7]
-; AVX-NEXT: vpshufb %xmm1, %xmm5, %xmm5
+; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5],xmm15[6,7]
+; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm14, %ymm5
+; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
+; AVX-NEXT: # xmm5 = mem[0,1,2,3],xmm5[4,5],mem[6,7]
+; AVX-NEXT: vpshufb %xmm0, %xmm5, %xmm5
+; AVX-NEXT: vpshufd $231, (%rsp), %xmm14 # 16-byte Folded Reload
+; AVX-NEXT: # xmm14 = mem[3,1,2,3]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[2,1,2,3,4,5,6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm14[0,1],xmm5[2,3,4,5,6,7]
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm14 # 16-byte Folded Reload
+; AVX-NEXT: # xmm14 = mem[0,1],xmm13[2,3],mem[4,5,6,7]
+; AVX-NEXT: vpshufb %xmm2, %xmm14, %xmm14
+; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm14[5,6,7]
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm13[0,1],xmm9[2,3],xmm13[4,5,6,7]
+; AVX-NEXT: vpshufb %xmm1, %xmm14, %xmm14
; AVX-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
; AVX-NEXT: # xmm15 = mem[0,1,1,3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,6,7]
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; AVX-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm15[2],xmm14[2],xmm15[3],xmm14[3]
-; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0,1,2],xmm5[3,4,5],xmm15[6,7]
+; AVX-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm15[2],xmm12[2],xmm15[3],xmm12[3]
+; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3,4,5],xmm15[6,7]
; AVX-NEXT: vpshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
; AVX-NEXT: # xmm15 = mem[0,1,2,0]
; AVX-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,6,5]
-; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm15[6,7]
-; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
-; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm3 # 16-byte Folded Reload
-; AVX-NEXT: # xmm3 = xmm13[0,1,2,3],mem[4,5],xmm13[6,7]
-; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm3
-; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm12[3,1,2,3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3,4,5,6,7]
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
-; AVX-NEXT: # xmm5 = mem[0,1],xmm5[2,3],mem[4,5,6,7]
-; AVX-NEXT: vpshufb %xmm2, %xmm5, %xmm5
-; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm4[0,1],xmm6[2,3],xmm4[4,5,6,7]
-; AVX-NEXT: vpshufb %xmm1, %xmm5, %xmm5
-; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm11[0,1,1,3]
+; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5],xmm15[6,7]
+; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm14, %ymm5
+; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0,1,2,3],xmm11[4,5],xmm10[6,7]
+; AVX-NEXT: vpshufb %xmm0, %xmm5, %xmm5
+; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm8[3,1,2,3]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[2,1,2,3,4,5,6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm14[0,1],xmm5[2,3,4,5,6,7]
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm14 # 16-byte Folded Reload
+; AVX-NEXT: # xmm14 = mem[0,1],xmm8[2,3],mem[4,5,6,7]
+; AVX-NEXT: vpshufb %xmm2, %xmm14, %xmm14
+; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm14[5,6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6,7]
+; AVX-NEXT: vpshufb %xmm1, %xmm14, %xmm14
+; AVX-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
+; AVX-NEXT: # xmm15 = mem[0,1,1,3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,6,7]
-; AVX-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm15[2],xmm10[2],xmm15[3],xmm10[3]
-; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0,1,2],xmm5[3,4,5],xmm15[6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm7[0,1,2,0]
+; AVX-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm15[2],xmm7[2],xmm15[3],xmm7[3]
+; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3,4,5],xmm15[6,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[0,1,2,0]
; AVX-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,6,5]
-; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm15[6,7]
-; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
+; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5],xmm15[6,7]
+; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm14, %ymm3
; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
-; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1],xmm13[2,3],xmm12[4,5,6,7]
-; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0,1],xmm13[2,3],xmm12[4,5,6,7]
+; AVX-NEXT: vpshufb %xmm2, %xmm5, %xmm2
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1,2,3],xmm8[4,5],xmm7[6,7]
-; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm7[4,5],xmm6[6,7]
+; AVX-NEXT: vpshufb %xmm0, %xmm5, %xmm0
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[3,1,2,3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3,4,5,6,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[3,1,2,3]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3,4,5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0,1],xmm11[2,3],xmm10[4,5,6,7]
; AVX-NEXT: vpshufb %xmm1, %xmm2, %xmm1
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[0,1,1,3]
-; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,1,1,3]
+; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5],xmm2[6,7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[0,1,2,0]
@@ -6268,227 +6336,232 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vpsrlq $48, %xmm2, %xmm2
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5,6,7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload
-; AVX-NEXT: # xmm3 = xmm2[0,1,2,3],mem[4,5],xmm2[6,7]
+; AVX-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload
+; AVX-NEXT: # xmm5 = xmm2[0,1,2,3],mem[4,5],xmm2[6,7]
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13]
-; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm3
-; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3,4],xmm3[5,6,7]
-; AVX-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload
-; AVX-NEXT: # xmm5 = xmm1[0,1],mem[2,3],xmm1[4,5,6,7]
-; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,0,1,10,11,10,11,8,9,10,11,12,13,14,15]
+; AVX-NEXT: vpshufb %xmm2, %xmm5, %xmm5
+; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0,1,2,3,4],xmm5[5,6,7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload
-; AVX-NEXT: # xmm15 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7]
+; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload
+; AVX-NEXT: # xmm14 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7]
+; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[6,7,0,1,10,11,10,11,8,9,10,11,12,13,14,15]
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; AVX-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
+; AVX-NEXT: # xmm15 = xmm15[0,1,2,3],mem[4,5],xmm15[6,7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[2,2,2,2,4,5,6,7]
; AVX-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,7,4,6,7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm15[3,4,5],xmm5[6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3,4,5],xmm14[6,7]
; AVX-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
; AVX-NEXT: # xmm15 = mem[0,1,0,3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,6]
-; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm15[6,7]
-; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
-; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5],xmm15[6,7]
+; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm14, %ymm5
+; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT: vpblendw $204, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
-; AVX-NEXT: # xmm3 = xmm0[0,1],mem[2,3],xmm0[4,5],mem[6,7]
+; AVX-NEXT: vpblendw $204, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload
+; AVX-NEXT: # xmm5 = xmm0[0,1],mem[2,3],xmm0[4,5],mem[6,7]
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,0,1,10,11,4,5,14,15,14,15,14,15,14,15]
-; AVX-NEXT: vpshufb %xmm1, %xmm3, %xmm3
+; AVX-NEXT: vpshufb %xmm1, %xmm5, %xmm5
+; AVX-NEXT: vmovdqa (%rsp), %xmm14 # 16-byte Reload
+; AVX-NEXT: vpsrlq $48, %xmm14, %xmm14
+; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm14[0],xmm5[1,2,3,4,5,6,7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT: vpsrlq $48, %xmm0, %xmm5
-; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1,2,3,4,5,6,7]
+; AVX-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload
+; AVX-NEXT: # xmm14 = mem[0,1,2,3],xmm0[4,5],mem[6,7]
+; AVX-NEXT: vpshufb %xmm2, %xmm14, %xmm14
+; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm14[5,6,7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload
-; AVX-NEXT: # xmm5 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7]
-; AVX-NEXT: vpshufb %xmm2, %xmm5, %xmm5
-; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7]
-; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm5 # 16-byte Folded Reload
-; AVX-NEXT: # xmm5 = xmm14[0,1],mem[2,3],xmm14[4,5,6,7]
+; AVX-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload
+; AVX-NEXT: # xmm14 = mem[0,1],xmm0[2,3],mem[4,5,6,7]
; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,0,1,10,11,10,11,8,9,10,11,12,13,14,15]
-; AVX-NEXT: vpshufb %xmm0, %xmm5, %xmm5
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; AVX-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm15 # 16-byte Folded Reload
-; AVX-NEXT: # xmm15 = mem[0,1,2,3],xmm14[4,5],mem[6,7]
+; AVX-NEXT: vpshufb %xmm0, %xmm14, %xmm14
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; AVX-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
+; AVX-NEXT: # xmm15 = xmm15[0,1,2,3],mem[4,5],xmm15[6,7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[2,2,2,2,4,5,6,7]
; AVX-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,7,4,6,7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm15[3,4,5],xmm5[6,7]
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[0,1,0,3]
+; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3,4,5],xmm14[6,7]
+; AVX-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
+; AVX-NEXT: # xmm15 = mem[0,1,0,3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,6]
-; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm15[6,7]
-; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
-; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vpblendw $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm3 # 16-byte Folded Reload
-; AVX-NEXT: # xmm3 = mem[0,1],xmm7[2,3],mem[4,5],xmm7[6,7]
-; AVX-NEXT: vpshufb %xmm1, %xmm3, %xmm3
-; AVX-NEXT: vpsrlq $48, %xmm8, %xmm5
-; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1,2,3,4,5,6,7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0,1,2,3],xmm13[4,5],xmm12[6,7]
-; AVX-NEXT: vpshufb %xmm2, %xmm5, %xmm5
-; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm4[0,1],xmm6[2,3],xmm4[4,5,6,7]
-; AVX-NEXT: vpshufb %xmm0, %xmm5, %xmm5
-; AVX-NEXT: vmovdqa %xmm0, %xmm4
+; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5],xmm15[6,7]
+; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm14, %ymm5
+; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm6[2,3],xmm7[4,5],xmm6[6,7]
+; AVX-NEXT: vpshufb %xmm1, %xmm5, %xmm5
+; AVX-NEXT: vpsrlq $48, %xmm8, %xmm14
+; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm14[0],xmm5[1,2,3,4,5,6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm12[0,1,2,3],xmm13[4,5],xmm12[6,7]
+; AVX-NEXT: vpshufb %xmm2, %xmm14, %xmm14
+; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm14[5,6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6,7]
+; AVX-NEXT: vpshufb %xmm0, %xmm14, %xmm14
+; AVX-NEXT: vmovdqa %xmm0, %xmm6
; AVX-NEXT: vpblendw {{.*#+}} xmm15 = xmm10[0,1,2,3],xmm11[4,5],xmm10[6,7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[2,2,2,2,4,5,6,7]
; AVX-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,7,4,6,7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm15[3,4,5],xmm5[6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3,4,5],xmm14[6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm9[0,1,0,3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,6]
-; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm15[6,7]
-; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
+; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5],xmm15[6,7]
+; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm14, %ymm3
; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm3 # 16-byte Folded Reload
-; AVX-NEXT: # xmm3 = xmm13[0,1,2,3],mem[4,5],xmm13[6,7]
-; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm10[0,1],xmm11[2,3],xmm10[4,5],xmm11[6,7]
-; AVX-NEXT: vpshufb %xmm1, %xmm3, %xmm0
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX-NEXT: vpsrlq $48, %xmm7, %xmm3
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3,4,5,6,7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7]
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; AVX-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm5 # 16-byte Folded Reload
+; AVX-NEXT: # xmm5 = mem[0,1,2,3],xmm9[4,5],mem[6,7]
+; AVX-NEXT: vpshufb %xmm2, %xmm5, %xmm2
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1],xmm3[2,3],xmm6[4,5,6,7]
-; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm1
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm8[4,5],xmm4[6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
+; AVX-NEXT: vpshufb %xmm1, %xmm5, %xmm0
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX-NEXT: vpsrlq $48, %xmm8, %xmm5
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0],xmm0[1,2,3,4,5,6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7]
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0,1],xmm13[2,3],xmm10[4,5,6,7]
+; AVX-NEXT: vpshufb %xmm6, %xmm2, %xmm1
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm11[0,1,2,3],xmm6[4,5],xmm11[6,7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7]
; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,4,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4,5],xmm1[6,7]
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[0,1,0,3]
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[0,1,0,3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,6]
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7]
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = mem[3,1,2,3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
-; AVX-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; AVX-NEXT: # xmm1 = mem[0,2,2,3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; AVX-NEXT: # xmm1 = mem[0,3,2,3]
-; AVX-NEXT: vpblendw $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; AVX-NEXT: # xmm1 = xmm1[0,1,2],mem[3],xmm1[4,5,6,7]
-; AVX-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
-; AVX-NEXT: # xmm2 = mem[2,3,2,3]
-; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5,6,7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
-; AVX-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; AVX-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; AVX-NEXT: # xmm1 = xmm1[0,1,2,3],mem[4,5],xmm1[6,7]
-; AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
-; AVX-NEXT: # xmm2 = mem[1,1,1,1]
+; AVX-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = mem[0,3,2,3]
+; AVX-NEXT: vpblendw $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0,1,2],mem[3],xmm0[4,5,6,7]
+; AVX-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; AVX-NEXT: # xmm1 = mem[2,3,2,3]
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3,4,5,6,7]
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
+; AVX-NEXT: # xmm2 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7]
+; AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = mem[1,1,1,1]
; AVX-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
; AVX-NEXT: # xmm5 = mem[0,2,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7]
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,2,3,12,13,12,13,8,9,12,13,12,13,14,15]
-; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[3,4,5],xmm1[6,7]
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
+; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,2,3,12,13,12,13,8,9,12,13,12,13,14,15]
+; AVX-NEXT: vpshufb %xmm0, %xmm2, %xmm2
+; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm5[3,4,5],xmm2[6,7]
; AVX-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
; AVX-NEXT: # xmm5 = mem[0,1,1,3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm5[6,7]
-; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
-; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = mem[3,1,2,3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
-; AVX-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
-; AVX-NEXT: # xmm5 = mem[0,2,2,3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7]
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
-; AVX-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
-; AVX-NEXT: # xmm5 = mem[0,3,2,3]
-; AVX-NEXT: vpblendw $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
-; AVX-NEXT: # xmm5 = xmm5[0,1,2],mem[3],xmm5[4,5,6,7]
-; AVX-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
-; AVX-NEXT: # xmm15 = mem[2,3,2,3]
-; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0],xmm5[1,2,3,4,5,6,7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1,2,3],xmm0[4,5,6,7]
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
-; AVX-NEXT: # xmm5 = xmm5[0,1,2,3],mem[4,5],xmm5[6,7]
-; AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
-; AVX-NEXT: # xmm15 = mem[1,1,1,1]
-; AVX-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
-; AVX-NEXT: # xmm12 = mem[0,2,2,3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,0,3,4,5,6,7]
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1]
-; AVX-NEXT: vpshufb %xmm2, %xmm5, %xmm5
-; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm12[3,4,5],xmm5[6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[0,1,1,3]
-; AVX-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,4,7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm12[6,7]
-; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm13[3,1,2,3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,1,4,5,6,7]
-; AVX-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
-; AVX-NEXT: # xmm12 = mem[0,2,2,3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,0,3,4,5,6,7]
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm12[0],xmm5[0],xmm12[1],xmm5[1]
-; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm10[0,3,2,3]
-; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm11[3],xmm12[4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm7[2,3,2,3]
-; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0],xmm12[1,2,3,4,5,6,7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0,1,2,3],xmm5[4,5,6,7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm6[0,1,2,3],xmm3[4,5],xmm6[6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm4[1,1,1,1]
-; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm8[0,2,2,3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,0,3,4,5,6,7]
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1]
-; AVX-NEXT: vpshufb %xmm2, %xmm12, %xmm12
-; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm14[3,4,5],xmm12[6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm9[0,1,1,3]
-; AVX-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,4,7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5],xmm14[6,7]
-; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm12, %ymm12
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm5
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3,4,5,6,7]
; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
; AVX-NEXT: # xmm5 = mem[3,1,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,1,4,5,6,7]
-; AVX-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
-; AVX-NEXT: # xmm13 = mem[0,2,2,3]
+; AVX-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
+; AVX-NEXT: # xmm14 = mem[0,2,2,3]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,0,3,4,5,6,7]
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm14[0],xmm5[0],xmm14[1],xmm5[1]
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm5[4,5,6,7]
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; AVX-NEXT: # xmm2 = mem[0,3,2,3]
+; AVX-NEXT: vpblendw $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
+; AVX-NEXT: # xmm2 = xmm2[0,1,2],mem[3],xmm2[4,5,6,7]
+; AVX-NEXT: vpshufd $238, (%rsp), %xmm5 # 16-byte Folded Reload
+; AVX-NEXT: # xmm5 = mem[2,3,2,3]
+; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm2[1,2,3,4,5,6,7]
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
+; AVX-NEXT: # xmm5 = mem[0,1,2,3],xmm5[4,5],mem[6,7]
+; AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
+; AVX-NEXT: # xmm14 = mem[1,1,1,1]
+; AVX-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
+; AVX-NEXT: # xmm15 = mem[0,2,2,3]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,1,0,3,4,5,6,7]
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
+; AVX-NEXT: vpshufb %xmm0, %xmm5, %xmm5
+; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm14[3,4,5],xmm5[6,7]
+; AVX-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
+; AVX-NEXT: # xmm14 = mem[0,1,1,3]
+; AVX-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,4,7]
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm14, %ymm14
+; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm14[3,4,5,6,7]
+; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
+; AVX-NEXT: # xmm14 = mem[3,1,2,3]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,2,1,4,5,6,7]
+; AVX-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
+; AVX-NEXT: # xmm15 = mem[0,2,2,3]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,1,0,3,4,5,6,7]
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
+; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm14[4,5,6,7]
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2
+; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,3,2,3]
+; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm4[3],xmm5[4,5,6,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm8[2,3,2,3]
+; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm14[0],xmm5[1,2,3,4,5,6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm10[0,1,2,3],xmm13[4,5],xmm10[6,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm11[1,1,1,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm6[0,2,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,0,3,4,5,6,7]
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1]
-; AVX-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
-; AVX-NEXT: # xmm13 = mem[0,3,2,3]
-; AVX-NEXT: vpblendw $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm11 # 16-byte Folded Reload
-; AVX-NEXT: # xmm11 = xmm13[0,1,2],mem[3],xmm13[4,5,6,7]
-; AVX-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
-; AVX-NEXT: # xmm10 = mem[2,3,2,3]
-; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1,2,3,4,5,6,7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0,1,2,3],xmm5[4,5,6,7]
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
+; AVX-NEXT: vpshufb %xmm0, %xmm14, %xmm13
+; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1,2],xmm12[3,4,5],xmm13[6,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm7[0,1,1,3]
+; AVX-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,4,7]
+; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm10, %ymm10
+; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1,2],ymm10[3,4,5,6,7]
+; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
+; AVX-NEXT: # xmm12 = mem[3,1,2,3]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,2,1,4,5,6,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm9[0,2,2,3]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,0,3,4,5,6,7]
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
+; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm12[4,5,6,7]
+; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm10, %ymm10
+; AVX-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
+; AVX-NEXT: # xmm5 = mem[0,3,2,3]
+; AVX-NEXT: vpblendw $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
+; AVX-NEXT: # xmm5 = xmm5[0,1,2],mem[3],xmm5[4,5,6,7]
+; AVX-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
+; AVX-NEXT: # xmm9 = mem[2,3,2,3]
+; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm9[0],xmm5[1,2,3,4,5,6,7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm8 # 16-byte Folded Reload
-; AVX-NEXT: # xmm8 = xmm3[0,1,2,3],mem[4,5],xmm3[6,7]
-; AVX-NEXT: vpshufb %xmm2, %xmm8, %xmm2
+; AVX-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm7 # 16-byte Folded Reload
+; AVX-NEXT: # xmm7 = xmm3[0,1,2,3],mem[4,5],xmm3[6,7]
+; AVX-NEXT: vpshufb %xmm0, %xmm7, %xmm0
; AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; AVX-NEXT: # xmm4 = mem[1,1,1,1]
-; AVX-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
-; AVX-NEXT: # xmm7 = mem[0,2,2,3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,0,3,4,5,6,7]
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
-; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3,4,5],xmm2[6,7]
-; AVX-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
-; AVX-NEXT: # xmm4 = mem[0,1,1,3]
-; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm4[6,7]
-; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2
+; AVX-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
+; AVX-NEXT: # xmm6 = mem[0,2,2,3]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7]
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3,4,5],xmm0[6,7]
+; AVX-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
+; AVX-NEXT: # xmm3 = mem[0,1,1,3]
+; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7]
+; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7]
+; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
+; AVX-NEXT: # xmm3 = mem[3,1,2,3]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,1,4,5,6,7]
+; AVX-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
+; AVX-NEXT: # xmm4 = mem[0,2,2,3]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7]
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7]
+; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm3, 64(%rsi)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm4, (%rsi)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm4, 96(%rsi)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm4, 32(%rsi)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm3, (%rsi)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm3, 96(%rsi)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm3, 32(%rsi)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm3, 64(%rdx)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
@@ -6513,9 +6586,9 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovaps %ymm3, 96(%r8)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm3, 32(%r8)
-; AVX-NEXT: vmovaps %ymm2, 64(%r9)
-; AVX-NEXT: vmovaps %ymm12, (%r9)
-; AVX-NEXT: vmovaps %ymm0, 96(%r9)
+; AVX-NEXT: vmovaps %ymm0, 64(%r9)
+; AVX-NEXT: vmovaps %ymm10, (%r9)
+; AVX-NEXT: vmovaps %ymm2, 96(%r9)
; AVX-NEXT: vmovaps %ymm1, 32(%r9)
; AVX-NEXT: addq $1032, %rsp # imm = 0x408
; AVX-NEXT: vzeroupper
@@ -6523,978 +6596,972 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
;
; AVX2-LABEL: load_i16_stride5_vf64:
; AVX2: # %bb.0:
-; AVX2-NEXT: subq $1048, %rsp # imm = 0x418
-; AVX2-NEXT: vmovdqa 384(%rdi), %ymm10
-; AVX2-NEXT: vmovdqa 512(%rdi), %ymm4
-; AVX2-NEXT: vmovdqa 480(%rdi), %ymm14
-; AVX2-NEXT: vmovdqa 544(%rdi), %ymm11
+; AVX2-NEXT: subq $984, %rsp # imm = 0x3D8
+; AVX2-NEXT: vmovdqa 384(%rdi), %ymm11
+; AVX2-NEXT: vmovdqa 512(%rdi), %ymm14
+; AVX2-NEXT: vmovdqa 480(%rdi), %ymm15
+; AVX2-NEXT: vmovdqa 544(%rdi), %ymm7
+; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqa 576(%rdi), %ymm8
; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovdqa 192(%rdi), %ymm3
-; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovdqa 160(%rdi), %ymm5
-; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovdqa 192(%rdi), %ymm2
+; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovdqa 160(%rdi), %ymm6
+; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqa 224(%rdi), %ymm0
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqa 256(%rdi), %ymm1
-; AVX2-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill
+; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6],ymm1[7]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23]
-; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5],ymm3[6],ymm5[7,8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13],ymm3[14],ymm5[15]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3],xmm0[4,5],xmm3[6,7]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7]
-; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm3
-; AVX2-NEXT: vpmovsxbw {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,65535,65535,0]
-; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm11[0],ymm8[1,2],ymm11[3],ymm8[4],ymm11[5],ymm8[6,7],ymm11[8],ymm8[9,10],ymm11[11],ymm8[12],ymm11[13],ymm8[14,15]
-; AVX2-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6],ymm3[7]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm4[1],ymm14[2,3],ymm4[4],ymm14[5],ymm4[6],ymm14[7,8],ymm4[9],ymm14[10,11],ymm4[12],ymm14[13],ymm4[14],ymm14[15]
-; AVX2-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovdqa %ymm4, %ymm8
-; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4,5],xmm4[6,7]
-; AVX2-NEXT: vmovdqa 416(%rdi), %ymm13
-; AVX2-NEXT: vpshufb %ymm1, %ymm2, %ymm2
-; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm3
-; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0],ymm13[1,2],ymm10[3],ymm13[4],ymm10[5],ymm13[6,7],ymm10[8],ymm13[9,10],ymm10[11],ymm13[12],ymm10[13],ymm13[14,15]
-; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6],ymm3[7]
-; AVX2-NEXT: vmovdqa 352(%rdi), %ymm4
-; AVX2-NEXT: vmovdqa 320(%rdi), %ymm15
-; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm15[0],ymm4[1],ymm15[2,3],ymm4[4],ymm15[5],ymm4[6],ymm15[7,8],ymm4[9],ymm15[10,11],ymm4[12],ymm15[13],ymm4[14],ymm15[15]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23]
+; AVX2-NEXT: vpshufb %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm2[1],ymm6[2,3],ymm2[4],ymm6[5],ymm2[6],ymm6[7,8],ymm2[9],ymm6[10,11],ymm2[12],ymm6[13],ymm2[14],ymm6[15]
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3],xmm1[4,5],xmm2[6,7]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7]
+; AVX2-NEXT: vpshufb %xmm1, %xmm2, %xmm2
+; AVX2-NEXT: vpmovsxbw {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,65535,65535,0]
+; AVX2-NEXT: vpblendvb %ymm6, %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm8[1,2],ymm7[3],ymm8[4],ymm7[5],ymm8[6,7],ymm7[8],ymm8[9,10],ymm7[11],ymm8[12],ymm7[13],ymm8[14,15]
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6],ymm2[7]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5],ymm14[6],ymm15[7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13],ymm14[14],ymm15[15]
; AVX2-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovdqa %ymm4, %ymm6
-; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4,5],xmm4[6,7]
-; AVX2-NEXT: vpshufb %ymm1, %ymm2, %ymm2
-; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm3
-; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovdqa 64(%rdi), %ymm9
-; AVX2-NEXT: vmovdqa 96(%rdi), %ymm12
-; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0],ymm12[1,2],ymm9[3],ymm12[4],ymm9[5],ymm12[6,7],ymm9[8],ymm12[9,10],ymm9[11],ymm12[12],ymm9[13],ymm12[14,15]
-; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6],ymm3[7]
-; AVX2-NEXT: vpshufb %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vmovdqa (%rdi), %ymm5
-; AVX2-NEXT: vmovdqa 32(%rdi), %ymm7
-; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm7[1],ymm5[2,3],ymm7[4],ymm5[5],ymm7[6],ymm5[7,8],ymm7[9],ymm5[10,11],ymm7[12],ymm5[13],ymm7[14],ymm5[15]
-; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7]
-; AVX2-NEXT: vpshufb %xmm0, %xmm2, %xmm0
-; AVX2-NEXT: vpmovsxbw {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,65535,65535,0]
-; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vmovdqa 416(%rdi), %ymm9
+; AVX2-NEXT: vpshufb %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb %xmm1, %xmm2, %xmm2
+; AVX2-NEXT: vpblendvb %ymm6, %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa %ymm6, %ymm5
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5],mem[6],ymm0[7,8],mem[9],ymm0[10,11],mem[12],ymm0[13],mem[14],ymm0[15]
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2],mem[3],ymm0[4,5],mem[6],ymm0[7,8],mem[9],ymm0[10],mem[11],ymm0[12,13],mem[14],ymm0[15]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6],xmm2[7]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25]
-; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11]
-; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm9[1,2],ymm11[3],ymm9[4],ymm11[5],ymm9[6,7],ymm11[8],ymm9[9,10],ymm11[11],ymm9[12],ymm11[13],ymm9[14,15]
+; AVX2-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6],ymm2[7]
+; AVX2-NEXT: vmovdqa 352(%rdi), %ymm8
+; AVX2-NEXT: vmovdqa 320(%rdi), %ymm10
+; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0],ymm8[1],ymm10[2,3],ymm8[4],ymm10[5],ymm8[6],ymm10[7,8],ymm8[9],ymm10[10,11],ymm8[12],ymm10[13],ymm8[14],ymm10[15]
+; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7]
+; AVX2-NEXT: vpshufb %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb %xmm1, %xmm2, %xmm2
+; AVX2-NEXT: vpblendvb %ymm6, %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovdqa 64(%rdi), %ymm7
+; AVX2-NEXT: vmovdqa 96(%rdi), %ymm12
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm12[1,2],ymm7[3],ymm12[4],ymm7[5],ymm12[6,7],ymm7[8],ymm12[9,10],ymm7[11],ymm12[12],ymm7[13],ymm12[14,15]
+; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6],ymm2[7]
+; AVX2-NEXT: vpshufb %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa (%rdi), %ymm13
+; AVX2-NEXT: vmovdqa 32(%rdi), %ymm6
+; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm13[0],ymm6[1],ymm13[2,3],ymm6[4],ymm13[5],ymm6[6],ymm13[7,8],ymm6[9],ymm13[10,11],ymm6[12],ymm13[13],ymm6[14],ymm13[15]
+; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7]
+; AVX2-NEXT: vpshufb %xmm1, %xmm2, %xmm1
+; AVX2-NEXT: vmovdqa %ymm5, %ymm4
+; AVX2-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5],mem[6],ymm0[7,8],mem[9],ymm0[10,11],mem[12],ymm0[13],mem[14],ymm0[15]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2],mem[3],ymm1[4,5],mem[6],ymm1[7,8],mem[9],ymm1[10],mem[11],ymm1[12,13],mem[14],ymm1[15]
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6],xmm2[7]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25]
+; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11]
+; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm1
+; AVX2-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm1 = mem[0],ymm11[1],mem[2,3],ymm11[4],mem[5],ymm11[6],mem[7,8],ymm11[9],mem[10,11],ymm11[12],mem[13],ymm11[14],mem[15]
+; AVX2-NEXT: vmovdqa %ymm4, %ymm0
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5],mem[6],ymm1[7,8],mem[9],ymm1[10,11],mem[12],ymm1[13],mem[14],ymm1[15]
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm14[1],ymm8[2],ymm14[3],ymm8[4,5],ymm14[6],ymm8[7,8],ymm14[9],ymm8[10],ymm14[11],ymm8[12,13],ymm14[14],ymm8[15]
-; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm8
-; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3],xmm2[4,5,6],xmm8[7]
-; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm1
-; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm11
-; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0],ymm10[1],ymm13[2,3],ymm10[4],ymm13[5],ymm10[6],ymm13[7,8],ymm10[9],ymm13[10,11],ymm10[12],ymm13[13],ymm10[14],ymm13[15]
-; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm1[2,3,0,1]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5],ymm1[6,7]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm6[0],ymm15[1],ymm6[2],ymm15[3],ymm6[4,5],ymm15[6],ymm6[7,8],ymm15[9],ymm6[10],ymm15[11],ymm6[12,13],ymm15[14],ymm6[15]
-; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm10
-; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3],xmm8[4,5,6],xmm10[7]
-; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm1
-; AVX2-NEXT: vpshufb %xmm4, %xmm8, %xmm8
-; AVX2-NEXT: vpblendvb %ymm3, %ymm8, %ymm1, %ymm6
-; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm12[0],ymm9[1],ymm12[2,3],ymm9[4],ymm12[5],ymm9[6],ymm12[7,8],ymm9[9],ymm12[10,11],ymm9[12],ymm12[13],ymm9[14],ymm12[15]
-; AVX2-NEXT: vpermq {{.*#+}} ymm10 = ymm8[2,3,0,1]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm10[5],ymm8[6,7]
-; AVX2-NEXT: vpshufb %ymm0, %ymm8, %ymm0
-; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm7[0],ymm5[1],ymm7[2],ymm5[3],ymm7[4,5],ymm5[6],ymm7[7,8],ymm5[9],ymm7[10],ymm5[11],ymm7[12,13],ymm5[14],ymm7[15]
-; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm10
-; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3],xmm8[4,5,6],xmm10[7]
-; AVX2-NEXT: vpshufb %xmm4, %xmm8, %xmm4
-; AVX2-NEXT: vpblendvb %ymm3, %ymm4, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa 304(%rdi), %xmm9
-; AVX2-NEXT: vmovdqa 288(%rdi), %xmm3
-; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm3[0],xmm9[1],xmm3[2,3]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm14[0],ymm15[1],ymm14[2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7,8],ymm15[9],ymm14[10],ymm15[11],ymm14[12,13],ymm15[14],ymm14[15]
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm4
+; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5,6],xmm4[7]
+; AVX2-NEXT: vpshufb %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2
+; AVX2-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm15
+; AVX2-NEXT: vmovdqa %ymm0, %ymm14
+; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0],ymm11[1],ymm9[2,3],ymm11[4],ymm9[5],ymm11[6],ymm9[7,8],ymm11[9],ymm9[10,11],ymm11[12],ymm9[13],ymm11[14],ymm9[15]
+; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm1[2,3,0,1]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm8[0],ymm10[1],ymm8[2],ymm10[3],ymm8[4,5],ymm10[6],ymm8[7,8],ymm10[9],ymm8[10],ymm10[11],ymm8[12,13],ymm10[14],ymm8[15]
+; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm8
+; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,3],xmm4[4,5,6],xmm8[7]
+; AVX2-NEXT: vpshufb %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4
+; AVX2-NEXT: vpblendvb %ymm0, %ymm4, %ymm1, %ymm2
+; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm12[0],ymm7[1],ymm12[2,3],ymm7[4],ymm12[5],ymm7[6],ymm12[7,8],ymm7[9],ymm12[10,11],ymm7[12],ymm12[13],ymm7[14],ymm12[15]
+; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm4[2,3,0,1]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm8[5],ymm4[6,7]
+; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3
+; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm13[1],ymm6[2],ymm13[3],ymm6[4,5],ymm13[6],ymm6[7,8],ymm13[9],ymm6[10],ymm13[11],ymm6[12,13],ymm13[14],ymm6[15]
+; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm8
+; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,3],xmm4[4,5,6],xmm8[7]
+; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm0
+; AVX2-NEXT: vpblendvb %ymm14, %ymm0, %ymm3, %ymm1
+; AVX2-NEXT: vmovdqa 288(%rdi), %xmm11
+; AVX2-NEXT: vmovdqa 304(%rdi), %xmm10
+; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm11[0],xmm10[1],xmm11[2,3]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7]
+; AVX2-NEXT: vpshufb %xmm0, %xmm4, %xmm4
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0,1,2,3,4],ymm4[5,6,7],ymm3[8,9,10,11,12],ymm4[13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovdqa 608(%rdi), %xmm9
+; AVX2-NEXT: vmovdqa 624(%rdi), %xmm7
+; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm9[0],xmm7[1],xmm9[2,3]
+; AVX2-NEXT: vpshufb %xmm0, %xmm6, %xmm6
+; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
+; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovdqa 448(%rdi), %xmm6
+; AVX2-NEXT: vmovdqa 464(%rdi), %xmm3
+; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm6[0],xmm3[1],xmm6[2,3]
+; AVX2-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT: vmovdqa %xmm3, %xmm4
+; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vpshufb %xmm0, %xmm8, %xmm8
; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm4[0,1,2,3,4],ymm8[5,6,7],ymm4[8,9,10,11,12],ymm8[13,14,15]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
-; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovdqa 624(%rdi), %xmm15
-; AVX2-NEXT: vmovdqa 608(%rdi), %xmm12
-; AVX2-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0],xmm15[1],xmm12[2,3]
-; AVX2-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vpshufb %xmm0, %xmm10, %xmm10
-; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm4[0,1,2,3,4],ymm10[5,6,7],ymm4[8,9,10,11,12],ymm10[13,14,15]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
-; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovdqa 464(%rdi), %xmm10
-; AVX2-NEXT: vmovdqa 448(%rdi), %xmm8
-; AVX2-NEXT: vpblendd {{.*#+}} xmm13 = xmm8[0],xmm10[1],xmm8[2,3]
-; AVX2-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vpshufb %xmm0, %xmm13, %xmm13
-; AVX2-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-NEXT: vpblendw {{.*#+}} ymm13 = ymm4[0,1,2,3,4],ymm13[5,6,7],ymm4[8,9,10,11,12],ymm13[13,14,15]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm13[4,5,6,7]
-; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovdqa 144(%rdi), %xmm5
-; AVX2-NEXT: vmovdqa 128(%rdi), %xmm4
-; AVX2-NEXT: vpblendd {{.*#+}} xmm14 = xmm4[0],xmm5[1],xmm4[2,3]
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1,2,3,4],ymm8[5,6,7],ymm3[8,9,10,11,12],ymm8[13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovdqa 128(%rdi), %xmm5
+; AVX2-NEXT: vmovdqa 144(%rdi), %xmm3
+; AVX2-NEXT: vpblendd {{.*#+}} xmm14 = xmm5[0],xmm3[1],xmm5[2,3]
; AVX2-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vpshufb %xmm0, %xmm14, %xmm0
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm0[5,6,7],ymm7[8,9,10,11,12],ymm0[13,14,15]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovdqa %xmm3, %xmm14
; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm9[2],xmm3[3]
-; AVX2-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9]
-; AVX2-NEXT: vpshufb %xmm7, %xmm0, %xmm0
+; AVX2-NEXT: vpshufb %xmm0, %xmm14, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm0[5,6,7],ymm8[8,9,10,11,12],ymm0[13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm12[0,1],xmm15[2],xmm12[3]
-; AVX2-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vpshufb %xmm7, %xmm0, %xmm0
+; AVX2-NEXT: vmovdqa %xmm10, %xmm8
+; AVX2-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT: vmovdqa %xmm11, %xmm13
+; AVX2-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm11[0,1],xmm10[2],xmm11[3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9]
+; AVX2-NEXT: vpshufb %xmm11, %xmm0, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm0[5,6,7],ymm11[8,9,10,11,12],ymm0[13,14,15]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm0[5,6,7],ymm10[8,9,10,11,12],ymm0[13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1],xmm10[2],xmm8[3]
-; AVX2-NEXT: vmovdqa %xmm8, %xmm10
-; AVX2-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vpshufb %xmm7, %xmm0, %xmm0
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1],xmm7[2],xmm9[3]
+; AVX2-NEXT: vmovdqa %xmm9, %xmm14
+; AVX2-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT: vmovdqa %xmm7, %xmm12
+; AVX2-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT: vpshufb %xmm11, %xmm0, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5,6,7],ymm6[8,9,10,11,12],ymm0[13,14,15]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5,6,7],ymm15[8,9,10,11,12],ymm0[13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm5[2],xmm4[3]
-; AVX2-NEXT: vpshufb %xmm7, %xmm0, %xmm0
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm6[0,1],xmm4[2],xmm6[3]
+; AVX2-NEXT: vpshufb %xmm11, %xmm0, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7],ymm2[8,9,10,11,12],ymm0[13,14,15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX2-NEXT: vpblendw $82, (%rsp), %ymm13, %ymm0 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm0 = ymm13[0],mem[1],ymm13[2,3],mem[4],ymm13[5],mem[6],ymm13[7,8],mem[9],ymm13[10,11],mem[12],ymm13[13],mem[14],ymm13[15]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1],xmm3[2],xmm5[3]
+; AVX2-NEXT: vpshufb %xmm11, %xmm0, %xmm0
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm6[1],ymm10[2,3],ymm6[4],ymm10[5],ymm6[6],ymm10[7,8],ymm6[9],ymm10[10,11],ymm6[12],ymm10[13],ymm6[14],ymm10[15]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7]
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm1 = ymm15[0,1],mem[2],ymm15[3],mem[4],ymm15[5,6],mem[7],ymm15[8,9],mem[10],ymm15[11],mem[12],ymm15[13,14],mem[15]
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3],ymm1[4],mem[5,6],ymm1[7],mem[8,9],ymm1[10],mem[11],ymm1[12],mem[13,14],ymm1[15]
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
-; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm3
+; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
+; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm5
; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
-; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm1
-; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm9[0],xmm14[1],xmm9[2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
-; AVX2-NEXT: vpshufb %xmm1, %xmm6, %xmm6
-; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
+; AVX2-NEXT: vpshufb %xmm0, %xmm2, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2],ymm5[3,4,5,6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0],xmm13[1],xmm8[2,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
+; AVX2-NEXT: vpshufb %xmm2, %xmm7, %xmm7
+; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0,1,2,3,4],ymm7[5,6,7],ymm5[8,9,10,11,12],ymm7[13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm7[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5],mem[6],ymm3[7,8],mem[9],ymm3[10,11],mem[12],ymm3[13],mem[14],ymm3[15]
-; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6],ymm3[7]
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm6 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm6 = ymm11[0,1],mem[2],ymm11[3],mem[4],ymm11[5,6],mem[7],ymm11[8,9],mem[10],ymm11[11],mem[12],ymm11[13,14],mem[15]
-; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7
-; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3,4],xmm6[5,6,7]
-; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm3
-; AVX2-NEXT: vpshufb %xmm0, %xmm6, %xmm6
-; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-NEXT: vpblendd $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm6 # 16-byte Folded Reload
-; AVX2-NEXT: # xmm6 = mem[0],xmm12[1],mem[2,3]
-; AVX2-NEXT: vpshufb %xmm1, %xmm6, %xmm6
-; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
-; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm4[1],ymm14[2,3],ymm4[4],ymm14[5],ymm4[6],ymm14[7,8],ymm4[9],ymm14[10,11],ymm4[12],ymm14[13],ymm4[14],ymm14[15]
-; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6],ymm3[7]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm15[0],ymm3[1],ymm15[2,3],ymm3[4],ymm15[5],ymm3[6],ymm15[7,8],ymm3[9],ymm15[10,11],ymm3[12],ymm15[13],ymm3[14],ymm15[15]
+; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm5[2,3,0,1]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6],ymm5[7]
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm7 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm7 = ymm9[0,1],mem[2],ymm9[3],mem[4],ymm9[5,6],mem[7],ymm9[8,9],mem[10],ymm9[11],mem[12],ymm9[13,14],mem[15]
+; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm11
+; AVX2-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm11[3,4],xmm7[5,6,7]
+; AVX2-NEXT: vpshufb %ymm1, %ymm5, %ymm5
+; AVX2-NEXT: vpshufb %xmm0, %xmm7, %xmm7
+; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3,4,5,6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm7 = xmm12[0],xmm14[1],xmm12[2,3]
+; AVX2-NEXT: vpshufb %xmm2, %xmm7, %xmm7
+; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0,1,2,3,4],ymm7[5,6,7],ymm5[8,9,10,11,12],ymm7[13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm7[4,5,6,7]
+; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5],ymm13[6],ymm14[7,8],ymm13[9],ymm14[10,11],ymm13[12],ymm14[13],ymm13[14],ymm14[15]
+; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm5[2,3,0,1]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6],ymm5[7]
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0,1],ymm8[2],ymm9[3],ymm8[4],ymm9[5,6],ymm8[7],ymm9[8,9],ymm8[10],ymm9[11],ymm8[12],ymm9[13,14],ymm8[15]
-; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7
-; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3,4],xmm6[5,6,7]
-; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm3
-; AVX2-NEXT: vpshufb %xmm0, %xmm6, %xmm6
-; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-NEXT: vpblendd $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm6 # 16-byte Folded Reload
-; AVX2-NEXT: # xmm6 = mem[0],xmm10[1],mem[2,3]
-; AVX2-NEXT: vpshufb %xmm1, %xmm6, %xmm6
-; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
-; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm3 = mem[0],ymm12[1],mem[2,3],ymm12[4],mem[5],ymm12[6],mem[7,8],ymm12[9],mem[10,11],ymm12[12],mem[13],ymm12[14],mem[15]
-; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6],ymm3[7]
-; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1],ymm4[2],ymm8[3],ymm4[4],ymm8[5,6],ymm4[7],ymm8[8,9],ymm4[10],ymm8[11],ymm4[12],ymm8[13,14],ymm4[15]
+; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm11
+; AVX2-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm11[3,4],xmm7[5,6,7]
+; AVX2-NEXT: vpshufb %ymm1, %ymm5, %ymm5
+; AVX2-NEXT: vpshufb %xmm0, %xmm7, %xmm7
+; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3,4,5,6,7]
+; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; AVX2-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm7 # 16-byte Folded Reload
+; AVX2-NEXT: # xmm7 = xmm12[0],mem[1],xmm12[2,3]
+; AVX2-NEXT: vpshufb %xmm2, %xmm7, %xmm7
+; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0,1,2,3,4],ymm7[5,6,7],ymm5[8,9,10,11,12],ymm7[13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7]
+; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm10[0,1],ymm5[2],ymm10[3],ymm5[4],ymm10[5,6],ymm5[7],ymm10[8,9],ymm5[10],ymm10[11],ymm5[12],ymm10[13,14],ymm5[15]
-; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm6
-; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm6[3,4],xmm3[5,6,7]
-; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-NEXT: vpblendd $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
-; AVX2-NEXT: # xmm2 = mem[0],xmm2[1],mem[2,3]
-; AVX2-NEXT: vpshufb %xmm1, %xmm2, %xmm1
+; AVX2-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3],mem[4],ymm5[5],mem[6],ymm5[7,8],mem[9],ymm5[10,11],mem[12],ymm5[13],mem[14],ymm5[15]
+; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm5[2,3,0,1]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6],ymm5[7]
+; AVX2-NEXT: vpshufb %ymm1, %ymm5, %ymm1
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX2-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm5 = ymm11[0,1],mem[2],ymm11[3],mem[4],ymm11[5,6],mem[7],ymm11[8,9],mem[10],ymm11[11],mem[12],ymm11[13,14],mem[15]
+; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm7
+; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3,4],xmm5[5,6,7]
+; AVX2-NEXT: vpshufb %xmm0, %xmm5, %xmm0
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
+; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX2-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload
+; AVX2-NEXT: # xmm1 = xmm5[0],mem[1],xmm5[2,3]
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpblendw $181, (%rsp), %ymm13, %ymm0 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm0 = mem[0],ymm13[1],mem[2],ymm13[3],mem[4,5],ymm13[6],mem[7,8],ymm13[9],mem[10],ymm13[11],mem[12,13],ymm13[14],mem[15]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm10[1],ymm6[2],ymm10[3],ymm6[4,5],ymm10[6],ymm6[7,8],ymm10[9],ymm6[10],ymm10[11],ymm6[12,13],ymm10[14],ymm6[15]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
-; AVX2-NEXT: vpblendw $41, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm1 = mem[0],ymm15[1,2],mem[3],ymm15[4],mem[5],ymm15[6,7],mem[8],ymm15[9,10],mem[11],ymm15[12],mem[13],ymm15[14,15]
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX2-NEXT: vpblendw $214, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm1 = ymm10[0],mem[1,2],ymm10[3],mem[4],ymm10[5],mem[6,7],ymm10[8],mem[9,10],ymm10[11],mem[12],ymm10[13],mem[14,15]
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
-; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm3
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
-; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm1
-; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; AVX2-NEXT: vpblendd $11, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm6 # 16-byte Folded Reload
-; AVX2-NEXT: # xmm6 = mem[0,1],xmm15[2],mem[3]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13]
-; AVX2-NEXT: vpshufb %xmm1, %xmm6, %xmm6
-; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
-; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX2-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm3 = ymm13[0],mem[1],ymm13[2],mem[3],ymm13[4,5],mem[6],ymm13[7,8],mem[9],ymm13[10],mem[11],ymm13[12,13],mem[14],ymm13[15]
-; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4],ymm3[5],ymm6[6],ymm3[7]
-; AVX2-NEXT: vpblendw $41, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm6 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm6 = mem[0],ymm11[1,2],mem[3],ymm11[4],mem[5],ymm11[6,7],mem[8],ymm11[9,10],mem[11],ymm11[12],mem[13],ymm11[14,15]
-; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7
-; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3]
-; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm3
-; AVX2-NEXT: vpshufb %xmm0, %xmm6, %xmm6
-; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; AVX2-NEXT: vpblendd $11, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm6 # 16-byte Folded Reload
-; AVX2-NEXT: # xmm6 = mem[0,1],xmm11[2],mem[3]
-; AVX2-NEXT: vpshufb %xmm1, %xmm6, %xmm6
-; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
-; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm14[1],ymm4[2],ymm14[3],ymm4[4,5],ymm14[6],ymm4[7,8],ymm14[9],ymm4[10],ymm14[11],ymm4[12,13],ymm14[14],ymm4[15]
-; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4],ymm3[5],ymm6[6],ymm3[7]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0],ymm9[1,2],ymm8[3],ymm9[4],ymm8[5],ymm9[6,7],ymm8[8],ymm9[9,10],ymm8[11],ymm9[12],ymm8[13],ymm9[14,15]
-; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7
-; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3]
-; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm3
-; AVX2-NEXT: vpshufb %xmm0, %xmm6, %xmm6
-; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm14[2],xmm8[3]
-; AVX2-NEXT: vpshufb %xmm1, %xmm6, %xmm6
-; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
-; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4,5],ymm9[6],ymm12[7,8],ymm9[9],ymm12[10],ymm9[11],ymm12[12,13],ymm9[14],ymm12[15]
-; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4],ymm3[5],ymm6[6],ymm3[7]
-; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2
-; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm10[1,2],ymm5[3],ymm10[4],ymm5[5],ymm10[6,7],ymm5[8],ymm10[9,10],ymm5[11],ymm10[12],ymm5[13],ymm10[14,15]
-; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm6
-; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2],xmm3[3]
-; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
-; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm10[0,1],xmm12[2],xmm10[3]
-; AVX2-NEXT: vpshufb %xmm1, %xmm2, %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
+; AVX2-NEXT: vpshufb %ymm6, %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX2-NEXT: vpblendd $11, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
+; AVX2-NEXT: # xmm1 = mem[0,1],xmm1[2],mem[3]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13]
+; AVX2-NEXT: vpshufb %xmm7, %xmm1, %xmm1
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm15[1],ymm3[2],ymm15[3],ymm3[4,5],ymm15[6],ymm3[7,8],ymm15[9],ymm3[10],ymm15[11],ymm3[12,13],ymm15[14],ymm3[15]
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0],ymm9[1,2],ymm15[3],ymm9[4],ymm15[5],ymm9[6,7],ymm15[8],ymm9[9,10],ymm15[11],ymm9[12],ymm15[13],ymm9[14,15]
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2],xmm1[3]
+; AVX2-NEXT: vpshufb %ymm6, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX2-NEXT: vpblendd $11, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
+; AVX2-NEXT: # xmm1 = mem[0,1],xmm1[2],mem[3]
+; AVX2-NEXT: vpshufb %xmm7, %xmm1, %xmm1
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovdqa %ymm13, %ymm9
+; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4,5],ymm14[6],ymm13[7,8],ymm14[9],ymm13[10],ymm14[11],ymm13[12,13],ymm14[14],ymm13[15]
+; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4],ymm1[5],ymm3[6],ymm1[7]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm8[1,2],ymm4[3],ymm8[4],ymm4[5],ymm8[6,7],ymm4[8],ymm8[9,10],ymm4[11],ymm8[12],ymm4[13],ymm8[14,15]
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3]
+; AVX2-NEXT: vpshufb %ymm6, %ymm1, %ymm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7]
+; AVX2-NEXT: vpblendd $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm3 # 16-byte Folded Reload
+; AVX2-NEXT: # xmm3 = xmm12[0,1],mem[2],xmm12[3]
+; AVX2-NEXT: vpshufb %xmm7, %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2,3,4],ymm3[5,6,7],ymm1[8,9,10,11,12],ymm3[13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vpblendw $107, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7],mem[8,9],ymm0[10],mem[11],ymm0[12],mem[13,14],ymm0[15]
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4],ymm0[5,6],ymm2[7]
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5],mem[6],ymm2[7,8],mem[9],ymm2[10,11],mem[12],ymm2[13],mem[14],ymm2[15]
-; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7]
-; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15]
-; AVX2-NEXT: # ymm7 = mem[0,1,0,1]
-; AVX2-NEXT: vpshufb %ymm7, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
-; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
-; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
-; AVX2-NEXT: # xmm2 = mem[3,1,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[0,2,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7]
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
-; AVX2-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm2 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm2 = ymm13[0,1],mem[2],ymm13[3],mem[4],ymm13[5,6],mem[7],ymm13[8,9],mem[10],ymm13[11],mem[12],ymm13[13,14],mem[15]
-; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6],ymm4[7]
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3],mem[4],ymm4[5],mem[6],ymm4[7,8],mem[9],ymm4[10,11],mem[12],ymm4[13],mem[14],ymm4[15]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0],ymm12[1],ymm0[2],ymm12[3],ymm0[4,5],ymm12[6],ymm0[7,8],ymm12[9],ymm0[10],ymm12[11],ymm0[12,13],ymm12[14],ymm0[15]
+; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5],ymm4[6],ymm3[7]
+; AVX2-NEXT: vpshufb %ymm6, %ymm3, %ymm3
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0],ymm11[1,2],ymm1[3],ymm11[4],ymm1[5],ymm11[6,7],ymm1[8],ymm11[9,10],ymm1[11],ymm11[12],ymm1[13],ymm11[14,15]
+; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm6
+; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2],xmm4[3]
+; AVX2-NEXT: vpshufb %xmm2, %xmm4, %xmm2
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
+; AVX2-NEXT: vpblendd $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm3 # 16-byte Folded Reload
+; AVX2-NEXT: # xmm3 = xmm5[0,1],mem[2],xmm5[3]
+; AVX2-NEXT: vpshufb %xmm7, %xmm3, %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2,3,4],ymm3[5,6,7],ymm2[8,9,10,11,12],ymm3[13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm3 = mem[0,1],ymm3[2],mem[3],ymm3[4],mem[5,6],ymm3[7],mem[8,9],ymm3[10],mem[11],ymm3[12],mem[13,14],ymm3[15]
+; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6],ymm4[7]
+; AVX2-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm4 = mem[0],ymm10[1],mem[2,3],ymm10[4],mem[5],ymm10[6],mem[7,8],ymm10[9],mem[10,11],ymm10[12],mem[13],ymm10[14],mem[15]
; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
-; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7]
-; AVX2-NEXT: vpshufb %ymm7, %ymm2, %ymm2
-; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm4
-; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
-; AVX2-NEXT: # xmm4 = mem[3,1,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,2,1,4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm11[0,2,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7]
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm4 = mem[0,1],ymm4[2],mem[3],ymm4[4],mem[5,6],ymm4[7],mem[8,9],ymm4[10],mem[11],ymm4[12],mem[13,14],ymm4[15]
-; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6],ymm5[7]
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3],mem[4],ymm5[5],mem[6],ymm5[7,8],mem[9],ymm5[10,11],mem[12],ymm5[13],mem[14],ymm5[15]
-; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
-; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3,4],xmm6[5,6,7]
-; AVX2-NEXT: vpshufb %ymm7, %ymm4, %ymm4
-; AVX2-NEXT: vpshufb %xmm3, %xmm5, %xmm5
-; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[3,1,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,1,4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm14[0,2,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7]
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
-; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
-; AVX2-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm5 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm5 = mem[0,1],ymm9[2],mem[3],ymm9[4],mem[5,6],ymm9[7],mem[8,9],ymm9[10],mem[11],ymm9[12],mem[13,14],ymm9[15]
-; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6],ymm6[7]
-; AVX2-NEXT: vpshufb %ymm7, %ymm5, %ymm5
+; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15]
+; AVX2-NEXT: # ymm11 = mem[0,1,0,1]
+; AVX2-NEXT: vpshufb %ymm11, %ymm3, %ymm3
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
+; AVX2-NEXT: vpshufb %xmm4, %xmm5, %xmm5
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3,4,5,6,7]
+; AVX2-NEXT: vmovdqa 288(%rdi), %xmm6
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,0,1,10,11,8,9,10,11,12,13,14,15]
+; AVX2-NEXT: vpshufb %xmm5, %xmm6, %xmm6
+; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
+; AVX2-NEXT: # xmm7 = mem[3,1,2,3]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,2,1,4,5,6,7]
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
+; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6,7]
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3],mem[4],ymm6[5],mem[6],ymm6[7,8],mem[9],ymm6[10,11],mem[12],ymm6[13],mem[14],ymm6[15]
-; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7
-; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3,4],xmm7[5,6,7]
-; AVX2-NEXT: vpshufb %xmm3, %xmm6, %xmm3
-; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3,4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm10[3,1,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,1,4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm12[0,2,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7]
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
+; AVX2-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm6 = mem[0,1],ymm6[2],mem[3],ymm6[4],mem[5,6],ymm6[7],mem[8,9],ymm6[10],mem[11],ymm6[12],mem[13,14],ymm6[15]
+; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4],ymm6[5,6],ymm7[7]
+; AVX2-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm7 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm7 = mem[0],ymm15[1],mem[2,3],ymm15[4],mem[5],ymm15[6],mem[7,8],ymm15[9],mem[10,11],ymm15[12],mem[13],ymm15[14],mem[15]
+; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm8
+; AVX2-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4],xmm8[5,6,7]
+; AVX2-NEXT: vpshufb %ymm11, %ymm6, %ymm6
+; AVX2-NEXT: vpshufb %xmm4, %xmm7, %xmm7
+; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7]
+; AVX2-NEXT: vmovdqa 608(%rdi), %xmm7
+; AVX2-NEXT: vpshufb %xmm5, %xmm7, %xmm7
+; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
+; AVX2-NEXT: # xmm8 = mem[3,1,2,3]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,2,1,4,5,6,7]
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
+; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0,1],ymm14[2],ymm9[3],ymm14[4],ymm9[5,6],ymm14[7],ymm9[8,9],ymm14[10],ymm9[11],ymm14[12],ymm9[13,14],ymm14[15]
+; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4],ymm7[5,6],ymm8[7]
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm8 = ymm8[0],mem[1],ymm8[2,3],mem[4],ymm8[5],mem[6],ymm8[7,8],mem[9],ymm8[10,11],mem[12],ymm8[13],mem[14],ymm8[15]
+; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm9
+; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3,4],xmm9[5,6,7]
+; AVX2-NEXT: vpshufb %ymm11, %ymm7, %ymm7
+; AVX2-NEXT: vpshufb %xmm4, %xmm8, %xmm8
+; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7]
+; AVX2-NEXT: vmovdqa 448(%rdi), %xmm8
+; AVX2-NEXT: vpshufb %xmm5, %xmm8, %xmm8
+; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
+; AVX2-NEXT: # xmm9 = mem[3,1,2,3]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,2,1,4,5,6,7]
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1]
+; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0,1],ymm12[2],ymm0[3],ymm12[4],ymm0[5,6],ymm12[7],ymm0[8,9],ymm12[10],ymm0[11],ymm12[12],ymm0[13,14],ymm12[15]
+; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4],ymm8[5,6],ymm9[7]
+; AVX2-NEXT: vpshufb %ymm11, %ymm8, %ymm8
+; AVX2-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm9 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5],ymm1[6],mem[7,8],ymm1[9],mem[10,11],ymm1[12],mem[13],ymm1[14],mem[15]
+; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm10
+; AVX2-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3,4],xmm10[5,6,7]
+; AVX2-NEXT: vpshufb %xmm4, %xmm9, %xmm4
+; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm8[3,4,5,6,7]
+; AVX2-NEXT: vmovdqa 128(%rdi), %xmm8
+; AVX2-NEXT: vpshufb %xmm5, %xmm8, %xmm5
+; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
+; AVX2-NEXT: # xmm8 = mem[3,1,2,3]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,2,1,4,5,6,7]
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1]
; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm5, 64(%rsi)
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm5, (%rsi)
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm5, 96(%rsi)
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm5, 32(%rsi)
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm5, 64(%rdx)
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm5, (%rdx)
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm5, 96(%rdx)
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm5, 32(%rdx)
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm5, 64(%rcx)
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm5, (%rcx)
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm5, 96(%rcx)
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm5, 32(%rcx)
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm5, 64(%r8)
-; AVX2-NEXT: vmovdqa %ymm1, (%r8)
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm1, 96(%r8)
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm1, 32(%r8)
-; AVX2-NEXT: vmovdqa %ymm4, 64(%r9)
-; AVX2-NEXT: vmovdqa %ymm3, (%r9)
-; AVX2-NEXT: vmovdqa %ymm2, 96(%r9)
-; AVX2-NEXT: vmovdqa %ymm0, 32(%r9)
-; AVX2-NEXT: addq $1048, %rsp # imm = 0x418
+; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, 64(%rsi)
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, (%rsi)
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, 96(%rsi)
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, 32(%rsi)
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, 64(%rdx)
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, (%rdx)
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, 96(%rdx)
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, 32(%rdx)
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, 64(%rcx)
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, (%rcx)
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, 96(%rcx)
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, 32(%rcx)
+; AVX2-NEXT: vmovdqa %ymm13, 64(%r8)
+; AVX2-NEXT: vmovdqa %ymm2, (%r8)
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, 96(%r8)
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, 32(%r8)
+; AVX2-NEXT: vmovdqa %ymm7, 64(%r9)
+; AVX2-NEXT: vmovdqa %ymm4, (%r9)
+; AVX2-NEXT: vmovdqa %ymm6, 96(%r9)
+; AVX2-NEXT: vmovdqa %ymm3, 32(%r9)
+; AVX2-NEXT: addq $984, %rsp # imm = 0x3D8
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX2-FP-LABEL: load_i16_stride5_vf64:
; AVX2-FP: # %bb.0:
-; AVX2-FP-NEXT: subq $1080, %rsp # imm = 0x438
-; AVX2-FP-NEXT: vmovdqa 384(%rdi), %ymm13
-; AVX2-FP-NEXT: vmovdqa 512(%rdi), %ymm5
-; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vmovdqa 480(%rdi), %ymm6
-; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vmovdqa 544(%rdi), %ymm7
-; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: subq $984, %rsp # imm = 0x3D8
+; AVX2-FP-NEXT: vmovdqa 384(%rdi), %ymm10
+; AVX2-FP-NEXT: vmovdqa 512(%rdi), %ymm3
+; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vmovdqa 480(%rdi), %ymm5
+; AVX2-FP-NEXT: vmovdqa 544(%rdi), %ymm11
; AVX2-FP-NEXT: vmovdqa 576(%rdi), %ymm8
+; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm2
; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm15
-; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm1
-; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm6
+; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm15
; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm0
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12],ymm1[13],ymm0[14,15]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0],ymm0[1,2],ymm15[3],ymm0[4],ymm15[5],ymm0[6,7],ymm15[8],ymm0[9,10],ymm15[11],ymm0[12],ymm15[13],ymm0[14,15]
+; AVX2-FP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6],ymm1[7]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23]
-; AVX2-FP-NEXT: vpshufb %ymm1, %ymm0, %ymm3
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0],ymm2[1],ymm15[2,3],ymm2[4],ymm15[5],ymm2[6],ymm15[7,8],ymm2[9],ymm15[10,11],ymm2[12],ymm15[13],ymm2[14],ymm15[15]
-; AVX2-FP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3],xmm0[4,5],xmm2[6,7]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7]
-; AVX2-FP-NEXT: vpshufb %xmm0, %xmm2, %xmm4
-; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,65535,65535,0]
-; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm4, %ymm3, %ymm2
-; AVX2-FP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm8[1,2],ymm7[3],ymm8[4],ymm7[5],ymm8[6,7],ymm7[8],ymm8[9,10],ymm7[11],ymm8[12],ymm7[13],ymm8[14,15]
-; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6],ymm4[7]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15]
-; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm6
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1,2,3],xmm4[4,5],xmm6[6,7]
-; AVX2-FP-NEXT: vmovdqa 416(%rdi), %ymm14
-; AVX2-FP-NEXT: vpshufb %ymm1, %ymm3, %ymm3
-; AVX2-FP-NEXT: vpshufb %xmm0, %xmm4, %xmm4
-; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm4, %ymm3, %ymm3
-; AVX2-FP-NEXT: vmovdqa %ymm10, %ymm7
-; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm13[0],ymm14[1,2],ymm13[3],ymm14[4],ymm13[5],ymm14[6,7],ymm13[8],ymm14[9,10],ymm13[11],ymm14[12],ymm13[13],ymm14[14,15]
-; AVX2-FP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6],ymm4[7]
-; AVX2-FP-NEXT: vmovdqa 352(%rdi), %ymm12
-; AVX2-FP-NEXT: vmovdqa 320(%rdi), %ymm11
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5],ymm12[6],ymm11[7,8],ymm12[9],ymm11[10,11],ymm12[12],ymm11[13],ymm12[14],ymm11[15]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23]
+; AVX2-FP-NEXT: vpshufb %ymm4, %ymm0, %ymm0
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm2[1],ymm6[2,3],ymm2[4],ymm6[5],ymm2[6],ymm6[7,8],ymm2[9],ymm6[10,11],ymm2[12],ymm6[13],ymm2[14],ymm6[15]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3],xmm1[4,5],xmm2[6,7]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7]
+; AVX2-FP-NEXT: vpshufb %xmm1, %xmm2, %xmm2
+; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} xmm13 = [65535,65535,65535,65535,65535,65535,65535,0]
+; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm2, %ymm0, %ymm0
+; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm8[1,2],ymm11[3],ymm8[4],ymm11[5],ymm8[6,7],ymm11[8],ymm8[9,10],ymm11[11],ymm8[12],ymm11[13],ymm8[14,15]
; AVX2-FP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm6
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1,2,3],xmm4[4,5],xmm6[6,7]
-; AVX2-FP-NEXT: vpshufb %ymm1, %ymm3, %ymm3
-; AVX2-FP-NEXT: vpshufb %xmm0, %xmm4, %xmm4
-; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm4, %ymm3, %ymm2
-; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm9
-; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm10
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0],ymm10[1,2],ymm9[3],ymm10[4],ymm9[5],ymm10[6,7],ymm9[8],ymm10[9,10],ymm9[11],ymm10[12],ymm9[13],ymm10[14,15]
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6],ymm2[7]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5],ymm3[6],ymm5[7,8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13],ymm3[14],ymm5[15]
+; AVX2-FP-NEXT: vmovdqa %ymm5, %ymm7
+; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm5
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1,2,3],xmm2[4,5],xmm5[6,7]
+; AVX2-FP-NEXT: vmovdqa 416(%rdi), %ymm12
+; AVX2-FP-NEXT: vpshufb %ymm4, %ymm0, %ymm0
+; AVX2-FP-NEXT: vpshufb %xmm1, %xmm2, %xmm2
+; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm2, %ymm0, %ymm0
+; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm12[1,2],ymm10[3],ymm12[4],ymm10[5],ymm12[6,7],ymm10[8],ymm12[9,10],ymm10[11],ymm12[12],ymm10[13],ymm12[14,15]
; AVX2-FP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6],ymm2[7]
+; AVX2-FP-NEXT: vmovdqa 352(%rdi), %ymm14
+; AVX2-FP-NEXT: vmovdqa 320(%rdi), %ymm8
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm14[1],ymm8[2,3],ymm14[4],ymm8[5],ymm14[6],ymm8[7,8],ymm14[9],ymm8[10,11],ymm14[12],ymm8[13],ymm14[14],ymm8[15]
+; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm5
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1,2,3],xmm2[4,5],xmm5[6,7]
+; AVX2-FP-NEXT: vpshufb %ymm4, %ymm0, %ymm0
+; AVX2-FP-NEXT: vpshufb %xmm1, %xmm2, %xmm2
+; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm2, %ymm0, %ymm0
+; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm6
+; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm9
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm9[1,2],ymm6[3],ymm9[4],ymm6[5],ymm9[6,7],ymm6[8],ymm9[9,10],ymm6[11],ymm9[12],ymm6[13],ymm9[14,15]
; AVX2-FP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5],ymm3[6],ymm6[7]
-; AVX2-FP-NEXT: vpshufb %ymm1, %ymm3, %ymm1
-; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm4
+; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6],ymm2[7]
+; AVX2-FP-NEXT: vpshufb %ymm4, %ymm0, %ymm0
+; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm3
; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm5
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5],ymm5[6],ymm3[7,8],ymm5[9],ymm3[10,11],ymm5[12],ymm3[13],ymm5[14],ymm3[15]
; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm6
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1,2,3],xmm3[4,5],xmm6[6,7]
-; AVX2-FP-NEXT: vpshufb %xmm0, %xmm3, %xmm0
-; AVX2-FP-NEXT: vmovdqa %ymm7, %ymm2
-; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0
-; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-FP-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5],ymm0[6],mem[7,8],ymm0[9],mem[10,11],ymm0[12],mem[13],ymm0[14],mem[15]
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
-; AVX2-FP-NEXT: vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload
-; AVX2-FP-NEXT: # ymm1 = mem[0],ymm15[1],mem[2],ymm15[3],mem[4,5],ymm15[6],mem[7,8],ymm15[9],mem[10],ymm15[11],mem[12,13],ymm15[14],mem[15]
-; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm3
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6],xmm3[7]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25]
-; AVX2-FP-NEXT: vpshufb %ymm7, %ymm0, %ymm0
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11]
-; AVX2-FP-NEXT: vpshufb %xmm6, %xmm1, %xmm1
-; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm4
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4,5],xmm4[6,7]
+; AVX2-FP-NEXT: vpshufb %xmm1, %xmm2, %xmm1
+; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm1, %ymm0, %ymm0
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload
-; AVX2-FP-NEXT: # ymm0 = ymm8[0],mem[1],ymm8[2,3],mem[4],ymm8[5],mem[6],ymm8[7,8],mem[9],ymm8[10,11],mem[12],ymm8[13],mem[14],ymm8[15]
+; AVX2-FP-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload
+; AVX2-FP-NEXT: # ymm0 = mem[0],ymm15[1],mem[2,3],ymm15[4],mem[5],ymm15[6],mem[7,8],ymm15[9],mem[10,11],ymm15[12],mem[13],ymm15[14],mem[15]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-FP-NEXT: vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload
-; AVX2-FP-NEXT: # ymm1 = mem[0],ymm15[1],mem[2],ymm15[3],mem[4,5],ymm15[6],mem[7,8],ymm15[9],mem[10],ymm15[11],mem[12,13],ymm15[14],mem[15]
-; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm8
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3],xmm1[4,5,6],xmm8[7]
-; AVX2-FP-NEXT: vpshufb %ymm7, %ymm0, %ymm0
-; AVX2-FP-NEXT: vpshufb %xmm6, %xmm1, %xmm1
-; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm3
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5],ymm13[6],ymm14[7,8],ymm13[9],ymm14[10,11],ymm13[12],ymm14[13],ymm13[14],ymm14[15]
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm0[2,3,0,1]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5],ymm0[6,7]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7,8],ymm11[9],ymm12[10],ymm11[11],ymm12[12,13],ymm11[14],ymm12[15]
-; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm11
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm11[2,3],xmm8[4,5,6],xmm11[7]
-; AVX2-FP-NEXT: vpshufb %ymm7, %ymm0, %ymm0
-; AVX2-FP-NEXT: vpshufb %xmm6, %xmm8, %xmm8
-; AVX2-FP-NEXT: vmovdqa %ymm2, %ymm1
-; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm8, %ymm0, %ymm2
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13],ymm9[14],ymm10[15]
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm8[2,3,0,1]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm11[5],ymm8[6,7]
-; AVX2-FP-NEXT: vpshufb %ymm7, %ymm8, %ymm7
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15]
-; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm11
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm11[2,3],xmm8[4,5,6],xmm11[7]
-; AVX2-FP-NEXT: vpshufb %xmm6, %xmm8, %xmm6
-; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm6, %ymm7, %ymm12
-; AVX2-FP-NEXT: vmovdqa 304(%rdi), %xmm1
-; AVX2-FP-NEXT: vmovdqa 288(%rdi), %xmm9
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm9[0],xmm1[1],xmm9[2,3]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7]
-; AVX2-FP-NEXT: vpshufb %xmm0, %xmm7, %xmm7
-; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm4 # 32-byte Reload
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm4[0,1,2,3,4],ymm7[5,6,7],ymm4[8,9,10,11,12],ymm7[13,14,15]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7]
-; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vmovdqa 624(%rdi), %xmm10
-; AVX2-FP-NEXT: vmovdqa 608(%rdi), %xmm8
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0],xmm10[1],xmm8[2,3]
-; AVX2-FP-NEXT: vpshufb %xmm0, %xmm7, %xmm7
-; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm4[0,1,2,3,4],ymm7[5,6,7],ymm4[8,9,10,11,12],ymm7[13,14,15]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7]
-; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vmovdqa 464(%rdi), %xmm6
-; AVX2-FP-NEXT: vmovdqa 448(%rdi), %xmm5
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm5[0],xmm6[1],xmm5[2,3]
-; AVX2-FP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FP-NEXT: vpshufb %xmm0, %xmm7, %xmm7
-; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm4[0,1,2,3,4],ymm7[5,6,7],ymm4[8,9,10,11,12],ymm7[13,14,15]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7]
-; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vmovdqa 144(%rdi), %xmm11
-; AVX2-FP-NEXT: vmovdqa 128(%rdi), %xmm7
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm14 = xmm7[0],xmm11[1],xmm7[2,3]
-; AVX2-FP-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FP-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FP-NEXT: vpshufb %xmm0, %xmm14, %xmm0
-; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7],ymm4[8,9,10,11,12],ymm0[13,14,15]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FP-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload
+; AVX2-FP-NEXT: # ymm1 = ymm15[0],mem[1],ymm15[2],mem[3],ymm15[4,5],mem[6],ymm15[7,8],mem[9],ymm15[10],mem[11],ymm15[12,13],mem[14],ymm15[15]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6],xmm2[7]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25]
+; AVX2-FP-NEXT: vpshufb %ymm4, %ymm0, %ymm0
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11]
+; AVX2-FP-NEXT: vpshufb %xmm1, %xmm2, %xmm2
+; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm2, %ymm0, %ymm0
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vmovdqa %xmm9, %xmm4
+; AVX2-FP-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload
+; AVX2-FP-NEXT: # ymm0 = mem[0],ymm11[1],mem[2,3],ymm11[4],mem[5],ymm11[6],mem[7,8],ymm11[9],mem[10,11],ymm11[12],mem[13],ymm11[14],mem[15]
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7]
+; AVX2-FP-NEXT: vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload
+; AVX2-FP-NEXT: # ymm2 = mem[0],ymm7[1],mem[2],ymm7[3],mem[4,5],ymm7[6],mem[7,8],ymm7[9],mem[10],ymm7[11],mem[12,13],ymm7[14],mem[15]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm7
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm7[2,3],xmm2[4,5,6],xmm7[7]
+; AVX2-FP-NEXT: vpshufb %ymm4, %ymm0, %ymm0
+; AVX2-FP-NEXT: vpshufb %xmm1, %xmm2, %xmm2
+; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm2, %ymm0, %ymm11
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm10[1],ymm12[2,3],ymm10[4],ymm12[5],ymm10[6],ymm12[7,8],ymm10[9],ymm12[10,11],ymm10[12],ymm12[13],ymm10[14],ymm12[15]
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm0[2,3,0,1]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm14[0],ymm8[1],ymm14[2],ymm8[3],ymm14[4,5],ymm8[6],ymm14[7,8],ymm8[9],ymm14[10],ymm8[11],ymm14[12,13],ymm8[14],ymm14[15]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm8
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3],xmm7[4,5,6],xmm8[7]
+; AVX2-FP-NEXT: vpshufb %ymm4, %ymm0, %ymm0
+; AVX2-FP-NEXT: vpshufb %xmm1, %xmm7, %xmm7
+; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm7, %ymm0, %ymm2
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0],ymm6[1],ymm9[2,3],ymm6[4],ymm9[5],ymm6[6],ymm9[7,8],ymm6[9],ymm9[10,11],ymm6[12],ymm9[13],ymm6[14],ymm9[15]
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5],ymm7[6,7]
+; AVX2-FP-NEXT: vpshufb %ymm4, %ymm7, %ymm4
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0],ymm3[1],ymm5[2],ymm3[3],ymm5[4,5],ymm3[6],ymm5[7,8],ymm3[9],ymm5[10],ymm3[11],ymm5[12,13],ymm3[14],ymm5[15]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm8
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3],xmm7[4,5,6],xmm8[7]
+; AVX2-FP-NEXT: vpshufb %xmm1, %xmm7, %xmm1
+; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm1, %ymm4, %ymm7
+; AVX2-FP-NEXT: vmovdqa 288(%rdi), %xmm10
+; AVX2-FP-NEXT: vmovdqa 304(%rdi), %xmm5
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm10[0],xmm5[1],xmm10[2,3]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7]
+; AVX2-FP-NEXT: vpshufb %xmm0, %xmm4, %xmm4
+; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0,1,2,3,4],ymm4[5,6,7],ymm1[8,9,10,11,12],ymm4[13,14,15]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vmovdqa 608(%rdi), %xmm12
+; AVX2-FP-NEXT: vmovdqa 624(%rdi), %xmm4
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm12[0],xmm4[1],xmm12[2,3]
+; AVX2-FP-NEXT: vpshufb %xmm0, %xmm6, %xmm6
+; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm6 = ymm1[0,1,2,3,4],ymm6[5,6,7],ymm1[8,9,10,11,12],ymm6[13,14,15]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7]
+; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vmovdqa 448(%rdi), %xmm9
+; AVX2-FP-NEXT: vmovdqa 464(%rdi), %xmm13
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0],xmm13[1],xmm9[2,3]
; AVX2-FP-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FP-NEXT: vmovdqa %xmm1, %xmm14
+; AVX2-FP-NEXT: vpshufb %xmm0, %xmm8, %xmm8
+; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm1[0,1,2,3,4],ymm8[5,6,7],ymm1[8,9,10,11,12],ymm8[13,14,15]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vmovdqa 128(%rdi), %xmm8
+; AVX2-FP-NEXT: vmovdqa 144(%rdi), %xmm1
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm14 = xmm8[0],xmm1[1],xmm8[2,3]
+; AVX2-FP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1],xmm1[2],xmm9[3]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9]
-; AVX2-FP-NEXT: vpshufb %xmm9, %xmm0, %xmm0
+; AVX2-FP-NEXT: vpshufb %xmm0, %xmm14, %xmm0
; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5,6,7],ymm6[8,9,10,11,12],ymm0[13,14,15]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1],xmm10[2],xmm8[3]
-; AVX2-FP-NEXT: vmovdqa %xmm10, %xmm13
+; AVX2-FP-NEXT: vmovdqa %xmm5, %xmm6
+; AVX2-FP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FP-NEXT: vmovdqa %xmm10, %xmm14
; AVX2-FP-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FP-NEXT: vmovdqa %xmm8, %xmm10
-; AVX2-FP-NEXT: vmovdqa %xmm8, (%rsp) # 16-byte Spill
-; AVX2-FP-NEXT: vpshufb %xmm9, %xmm0, %xmm0
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm5[2],xmm10[3]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9]
+; AVX2-FP-NEXT: vpshufb %xmm10, %xmm0, %xmm0
+; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5,6,7],ymm5[8,9,10,11,12],ymm0[13,14,15]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm12[0,1],xmm4[2],xmm12[3]
+; AVX2-FP-NEXT: vmovdqa %xmm12, %xmm5
+; AVX2-FP-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FP-NEXT: vpshufb %xmm10, %xmm0, %xmm0
; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7],ymm3[8,9,10,11,12],ymm0[13,14,15]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm0[5,6,7],ymm11[8,9,10,11,12],ymm0[13,14,15]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1],xmm6[2],xmm5[3]
-; AVX2-FP-NEXT: vmovdqa %xmm6, %xmm3
-; AVX2-FP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FP-NEXT: vpshufb %xmm9, %xmm0, %xmm0
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1],xmm13[2],xmm9[3]
+; AVX2-FP-NEXT: vmovdqa %xmm13, (%rsp) # 16-byte Spill
+; AVX2-FP-NEXT: vpshufb %xmm10, %xmm0, %xmm0
; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7],ymm2[8,9,10,11,12],ymm0[13,14,15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0,1],xmm11[2],xmm7[3]
-; AVX2-FP-NEXT: vpshufb %xmm9, %xmm0, %xmm0
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1],xmm1[2],xmm8[3]
+; AVX2-FP-NEXT: vpshufb %xmm10, %xmm0, %xmm0
; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm0[5,6,7],ymm12[8,9,10,11,12],ymm0[13,14,15]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm0[5,6,7],ymm7[8,9,10,11,12],ymm0[13,14,15]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-FP-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload
-; AVX2-FP-NEXT: # ymm0 = mem[0],ymm8[1],mem[2,3],ymm8[4],mem[5],ymm8[6],mem[7,8],ymm8[9],mem[10,11],ymm8[12],mem[13],ymm8[14],mem[15]
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7]
-; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FP-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm1 # 32-byte Folded Reload
-; AVX2-FP-NEXT: # ymm1 = ymm5[0,1],mem[2],ymm5[3],mem[4],ymm5[5,6],mem[7],ymm5[8,9],mem[10],ymm5[11],mem[12],ymm5[13,14],mem[15]
-; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
-; AVX2-FP-NEXT: vpshufb %ymm11, %ymm0, %ymm2
+; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm1[1],ymm12[2,3],ymm1[4],ymm12[5],ymm1[6],ymm12[7,8],ymm1[9],ymm12[10,11],ymm1[12],ymm12[13],ymm1[14],ymm12[15]
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6],ymm0[7]
+; AVX2-FP-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload
+; AVX2-FP-NEXT: # ymm2 = ymm15[0,1],mem[2],ymm15[3],mem[4],ymm15[5,6],mem[7],ymm15[8,9],mem[10],ymm15[11],mem[12],ymm15[13,14],mem[15]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
+; AVX2-FP-NEXT: vpshufb %ymm9, %ymm0, %ymm3
; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
-; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm1
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0],xmm4[1],xmm14[2,3]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
-; AVX2-FP-NEXT: vpshufb %xmm1, %xmm9, %xmm9
-; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm9[5,6,7],ymm2[8,9,10,11,12],ymm9[13,14,15]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm6[1],ymm4[2,3],ymm6[4],ymm4[5],ymm6[6],ymm4[7,8],ymm6[9],ymm4[10,11],ymm6[12],ymm4[13],ymm6[14],ymm4[15]
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm2[2,3,0,1]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm9[6],ymm2[7]
-; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm7[0,1],ymm15[2],ymm7[3],ymm15[4],ymm7[5,6],ymm15[7],ymm7[8,9],ymm15[10],ymm7[11],ymm15[12],ymm7[13,14],ymm15[15]
-; AVX2-FP-NEXT: vextracti128 $1, %ymm9, %xmm12
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm12[3,4],xmm9[5,6,7]
-; AVX2-FP-NEXT: vpshufb %ymm11, %ymm2, %ymm2
-; AVX2-FP-NEXT: vpshufb %xmm0, %xmm9, %xmm9
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm9 = xmm13[0],xmm10[1],xmm13[2,3]
-; AVX2-FP-NEXT: vpshufb %xmm1, %xmm9, %xmm9
-; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm9[5,6,7],ymm2[8,9,10,11,12],ymm9[13,14,15]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vpshufb %xmm0, %xmm2, %xmm2
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm6[0],xmm14[1],xmm6[2,3]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
+; AVX2-FP-NEXT: vpshufb %xmm2, %xmm7, %xmm7
+; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0,1,2,3,4],ymm7[5,6,7],ymm3[8,9,10,11,12],ymm7[13,14,15]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7]
+; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0],ymm15[1],ymm8[2,3],ymm15[4],ymm8[5],ymm15[6],ymm8[7,8],ymm15[9],ymm8[10,11],ymm15[12],ymm8[13],ymm15[14],ymm8[15]
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm7[6],ymm3[7]
+; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm11[0,1],ymm6[2],ymm11[3],ymm6[4],ymm11[5,6],ymm6[7],ymm11[8,9],ymm6[10],ymm11[11],ymm6[12],ymm11[13,14],ymm6[15]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm10
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm10[3,4],xmm7[5,6,7]
+; AVX2-FP-NEXT: vpshufb %ymm9, %ymm3, %ymm3
+; AVX2-FP-NEXT: vpshufb %xmm0, %xmm7, %xmm7
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2],ymm3[3,4,5,6,7]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm4[0],xmm5[1],xmm4[2,3]
+; AVX2-FP-NEXT: vpshufb %xmm2, %xmm7, %xmm7
+; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0,1,2,3,4],ymm7[5,6,7],ymm3[8,9,10,11,12],ymm7[13,14,15]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7]
+; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5],ymm14[6],ymm15[7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13],ymm14[14],ymm15[15]
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm2[2,3,0,1]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm9[6],ymm2[7]
+; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm4[1],ymm14[2,3],ymm4[4],ymm14[5],ymm4[6],ymm14[7,8],ymm4[9],ymm14[10,11],ymm4[12],ymm14[13],ymm4[14],ymm14[15]
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm7[6],ymm3[7]
+; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FP-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm7 # 32-byte Folded Reload
+; AVX2-FP-NEXT: # ymm7 = ymm9[0,1],mem[2],ymm9[3],mem[4],ymm9[5,6],mem[7],ymm9[8,9],mem[10],ymm9[11],mem[12],ymm9[13,14],mem[15]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm10
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm10[3,4],xmm7[5,6,7]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
+; AVX2-FP-NEXT: vpshufb %ymm5, %ymm3, %ymm3
+; AVX2-FP-NEXT: vpshufb %xmm0, %xmm7, %xmm7
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2],ymm3[3,4,5,6,7]
+; AVX2-FP-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm7 # 16-byte Folded Reload
+; AVX2-FP-NEXT: # xmm7 = xmm13[0],mem[1],xmm13[2,3]
+; AVX2-FP-NEXT: vpshufb %xmm2, %xmm7, %xmm7
+; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0,1,2,3,4],ymm7[5,6,7],ymm3[8,9,10,11,12],ymm7[13,14,15]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7]
+; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1],ymm11[2],ymm10[3],ymm11[4],ymm10[5,6],ymm11[7],ymm10[8,9],ymm11[10],ymm10[11],ymm11[12],ymm10[13,14],ymm11[15]
-; AVX2-FP-NEXT: vextracti128 $1, %ymm9, %xmm12
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm12[3,4],xmm9[5,6,7]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
-; AVX2-FP-NEXT: vpshufb %ymm12, %ymm2, %ymm2
-; AVX2-FP-NEXT: vpshufb %xmm0, %xmm9, %xmm9
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm9 = xmm3[0],xmm13[1],xmm3[2,3]
-; AVX2-FP-NEXT: vpshufb %xmm1, %xmm9, %xmm9
-; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm9[5,6,7],ymm2[8,9,10,11,12],ymm9[13,14,15]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FP-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX2-FP-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5],ymm2[6],mem[7,8],ymm2[9],mem[10,11],ymm2[12],mem[13],ymm2[14],mem[15]
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm2[2,3,0,1]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm9[6],ymm2[7]
-; AVX2-FP-NEXT: vpshufb %ymm12, %ymm2, %ymm2
-; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FP-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
-; AVX2-FP-NEXT: # ymm3 = mem[0,1],ymm3[2],mem[3],ymm3[4],mem[5,6],ymm3[7],mem[8,9],ymm3[10],mem[11],ymm3[12],mem[13,14],ymm3[15]
-; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm9
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm9[3,4],xmm3[5,6,7]
-; AVX2-FP-NEXT: vpshufb %xmm0, %xmm3, %xmm0
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-FP-NEXT: vpblendd $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
-; AVX2-FP-NEXT: # xmm2 = mem[0],xmm2[1],mem[2,3]
-; AVX2-FP-NEXT: vpshufb %xmm1, %xmm2, %xmm1
+; AVX2-FP-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm3 # 32-byte Folded Reload
+; AVX2-FP-NEXT: # ymm3 = mem[0],ymm10[1],mem[2,3],ymm10[4],mem[5],ymm10[6],mem[7,8],ymm10[9],mem[10,11],ymm10[12],mem[13],ymm10[14],mem[15]
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm7[6],ymm3[7]
+; AVX2-FP-NEXT: vpshufb %ymm5, %ymm3, %ymm3
+; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX2-FP-NEXT: # ymm5 = ymm5[0,1],mem[2],ymm5[3],mem[4],ymm5[5,6],mem[7],ymm5[8,9],mem[10],ymm5[11],mem[12],ymm5[13,14],mem[15]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm7
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3,4],xmm5[5,6,7]
+; AVX2-FP-NEXT: vpshufb %xmm0, %xmm5, %xmm0
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7]
+; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX2-FP-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
+; AVX2-FP-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3]
+; AVX2-FP-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2,3,4],ymm2[5,6,7],ymm0[8,9,10,11,12],ymm2[13,14,15]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm12[1],ymm1[2],ymm12[3],ymm1[4,5],ymm12[6],ymm1[7,8],ymm12[9],ymm1[10],ymm12[11],ymm1[12,13],ymm12[14],ymm1[15]
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7]
+; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX2-FP-NEXT: vpblendw $214, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm2 # 32-byte Folded Reload
+; AVX2-FP-NEXT: # ymm2 = ymm13[0],mem[1,2],ymm13[3],mem[4],ymm13[5],mem[6,7],ymm13[8],mem[9,10],ymm13[11],mem[12],ymm13[13],mem[14,15]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2],xmm2[3]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
+; AVX2-FP-NEXT: vpshufb %ymm7, %ymm0, %ymm0
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
+; AVX2-FP-NEXT: vpshufb %xmm2, %xmm3, %xmm3
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX2-FP-NEXT: vpblendd $11, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
+; AVX2-FP-NEXT: # xmm1 = mem[0,1],xmm1[2],mem[3]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13]
+; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm1
; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload
-; AVX2-FP-NEXT: # ymm0 = ymm8[0],mem[1],ymm8[2],mem[3],ymm8[4,5],mem[6],ymm8[7,8],mem[9],ymm8[10],mem[11],ymm8[12,13],mem[14],ymm8[15]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0],ymm8[1],ymm15[2],ymm8[3],ymm15[4,5],ymm8[6],ymm15[7,8],ymm8[9],ymm15[10],ymm8[11],ymm15[12,13],ymm8[14],ymm15[15]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
-; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm5[1,2],ymm8[3],ymm5[4],ymm8[5],ymm5[6,7],ymm8[8],ymm5[9,10],ymm8[11],ymm5[12],ymm8[13],ymm5[14,15]
-; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
-; AVX2-FP-NEXT: vpshufb %ymm3, %ymm0, %ymm2
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
-; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm1
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-FP-NEXT: vpblendd $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm9 # 16-byte Folded Reload
-; AVX2-FP-NEXT: # xmm9 = xmm1[0,1],mem[2],xmm1[3]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13]
-; AVX2-FP-NEXT: vpshufb %xmm1, %xmm9, %xmm9
-; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm9[5,6,7],ymm2[8,9,10,11,12],ymm9[13,14,15]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7,8],ymm4[9],ymm6[10],ymm4[11],ymm6[12,13],ymm4[14],ymm6[15]
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm2[2,3,0,1]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4],ymm2[5],ymm9[6],ymm2[7]
-; AVX2-FP-NEXT: vpblendw $41, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload
-; AVX2-FP-NEXT: # ymm9 = mem[0],ymm7[1,2],mem[3],ymm7[4],mem[5],ymm7[6,7],mem[8],ymm7[9,10],mem[11],ymm7[12],mem[13],ymm7[14,15]
-; AVX2-FP-NEXT: vextracti128 $1, %ymm9, %xmm12
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm9 = xmm12[0],xmm9[1],xmm12[2],xmm9[3]
-; AVX2-FP-NEXT: vpshufb %ymm3, %ymm2, %ymm2
-; AVX2-FP-NEXT: vpshufb %xmm0, %xmm9, %xmm9
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX2-FP-NEXT: vpblendd $4, (%rsp), %xmm4, %xmm9 # 16-byte Folded Reload
-; AVX2-FP-NEXT: # xmm9 = xmm4[0,1],mem[2],xmm4[3]
-; AVX2-FP-NEXT: vpshufb %xmm1, %xmm9, %xmm9
-; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm9[5,6,7],ymm2[8,9,10,11,12],ymm9[13,14,15]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm14[0],ymm15[1],ymm14[2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7,8],ymm15[9],ymm14[10],ymm15[11],ymm14[12,13],ymm15[14],ymm14[15]
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm2[2,3,0,1]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4],ymm2[5],ymm9[6],ymm2[7]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0],ymm10[1,2],ymm11[3],ymm10[4],ymm11[5],ymm10[6,7],ymm11[8],ymm10[9,10],ymm11[11],ymm10[12],ymm11[13],ymm10[14,15]
-; AVX2-FP-NEXT: vextracti128 $1, %ymm9, %xmm12
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm9 = xmm12[0],xmm9[1],xmm12[2],xmm9[3]
-; AVX2-FP-NEXT: vpshufb %ymm3, %ymm2, %ymm2
-; AVX2-FP-NEXT: vpshufb %xmm0, %xmm9, %xmm9
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm11[1,2],ymm6[3],ymm11[4],ymm6[5],ymm11[6,7],ymm6[8],ymm11[9,10],ymm6[11],ymm11[12],ymm6[13],ymm11[14,15]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2],xmm1[3]
+; AVX2-FP-NEXT: vpshufb %ymm7, %ymm0, %ymm0
+; AVX2-FP-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm9 = xmm15[0,1],xmm13[2],xmm15[3]
-; AVX2-FP-NEXT: vpshufb %xmm1, %xmm9, %xmm9
-; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm9[5,6,7],ymm2[8,9,10,11,12],ymm9[13,14,15]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4,5],ymm7[6],ymm12[7,8],ymm7[9],ymm12[10],ymm7[11],ymm12[12,13],ymm7[14],ymm12[15]
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm2[2,3,0,1]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4],ymm2[5],ymm9[6],ymm2[7]
-; AVX2-FP-NEXT: vpshufb %ymm3, %ymm2, %ymm2
-; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm10[0],ymm11[1,2],ymm10[3],ymm11[4],ymm10[5],ymm11[6,7],ymm10[8],ymm11[9,10],ymm10[11],ymm11[12],ymm10[13],ymm11[14,15]
-; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm9
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm9[0],xmm3[1],xmm9[2],xmm3[3]
-; AVX2-FP-NEXT: vpshufb %xmm0, %xmm3, %xmm0
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm5[2],xmm3[3]
-; AVX2-FP-NEXT: vpshufb %xmm1, %xmm2, %xmm1
+; AVX2-FP-NEXT: vpblendd $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm1 # 16-byte Folded Reload
+; AVX2-FP-NEXT: # xmm1 = xmm15[0,1],mem[2],xmm15[3]
+; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm1
; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm14[1],ymm4[2],ymm14[3],ymm4[4,5],ymm14[6],ymm4[7,8],ymm14[9],ymm4[10],ymm14[11],ymm4[12,13],ymm14[14],ymm4[15]
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4],ymm1[5],ymm3[6],ymm1[7]
+; AVX2-FP-NEXT: vpblendw $41, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload
+; AVX2-FP-NEXT: # ymm3 = mem[0],ymm9[1,2],mem[3],ymm9[4],mem[5],ymm9[6,7],mem[8],ymm9[9,10],mem[11],ymm9[12],mem[13],ymm9[14,15]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3]
+; AVX2-FP-NEXT: vpshufb %ymm7, %ymm1, %ymm1
+; AVX2-FP-NEXT: vpshufb %xmm2, %xmm3, %xmm3
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7]
+; AVX2-FP-NEXT: vmovdqa (%rsp), %xmm14 # 16-byte Reload
+; AVX2-FP-NEXT: vpblendd $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm3 # 16-byte Folded Reload
+; AVX2-FP-NEXT: # xmm3 = xmm14[0,1],mem[2],xmm14[3]
+; AVX2-FP-NEXT: vpshufb %xmm5, %xmm3, %xmm3
+; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2,3,4],ymm3[5,6,7],ymm1[8,9,10,11,12],ymm3[13,14,15]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vmovdqa %ymm10, %ymm11
+; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm11[0],ymm10[1],ymm11[2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10],ymm10[11],ymm11[12,13],ymm10[14],ymm11[15]
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5],ymm4[6],ymm3[7]
+; AVX2-FP-NEXT: vpshufb %ymm7, %ymm3, %ymm3
+; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0],ymm12[1,2],ymm1[3],ymm12[4],ymm1[5],ymm12[6,7],ymm1[8],ymm12[9,10],ymm1[11],ymm12[12],ymm1[13],ymm12[14,15]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm6
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2],xmm4[3]
+; AVX2-FP-NEXT: vpshufb %xmm2, %xmm4, %xmm2
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
+; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX2-FP-NEXT: vpblendd $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
+; AVX2-FP-NEXT: # xmm3 = xmm0[0,1],mem[2],xmm0[3]
+; AVX2-FP-NEXT: vpshufb %xmm5, %xmm3, %xmm3
+; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2,3,4],ymm3[5,6,7],ymm2[8,9,10,11,12],ymm3[13,14,15]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FP-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload
+; AVX2-FP-NEXT: # ymm3 = mem[0,1],ymm2[2],mem[3],ymm2[4],mem[5,6],ymm2[7],mem[8,9],ymm2[10],mem[11],ymm2[12],mem[13,14],ymm2[15]
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6],ymm4[7]
+; AVX2-FP-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm4 # 32-byte Folded Reload
+; AVX2-FP-NEXT: # ymm4 = mem[0],ymm13[1],mem[2,3],ymm13[4],mem[5],ymm13[6],mem[7,8],ymm13[9],mem[10,11],ymm13[12],mem[13],ymm13[14],mem[15]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm5
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7]
+; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15]
+; AVX2-FP-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX2-FP-NEXT: vpshufb %ymm2, %ymm3, %ymm3
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm4 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
+; AVX2-FP-NEXT: vpshufb %xmm4, %xmm5, %xmm5
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3,4,5,6,7]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [12,13,14,15,4,5,14,15,8,9,10,11,12,13,14,15]
+; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX2-FP-NEXT: vpshufb %xmm5, %xmm6, %xmm7
+; AVX2-FP-NEXT: vmovdqa 288(%rdi), %xmm8
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,10,11,8,9,10,11,12,13,14,15]
+; AVX2-FP-NEXT: vpshufb %xmm6, %xmm8, %xmm8
+; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
+; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm7[6,7]
+; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FP-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
+; AVX2-FP-NEXT: # ymm7 = ymm7[0,1],mem[2],ymm7[3],mem[4],ymm7[5,6],mem[7],ymm7[8,9],mem[10],ymm7[11],mem[12],ymm7[13,14],mem[15]
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4],ymm7[5,6],ymm8[7]
+; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FP-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
+; AVX2-FP-NEXT: # ymm8 = ymm8[0],mem[1],ymm8[2,3],mem[4],ymm8[5],mem[6],ymm8[7,8],mem[9],ymm8[10,11],mem[12],ymm8[13],mem[14],ymm8[15]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm9
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3,4],xmm9[5,6,7]
+; AVX2-FP-NEXT: vpshufb %ymm2, %ymm7, %ymm7
+; AVX2-FP-NEXT: vpshufb %xmm4, %xmm8, %xmm8
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7]
+; AVX2-FP-NEXT: vpshufb %xmm5, %xmm15, %xmm8
+; AVX2-FP-NEXT: vmovdqa 608(%rdi), %xmm9
+; AVX2-FP-NEXT: vpshufb %xmm6, %xmm9, %xmm9
+; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
+; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm7[0,1,2,3,4,5],ymm8[6,7]
+; AVX2-FP-NEXT: vpshufb %xmm5, %xmm0, %xmm7
+; AVX2-FP-NEXT: vmovdqa 128(%rdi), %xmm9
+; AVX2-FP-NEXT: vpshufb %xmm6, %xmm9, %xmm9
+; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0,1],ymm10[2],ymm11[3],ymm10[4],ymm11[5,6],ymm10[7],ymm11[8,9],ymm10[10],ymm11[11],ymm10[12],ymm11[13,14],ymm10[15]
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm9[2,3,0,1]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4],ymm9[5,6],ymm11[7]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm1[1],ymm12[2,3],ymm1[4],ymm12[5],ymm1[6],ymm12[7,8],ymm1[9],ymm12[10,11],ymm1[12],ymm12[13],ymm1[14],ymm12[15]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm11, %xmm12
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3,4],xmm12[5,6,7]
+; AVX2-FP-NEXT: vpshufb %ymm2, %ymm9, %ymm9
+; AVX2-FP-NEXT: vpshufb %xmm4, %xmm11, %xmm11
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2],ymm9[3,4,5,6,7]
+; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5],ymm7[6,7]
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-FP-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7],ymm0[8,9],mem[10],ymm0[11],mem[12],ymm0[13,14],mem[15]
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6],ymm1[7]
-; AVX2-FP-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload
-; AVX2-FP-NEXT: # ymm0 = mem[0],ymm8[1],mem[2,3],ymm8[4],mem[5],ymm8[6],mem[7,8],ymm8[9],mem[10,11],ymm8[12],mem[13],ymm8[14],mem[15]
-; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7]
-; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15]
-; AVX2-FP-NEXT: # ymm0 = mem[0,1,0,1]
-; AVX2-FP-NEXT: vpshufb %ymm0, %ymm1, %ymm1
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
-; AVX2-FP-NEXT: vpshufb %xmm13, %xmm2, %xmm2
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm9 = [12,13,14,15,4,5,14,15,8,9,10,11,12,13,14,15]
-; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-FP-NEXT: vpshufb %xmm9, %xmm1, %xmm4
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,10,11,8,9,10,11,12,13,14,15]
-; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX2-FP-NEXT: vpshufb %xmm1, %xmm6, %xmm6
-; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
-; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0,1,2,3,4,5],ymm4[6,7]
-; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FP-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX2-FP-NEXT: # ymm2 = ymm2[0,1],mem[2],ymm2[3],mem[4],ymm2[5,6],mem[7],ymm2[8,9],mem[10],ymm2[11],mem[12],ymm2[13,14],mem[15]
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6],ymm4[7]
-; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FP-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
-; AVX2-FP-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3],mem[4],ymm4[5],mem[6],ymm4[7,8],mem[9],ymm4[10,11],mem[12],ymm4[13],mem[14],ymm4[15]
-; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm14
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm14[0,1,2],xmm4[3,4],xmm14[5,6,7]
-; AVX2-FP-NEXT: vpshufb %ymm0, %ymm2, %ymm2
-; AVX2-FP-NEXT: vpshufb %xmm13, %xmm4, %xmm4
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX2-FP-NEXT: vpshufb %xmm9, %xmm4, %xmm4
-; AVX2-FP-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload
-; AVX2-FP-NEXT: vpshufb %xmm1, %xmm8, %xmm8
-; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1]
-; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
-; AVX2-FP-NEXT: vpshufb %xmm9, %xmm3, %xmm4
-; AVX2-FP-NEXT: vpshufb %xmm1, %xmm5, %xmm5
-; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm12[0,1],ymm7[2],ymm12[3],ymm7[4],ymm12[5,6],ymm7[7],ymm12[8,9],ymm7[10],ymm12[11],ymm7[12],ymm12[13,14],ymm7[15]
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm5[2,3,0,1]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4],ymm5[5,6],ymm7[7]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15]
-; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm8
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4],xmm8[5,6,7]
-; AVX2-FP-NEXT: vpshufb %ymm0, %ymm5, %ymm5
-; AVX2-FP-NEXT: vpshufb %xmm13, %xmm7, %xmm7
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3,4,5,6,7]
-; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
-; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FP-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload
-; AVX2-FP-NEXT: # ymm5 = mem[0,1],ymm3[2],mem[3],ymm3[4],mem[5,6],ymm3[7],mem[8,9],ymm3[10],mem[11],ymm3[12],mem[13,14],ymm3[15]
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm5[2,3,0,1]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4],ymm5[5,6],ymm7[7]
-; AVX2-FP-NEXT: vpshufb %ymm0, %ymm5, %ymm0
-; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FP-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload
-; AVX2-FP-NEXT: # ymm5 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5],mem[6],ymm3[7,8],mem[9],ymm3[10,11],mem[12],ymm3[13],mem[14],ymm3[15]
-; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm7
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3,4],xmm7[5,6,7]
-; AVX2-FP-NEXT: vpshufb %xmm13, %xmm5, %xmm3
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7]
-; AVX2-FP-NEXT: vpshufb %xmm9, %xmm15, %xmm3
-; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX2-FP-NEXT: vpshufb %xmm1, %xmm5, %xmm1
-; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT: vmovaps %ymm1, 64(%rsi)
-; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT: vmovaps %ymm1, (%rsi)
-; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT: vmovaps %ymm1, 96(%rsi)
-; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rsi)
-; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT: vmovaps %ymm1, 64(%rdx)
-; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT: vmovaps %ymm1, (%rdx)
-; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT: vmovaps %ymm1, 96(%rdx)
-; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rdx)
-; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT: vmovaps %ymm1, 64(%rcx)
-; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT: vmovaps %ymm1, (%rcx)
-; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT: vmovaps %ymm1, 96(%rcx)
-; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rcx)
-; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT: vmovaps %ymm1, 64(%r8)
-; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT: vmovaps %ymm1, (%r8)
-; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT: vmovaps %ymm1, 96(%r8)
-; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT: vmovaps %ymm1, 32(%r8)
-; AVX2-FP-NEXT: vmovdqa %ymm0, 64(%r9)
-; AVX2-FP-NEXT: vmovdqa %ymm4, (%r9)
-; AVX2-FP-NEXT: vmovdqa %ymm2, 96(%r9)
-; AVX2-FP-NEXT: vmovdqa %ymm6, 32(%r9)
-; AVX2-FP-NEXT: addq $1080, %rsp # imm = 0x438
+; AVX2-FP-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
+; AVX2-FP-NEXT: # ymm9 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7],mem[8,9],ymm0[10],mem[11],ymm0[12],mem[13,14],ymm0[15]
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm9[2,3,0,1]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4],ymm9[5,6],ymm11[7]
+; AVX2-FP-NEXT: vpshufb %ymm2, %ymm9, %ymm9
+; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
+; AVX2-FP-NEXT: # ymm10 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5],mem[6],ymm0[7,8],mem[9],ymm0[10,11],mem[12],ymm0[13],mem[14],ymm0[15]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm10, %xmm11
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3,4],xmm11[5,6,7]
+; AVX2-FP-NEXT: vpshufb %xmm4, %xmm10, %xmm4
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm9[3,4,5,6,7]
+; AVX2-FP-NEXT: vpshufb %xmm5, %xmm14, %xmm5
+; AVX2-FP-NEXT: vmovdqa 448(%rdi), %xmm9
+; AVX2-FP-NEXT: vpshufb %xmm6, %xmm9, %xmm6
+; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
+; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
+; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT: vmovaps %ymm0, 64(%rsi)
+; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT: vmovaps %ymm0, (%rsi)
+; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT: vmovaps %ymm0, 96(%rsi)
+; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT: vmovaps %ymm0, 32(%rsi)
+; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT: vmovaps %ymm0, 64(%rdx)
+; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT: vmovaps %ymm0, (%rdx)
+; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT: vmovaps %ymm0, 96(%rdx)
+; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT: vmovaps %ymm0, 32(%rdx)
+; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT: vmovaps %ymm0, 64(%rcx)
+; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT: vmovaps %ymm0, (%rcx)
+; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT: vmovaps %ymm0, 96(%rcx)
+; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT: vmovaps %ymm0, 32(%rcx)
+; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT: vmovaps %ymm0, 64(%r8)
+; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT: vmovaps %ymm0, (%r8)
+; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT: vmovaps %ymm0, 96(%r8)
+; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT: vmovaps %ymm0, 32(%r8)
+; AVX2-FP-NEXT: vmovdqa %ymm4, 64(%r9)
+; AVX2-FP-NEXT: vmovdqa %ymm7, (%r9)
+; AVX2-FP-NEXT: vmovdqa %ymm8, 96(%r9)
+; AVX2-FP-NEXT: vmovdqa %ymm3, 32(%r9)
+; AVX2-FP-NEXT: addq $984, %rsp # imm = 0x3D8
; AVX2-FP-NEXT: vzeroupper
; AVX2-FP-NEXT: retq
;
; AVX2-FCP-LABEL: load_i16_stride5_vf64:
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: subq $1000, %rsp # imm = 0x3E8
-; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm9
-; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa 544(%rdi), %ymm14
+; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm10
+; AVX2-FCP-NEXT: vmovdqa 544(%rdi), %ymm6
+; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa 576(%rdi), %ymm11
-; AVX2-FCP-NEXT: vmovdqa 512(%rdi), %ymm8
-; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa 480(%rdi), %ymm10
-; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm4
-; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm3
+; AVX2-FCP-NEXT: vmovdqa 512(%rdi), %ymm13
+; AVX2-FCP-NEXT: vmovdqa 480(%rdi), %ymm12
+; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm3
; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm4
+; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm0
-; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm1
; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
@@ -7502,395 +7569,396 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4,5],xmm1[6,7]
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7]
; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm2
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4],ymm4[5],ymm3[6,7],ymm4[8],ymm3[9,10],ymm4[11],ymm3[12],ymm4[13],ymm3[14,15]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1,2],ymm3[3],ymm4[4],ymm3[5],ymm4[6,7],ymm3[8],ymm4[9,10],ymm3[11],ymm4[12],ymm3[13],ymm4[14,15]
; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,3,0,2,4,6,1,3]
; AVX2-FCP-NEXT: vpermd %ymm3, %ymm1, %ymm4
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,8,9,14,15,4,5,14,15,4,5,2,3,16,17,22,23,24,25,30,31,20,21,30,31,20,21,18,19]
; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm4, %ymm4
-; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} xmm13 = [65535,65535,65535,65535,65535,65535,65535,0]
-; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm2, %ymm4, %ymm2
+; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,65535,65535,0]
+; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2
; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0],ymm8[1],ymm10[2,3],ymm8[4],ymm10[5],ymm8[6],ymm10[7,8],ymm8[9],ymm10[10,11],ymm8[12],ymm10[13],ymm8[14],ymm10[15]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5],ymm13[6],ymm12[7,8],ymm13[9],ymm12[10,11],ymm13[12],ymm12[13],ymm13[14],ymm12[15]
+; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4,5],xmm4[6,7]
; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm2
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0],ymm11[1,2],ymm14[3],ymm11[4],ymm14[5],ymm11[6,7],ymm14[8],ymm11[9,10],ymm14[11],ymm11[12],ymm14[13],ymm11[14,15]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm11[1,2],ymm6[3],ymm11[4],ymm6[5],ymm11[6,7],ymm6[8],ymm11[9,10],ymm6[11],ymm11[12],ymm6[13],ymm11[14,15]
; AVX2-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm4
; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm4, %ymm4
-; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm2, %ymm4, %ymm2
+; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2
; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm15
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0],ymm15[1],ymm9[2,3],ymm15[4],ymm9[5],ymm15[6],ymm9[7,8],ymm15[9],ymm9[10,11],ymm15[12],ymm9[13],ymm15[14],ymm9[15]
-; AVX2-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovdqa %ymm5, %ymm8
+; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm9
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13],ymm9[14],ymm10[15]
+; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4,5],xmm4[6,7]
-; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm8
-; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm12
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm8[0],ymm12[1,2],ymm8[3],ymm12[4],ymm8[5],ymm12[6,7],ymm8[8],ymm12[9,10],ymm8[11],ymm12[12],ymm8[13],ymm12[14,15]
-; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm7
+; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm4
+; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0],ymm4[1,2],ymm7[3],ymm4[4],ymm7[5],ymm4[6,7],ymm7[8],ymm4[9,10],ymm7[11],ymm4[12],ymm7[13],ymm4[14,15]
+; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm4
; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm4, %ymm4
; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm2
-; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm2, %ymm4, %ymm2
-; AVX2-FCP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm9
-; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm7
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5],ymm9[6],ymm7[7,8],ymm9[9],ymm7[10,11],ymm9[12],ymm7[13],ymm9[14],ymm7[15]
-; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2
+; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm5
+; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm14
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm14[0],ymm5[1],ymm14[2,3],ymm5[4],ymm14[5],ymm5[6],ymm14[7,8],ymm5[9],ymm14[10,11],ymm5[12],ymm14[13],ymm5[14],ymm14[15]
+; AVX2-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4,5],xmm4[6,7]
; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm0
-; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %ymm5
-; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %ymm10
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm10[1,2],ymm5[3],ymm10[4],ymm5[5],ymm10[6,7],ymm5[8],ymm10[9,10],ymm5[11],ymm10[12],ymm5[13],ymm10[14,15]
-; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %ymm15
+; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %ymm6
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm15[0],ymm6[1,2],ymm15[3],ymm6[4],ymm15[5],ymm6[6,7],ymm15[8],ymm6[9,10],ymm15[11],ymm6[12],ymm15[13],ymm6[14,15]
+; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm1
; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1
-; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm0
+; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT: vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: vpblendw $181, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm0 = mem[0],ymm0[1],mem[2],ymm0[3],mem[4,5],ymm0[6],mem[7,8],ymm0[9],mem[10],ymm0[11],mem[12,13],ymm0[14],mem[15]
; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7]
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11]
; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5],ymm1[6],mem[7,8],ymm1[9],mem[10,11],ymm1[12],mem[13],ymm1[14],mem[15]
+; AVX2-FCP-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5],mem[6],ymm1[7,8],mem[9],ymm1[10,11],mem[12],ymm1[13],mem[14],ymm1[15]
; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [2,0,0,0,4,7,1,6]
; AVX2-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm3
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,4,5,10,11,0,1,14,15,2,3,12,13,0,1,18,19,20,21,26,27,16,17,30,31,18,19,28,29,16,17]
; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm3
-; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm0, %ymm3, %ymm6
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2],mem[3],ymm0[4,5],mem[6],ymm0[7,8],mem[9],ymm0[10],mem[11],ymm0[12,13],mem[14],ymm0[15]
+; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm0, %ymm3, %ymm0
+; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0],ymm12[1],ymm13[2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7,8],ymm12[9],ymm13[10],ymm12[11],ymm13[12,13],ymm12[14],ymm13[15]
; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5,6],xmm3[7]
; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm0
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm11[0],ymm14[1],ymm11[2,3],ymm14[4],ymm11[5],ymm14[6],ymm11[7,8],ymm14[9],ymm11[10,11],ymm14[12],ymm11[13],ymm14[14],ymm11[15]
+; AVX2-FCP-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm3 = ymm11[0],mem[1],ymm11[2,3],mem[4],ymm11[5],mem[6],ymm11[7,8],mem[9],ymm11[10,11],mem[12],ymm11[13],mem[14],ymm11[15]
; AVX2-FCP-NEXT: vpermd %ymm3, %ymm2, %ymm3
; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm3
-; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm0, %ymm3, %ymm3
-; AVX2-FCP-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm0 = ymm15[0],mem[1],ymm15[2],mem[3],ymm15[4,5],mem[6],ymm15[7,8],mem[9],ymm15[10],mem[11],ymm15[12,13],mem[14],ymm15[15]
+; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm0, %ymm3, %ymm3
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm10[1],ymm9[2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7,8],ymm10[9],ymm9[10],ymm10[11],ymm9[12,13],ymm10[14],ymm9[15]
; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm11
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm11[2,3],xmm0[4,5,6],xmm11[7]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm8[1],ymm12[2,3],ymm8[4],ymm12[5],ymm8[6],ymm12[7,8],ymm8[9],ymm12[10,11],ymm8[12],ymm12[13],ymm8[14],ymm12[15]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm9[0],ymm7[1],ymm9[2,3],ymm7[4],ymm9[5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10,11],ymm7[12],ymm9[13],ymm7[14],ymm9[15]
; AVX2-FCP-NEXT: vpermd %ymm11, %ymm2, %ymm11
; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm11, %ymm11
; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm0
-; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm0, %ymm11, %ymm0
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm9[0],ymm7[1],ymm9[2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10],ymm7[11],ymm9[12,13],ymm7[14],ymm9[15]
+; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm0, %ymm11, %ymm0
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm5[0],ymm14[1],ymm5[2],ymm14[3],ymm5[4,5],ymm14[6],ymm5[7,8],ymm14[9],ymm5[10],ymm14[11],ymm5[12,13],ymm14[14],ymm5[15]
; AVX2-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm12[2,3],xmm11[4,5,6],xmm12[7]
; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm11, %xmm4
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm10[0],ymm5[1],ymm10[2,3],ymm5[4],ymm10[5],ymm5[6],ymm10[7,8],ymm5[9],ymm10[10,11],ymm5[12],ymm10[13],ymm5[14],ymm10[15]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm6[0],ymm15[1],ymm6[2,3],ymm15[4],ymm6[5],ymm15[6],ymm6[7,8],ymm15[9],ymm6[10,11],ymm15[12],ymm6[13],ymm15[14],ymm6[15]
; AVX2-FCP-NEXT: vpermd %ymm11, %ymm2, %ymm2
; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1
-; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm4, %ymm1, %ymm1
-; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm12
-; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,3,1,3,0,3,5,7]
-; AVX2-FCP-NEXT: vpermd %ymm12, %ymm5, %ymm11
-; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm14 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27]
-; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm11, %ymm11
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0,1,2,3,4],ymm11[5,6,7],ymm2[8,9,10,11,12],ymm11[13,14,15]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa 608(%rdi), %ymm10
-; AVX2-FCP-NEXT: vpermd %ymm10, %ymm5, %ymm11
-; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm11, %ymm11
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0,1,2,3,4],ymm11[5,6,7],ymm2[8,9,10,11,12],ymm11[13,14,15]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm9
-; AVX2-FCP-NEXT: vpermd %ymm9, %ymm5, %ymm11
-; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm11, %ymm11
-; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0,1,2,3,4],ymm11[5,6,7],ymm2[8,9,10,11,12],ymm11[13,14,15]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill
+; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm4, %ymm1, %ymm5
+; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm2
+; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,3,1,3,0,3,5,7]
+; AVX2-FCP-NEXT: vpermd %ymm2, %ymm11, %ymm4
+; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27]
+; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm4
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0,1,2,3,4],ymm4[5,6,7],ymm1[8,9,10,11,12],ymm4[13,14,15]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovdqa 608(%rdi), %ymm13
+; AVX2-FCP-NEXT: vpermd %ymm13, %ymm11, %ymm10
+; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm10, %ymm10
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0,1,2,3,4],ymm10[5,6,7],ymm1[8,9,10,11,12],ymm10[13,14,15]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7]
+; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm12
+; AVX2-FCP-NEXT: vpermd %ymm12, %ymm11, %ymm10
+; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm10, %ymm10
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0,1,2,3,4],ymm10[5,6,7],ymm1[8,9,10,11,12],ymm10[13,14,15]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7]
+; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa 448(%rdi), %ymm4
-; AVX2-FCP-NEXT: vpermd %ymm4, %ymm5, %ymm5
+; AVX2-FCP-NEXT: vpermd %ymm4, %ymm11, %ymm10
; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm5, %ymm5
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0,1,2,3,4],ymm5[5,6,7],ymm2[8,9,10,11,12],ymm5[13,14,15]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,3,2,3,1,3,6,7]
-; AVX2-FCP-NEXT: vpermd %ymm12, %ymm7, %ymm11
-; AVX2-FCP-NEXT: vmovdqa %ymm12, %ymm8
-; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm13 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25]
-; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm11, %ymm11
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm6[0,1,2,3,4],ymm11[5,6,7],ymm6[8,9,10,11,12],ymm11[13,14,15]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm11[4,5,6,7]
+; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm10, %ymm10
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0,1,2,3,4],ymm10[5,6,7],ymm1[8,9,10,11,12],ymm10[13,14,15]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7]
+; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,3,2,3,1,3,6,7]
+; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm10
; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpermd %ymm10, %ymm7, %ymm6
-; AVX2-FCP-NEXT: vmovdqa %ymm10, %ymm12
-; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm6, %ymm6
+; AVX2-FCP-NEXT: vpermd %ymm2, %ymm8, %ymm11
+; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm14 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25]
+; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm11, %ymm11
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm1[0,1,2,3,4],ymm11[5,6,7],ymm1[8,9,10,11,12],ymm11[13,14,15]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7]
+; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpermd %ymm13, %ymm8, %ymm6
+; AVX2-FCP-NEXT: vmovdqa %ymm13, %ymm7
+; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm6, %ymm6
; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm6[4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpermd %ymm9, %ymm7, %ymm3
-; AVX2-FCP-NEXT: vmovdqa %ymm9, %ymm5
-; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm3, %ymm3
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm6[4,5,6,7]
+; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpermd %ymm12, %ymm8, %ymm3
+; AVX2-FCP-NEXT: vmovdqa %ymm12, %ymm2
+; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm3
; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2,3,4],ymm3[5,6,7],ymm0[8,9,10,11,12],ymm3[13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpermd %ymm4, %ymm7, %ymm0
-; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm0, %ymm0
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT: vpermd %ymm4, %ymm8, %ymm0
+; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm0, %ymm0
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5,6,7],ymm5[8,9,10,11,12],ymm0[13,14,15]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1],ymm4[2],ymm9[3],ymm4[4],ymm9[5,6],ymm4[7],ymm9[8,9],ymm4[10],ymm9[11],ymm4[12],ymm9[13,14],ymm4[15]
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7]
+; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7],ymm0[8,9],mem[10],ymm0[11],mem[12],ymm0[13,14],mem[15]
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1,2],xmm3[3,4],xmm0[5,6,7]
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
-; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm6
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm3 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5],mem[6],ymm1[7,8],mem[9],ymm1[10,11],mem[12],ymm1[13],mem[14],ymm1[15]
-; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,0,0,5,7,2,4]
-; AVX2-FCP-NEXT: vpermd %ymm3, %ymm1, %ymm7
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23]
-; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm7, %ymm7
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2],ymm7[3,4,5,6,7]
-; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [1,4,6,0,1,4,6,0]
-; AVX2-FCP-NEXT: # ymm6 = mem[0,1,0,1]
-; AVX2-FCP-NEXT: vpermd %ymm8, %ymm6, %ymm11
-; AVX2-FCP-NEXT: vmovdqa %ymm14, %ymm8
-; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm11, %ymm11
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm7[0,1,2,3,4],ymm11[5,6,7],ymm7[8,9,10,11,12],ymm11[13,14,15]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm11[4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm5
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm15[0,1],ymm14[2],ymm15[3],ymm14[4],ymm15[5,6],ymm14[7],ymm15[8,9],ymm14[10],ymm15[11],ymm14[12],ymm15[13,14],ymm14[15]
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm11
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm11[3,4],xmm7[5,6,7]
-; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm7, %xmm7
+; AVX2-FCP-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm6 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm6 = ymm15[0],mem[1],ymm15[2,3],mem[4],ymm15[5],mem[6],ymm15[7,8],mem[9],ymm15[10,11],mem[12],ymm15[13],mem[14],ymm15[15]
+; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,2,0,0,5,7,2,4]
+; AVX2-FCP-NEXT: vpermd %ymm6, %ymm3, %ymm8
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23]
+; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm8, %ymm8
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm8[3,4,5,6,7]
+; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [1,4,6,0,1,4,6,0]
+; AVX2-FCP-NEXT: # ymm8 = mem[0,1,0,1]
+; AVX2-FCP-NEXT: vpermd %ymm10, %ymm8, %ymm11
+; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27]
+; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm11, %ymm11
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm5[0,1,2,3,4],ymm11[5,6,7],ymm5[8,9,10,11,12],ymm11[13,14,15]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm11[4,5,6,7]
+; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
+; AVX2-FCP-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm5 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm5 = ymm14[0,1],mem[2],ymm14[3],mem[4],ymm14[5,6],mem[7],ymm14[8,9],mem[10],ymm14[11],mem[12],ymm14[13,14],mem[15]
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm11
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm11[3,4],xmm5[5,6,7]
+; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm5
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX2-FCP-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm11 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm11 = ymm13[0],mem[1],ymm13[2,3],mem[4],ymm13[5],mem[6],ymm13[7,8],mem[9],ymm13[10,11],mem[12],ymm13[13],mem[14],ymm13[15]
+; AVX2-FCP-NEXT: vpermd %ymm11, %ymm3, %ymm11
+; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm11, %ymm11
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm11[3,4,5,6,7]
+; AVX2-FCP-NEXT: vpermd %ymm7, %ymm8, %ymm11
+; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm11, %ymm11
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm5[0,1,2,3,4],ymm11[5,6,7],ymm5[8,9,10,11,12],ymm11[13,14,15]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm11[4,5,6,7]
+; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-FCP-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm11 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm11 = ymm10[0],mem[1],ymm10[2,3],mem[4],ymm10[5],mem[6],ymm10[7,8],mem[9],ymm10[10,11],mem[12],ymm10[13],mem[14],ymm10[15]
-; AVX2-FCP-NEXT: vpermd %ymm11, %ymm1, %ymm11
-; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm11, %ymm11
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3,4,5,6,7]
-; AVX2-FCP-NEXT: vpermd %ymm12, %ymm6, %ymm11
-; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm11, %ymm11
-; AVX2-FCP-NEXT: vmovdqa %ymm8, %ymm13
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm7[0,1,2,3,4],ymm11[5,6,7],ymm7[8,9,10,11,12],ymm11[13,14,15]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm11[4,5,6,7]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0,1],ymm12[2],ymm10[3],ymm12[4],ymm10[5,6],ymm12[7],ymm10[8,9],ymm12[10],ymm10[11],ymm12[12],ymm10[13,14],ymm12[15]
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm11
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm11[3,4],xmm5[5,6,7]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm1[0],ymm9[1],ymm1[2,3],ymm9[4],ymm1[5],ymm9[6],ymm1[7,8],ymm9[9],ymm1[10,11],ymm9[12],ymm1[13],ymm9[14],ymm1[15]
+; AVX2-FCP-NEXT: vpermd %ymm11, %ymm3, %ymm11
+; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm11, %ymm11
+; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm5
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm11[3,4,5,6,7]
+; AVX2-FCP-NEXT: vpermd %ymm2, %ymm8, %ymm11
+; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm11, %ymm11
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm5[0,1,2,3,4],ymm11[5,6,7],ymm5[8,9,10,11,12],ymm11[13,14,15]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm11[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-FCP-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm7 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm7 = mem[0,1],ymm12[2],mem[3],ymm12[4],mem[5,6],ymm12[7],mem[8,9],ymm12[10],mem[11],ymm12[12],mem[13,14],ymm12[15]
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm11
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm11[3,4],xmm7[5,6,7]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0],ymm8[1],ymm2[2,3],ymm8[4],ymm2[5],ymm8[6],ymm2[7,8],ymm8[9],ymm2[10,11],ymm8[12],ymm2[13],ymm8[14],ymm2[15]
-; AVX2-FCP-NEXT: vpermd %ymm11, %ymm1, %ymm11
-; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm11, %ymm11
-; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm7, %xmm7
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3,4,5,6,7]
-; AVX2-FCP-NEXT: vpermd %ymm5, %ymm6, %ymm11
-; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm11, %ymm11
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm7[0,1,2,3,4],ymm11[5,6,7],ymm7[8,9,10,11,12],ymm11[13,14,15]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm11[4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX2-FCP-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm7 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm7 = mem[0,1],ymm13[2],mem[3],ymm13[4],mem[5,6],ymm13[7],mem[8,9],ymm13[10],mem[11],ymm13[12],mem[13,14],ymm13[15]
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm11
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm11[3,4],xmm7[5,6,7]
-; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm7, %xmm0
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm7 = ymm5[0],mem[1],ymm5[2,3],mem[4],ymm5[5],mem[6],ymm5[7,8],mem[9],ymm5[10,11],mem[12],ymm5[13],mem[14],ymm5[15]
-; AVX2-FCP-NEXT: vpermd %ymm7, %ymm1, %ymm1
-; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm9[1,2],ymm4[3],ymm9[4],ymm4[5],ymm9[6,7],ymm4[8],ymm9[9,10],ymm4[11],ymm9[12],ymm4[13],ymm9[14,15]
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
-; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm0
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-FCP-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm3 = ymm11[0],mem[1],ymm11[2],mem[3],ymm11[4,5],mem[6],ymm11[7,8],mem[9],ymm11[10],mem[11],ymm11[12,13],mem[14],ymm11[15]
-; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,3,0,0,5,0,2,7]
-; AVX2-FCP-NEXT: vpermd %ymm3, %ymm9, %ymm6
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21]
-; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm6, %ymm6
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3,4,5,6,7]
-; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,4,7,0,2,4,7,0]
-; AVX2-FCP-NEXT: # ymm6 = mem[0,1,0,1]
-; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25]
-; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm7, %ymm7
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0,1,2,3,4],ymm7[5,6,7],ymm0[8,9,10,11,12],ymm7[13,14,15]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FCP-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm5 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm5 = ymm9[0,1],mem[2],ymm9[3],mem[4],ymm9[5,6],mem[7],ymm9[8,9],mem[10],ymm9[11],mem[12],ymm9[13,14],mem[15]
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm11
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm11[3,4],xmm5[5,6,7]
+; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm0
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0],ymm7[1],ymm4[2,3],ymm7[4],ymm4[5],ymm7[6],ymm4[7,8],ymm7[9],ymm4[10,11],ymm7[12],ymm4[13],ymm7[14],ymm4[15]
+; AVX2-FCP-NEXT: vpermd %ymm5, %ymm3, %ymm3
+; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm3
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7]
+; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm3 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2,3,4],ymm2[5,6,7],ymm0[8,9,10,11,12],ymm2[13,14,15]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0],ymm15[1,2],ymm14[3],ymm15[4],ymm14[5],ymm15[6,7],ymm14[8],ymm15[9,10],ymm14[11],ymm15[12],ymm14[13],ymm15[14,15]
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm7
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0],xmm0[1],xmm7[2],xmm0[3]
-; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm0
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm15[0],ymm10[1],ymm15[2],ymm10[3],ymm15[4,5],ymm10[6],ymm15[7,8],ymm10[9],ymm15[10],ymm10[11],ymm15[12,13],ymm10[14],ymm15[15]
-; AVX2-FCP-NEXT: vpermd %ymm7, %ymm9, %ymm7
-; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm7, %ymm7
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm7[3,4,5,6,7]
-; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm7, %ymm7
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0,1,2,3,4],ymm7[5,6,7],ymm0[8,9,10,11,12],ymm7[13,14,15]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
+; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,1,6,3]
+; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT: vpblendw $41, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm0 = mem[0],ymm0[1,2],mem[3],ymm0[4],mem[5],ymm0[6,7],mem[8],ymm0[9,10],mem[11],ymm0[12],mem[13],ymm0[14,15]
+; AVX2-FCP-NEXT: vpermd %ymm0, %ymm2, %ymm0
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
+; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm5
+; AVX2-FCP-NEXT: vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm6 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm6 = mem[0],ymm15[1],mem[2],ymm15[3],mem[4,5],ymm15[6],mem[7,8],ymm15[9],mem[10],ymm15[11],mem[12,13],ymm15[14],mem[15]
+; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,3,0,0,5,0,2,7]
+; AVX2-FCP-NEXT: vpermd %ymm6, %ymm15, %ymm8
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21]
+; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm8, %ymm8
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2],ymm8[3,4,5,6,7]
+; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,4,7,0,2,4,7,0]
+; AVX2-FCP-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm0 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25]
+; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm11, %ymm11
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm8[0,1,2,3,4],ymm11[5,6,7],ymm8[8,9,10,11,12],ymm11[13,14,15]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7]
+; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpblendw $41, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm8 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm8 = mem[0],ymm14[1,2],mem[3],ymm14[4],mem[5],ymm14[6,7],mem[8],ymm14[9,10],mem[11],ymm14[12],mem[13],ymm14[14,15]
+; AVX2-FCP-NEXT: vpermd %ymm8, %ymm2, %ymm8
+; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm8
+; AVX2-FCP-NEXT: vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm11 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm11 = mem[0],ymm13[1],mem[2],ymm13[3],mem[4,5],ymm13[6],mem[7,8],ymm13[9],mem[10],ymm13[11],mem[12,13],ymm13[14],mem[15]
+; AVX2-FCP-NEXT: vpermd %ymm11, %ymm15, %ymm11
+; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm11, %ymm11
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm11[3,4,5,6,7]
+; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm11, %ymm11
+; AVX2-FCP-NEXT: vmovdqa %ymm0, %ymm13
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm8[0,1,2,3,4],ymm11[5,6,7],ymm8[8,9,10,11,12],ymm11[13,14,15]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm11[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpblendw $214, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm0 = ymm12[0],mem[1,2],ymm12[3],mem[4],ymm12[5],mem[6,7],ymm12[8],mem[9,10],ymm12[11],mem[12],ymm12[13],mem[14,15]
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm7
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0],xmm0[1],xmm7[2],xmm0[3]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm2[1],ymm8[2],ymm2[3],ymm8[4,5],ymm2[6],ymm8[7,8],ymm2[9],ymm8[10],ymm2[11],ymm8[12,13],ymm2[14],ymm8[15]
-; AVX2-FCP-NEXT: vmovdqa %ymm8, %ymm12
-; AVX2-FCP-NEXT: vpermd %ymm7, %ymm9, %ymm7
-; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm7, %ymm7
-; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm0
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm7[3,4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-FCP-NEXT: vpermd %ymm8, %ymm6, %ymm7
-; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm7, %ymm7
-; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm2
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0,1,2,3,4],ymm7[5,6,7],ymm0[8,9,10,11,12],ymm7[13,14,15]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm14[0],ymm1[1],ymm14[2],ymm1[3],ymm14[4,5],ymm1[6],ymm14[7,8],ymm1[9],ymm14[10],ymm1[11],ymm14[12,13],ymm1[14],ymm14[15]
+; AVX2-FCP-NEXT: vpermd %ymm8, %ymm15, %ymm8
+; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm8, %ymm8
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm10[1,2],ymm12[3],ymm10[4],ymm12[5],ymm10[6,7],ymm12[8],ymm10[9,10],ymm12[11],ymm10[12],ymm12[13],ymm10[14,15]
+; AVX2-FCP-NEXT: vpermd %ymm11, %ymm2, %ymm11
+; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm11, %xmm11
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2],ymm8[3,4,5,6,7]
+; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm11, %ymm11
+; AVX2-FCP-NEXT: vmovdqa %ymm13, %ymm1
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm8[0,1,2,3,4],ymm11[5,6,7],ymm8[8,9,10,11,12],ymm11[13,14,15]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm11[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0],ymm10[1,2],ymm13[3],ymm10[4],ymm13[5],ymm10[6,7],ymm13[8],ymm10[9,10],ymm13[11],ymm10[12],ymm13[13],ymm10[14,15]
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm7
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0],xmm0[1],xmm7[2],xmm0[3]
-; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0],ymm5[1],ymm13[2],ymm5[3],ymm13[4,5],ymm5[6],ymm13[7,8],ymm5[9],ymm13[10],ymm5[11],ymm13[12,13],ymm5[14],ymm13[15]
-; AVX2-FCP-NEXT: vpermd %ymm1, %ymm9, %ymm1
-; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX2-FCP-NEXT: vpermd %ymm14, %ymm6, %ymm1
-; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm13[0],ymm9[1,2],ymm13[3],ymm9[4],ymm13[5],ymm9[6,7],ymm13[8],ymm9[9,10],ymm13[11],ymm9[12],ymm13[13],ymm9[14,15]
+; AVX2-FCP-NEXT: vpermd %ymm8, %ymm2, %ymm2
+; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm4[1],ymm7[2],ymm4[3],ymm7[4,5],ymm4[6],ymm7[7,8],ymm4[9],ymm7[10],ymm4[11],ymm7[12,13],ymm4[14],ymm7[15]
+; AVX2-FCP-NEXT: vpermd %ymm3, %ymm15, %ymm0
+; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FCP-NEXT: vpermd %ymm9, %ymm5, %ymm2
+; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm2
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2,3,4],ymm2[5,6,7],ymm0[8,9,10,11,12],ymm2[13,14,15]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5],mem[6],ymm0[7,8],mem[9],ymm0[10,11],mem[12],ymm0[13],mem[14],ymm0[15]
; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7]
-; AVX2-FCP-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm2 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm2 = ymm11[0,1],mem[2],ymm11[3],mem[4],ymm11[5,6],mem[7],ymm11[8,9],mem[10],ymm11[11],mem[12],ymm11[13,14],mem[15]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm2 = mem[0,1],ymm1[2],mem[3],ymm1[4],mem[5,6],ymm1[7],mem[8,9],ymm1[10],mem[11],ymm1[12],mem[13,14],ymm1[15]
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm0
; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,3,0,0,6,0,3,5]
; AVX2-FCP-NEXT: vpermd %ymm2, %ymm5, %ymm2
-; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7,16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7]
-; AVX2-FCP-NEXT: # ymm7 = mem[0,1,0,1]
-; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm2
+; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7,16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7]
+; AVX2-FCP-NEXT: # ymm8 = mem[0,1,0,1]
+; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm2
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7]
-; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,1,3,0,2,5,7]
-; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm4 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31]
-; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FCP-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3],ymm3[4],mem[5],ymm3[6],mem[7,8],ymm3[9],mem[10,11],ymm3[12],mem[13],ymm3[14],mem[15]
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm11
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0,1,2],xmm3[3,4],xmm11[5,6,7]
-; AVX2-FCP-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm11 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm11 = ymm15[0,1],mem[2],ymm15[3],mem[4],ymm15[5,6],mem[7],ymm15[8,9],mem[10],ymm15[11],mem[12],ymm15[13,14],mem[15]
-; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3
+; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,2,1,3,0,2,5,7]
+; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm0 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31]
+; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm1
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm1[6,7]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5],mem[6],ymm1[7,8],mem[9],ymm1[10,11],mem[12],ymm1[13],mem[14],ymm1[15]
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm11
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm11[0,1,2],xmm1[3,4],xmm11[5,6,7]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FCP-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm11 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm11 = mem[0,1],ymm4[2],mem[3],ymm4[4],mem[5,6],ymm4[7],mem[8,9],ymm4[10],mem[11],ymm4[12],mem[13,14],ymm4[15]
+; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1
; AVX2-FCP-NEXT: vpermd %ymm11, %ymm5, %ymm11
-; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm11, %ymm11
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm11[3,4,5,6,7]
-; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm9
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm9[6,7]
-; AVX2-FCP-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm9 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm9 = ymm12[0,1],mem[2],ymm12[3],mem[4],ymm12[5,6],mem[7],ymm12[8,9],mem[10],ymm12[11],mem[12],ymm12[13,14],mem[15]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-FCP-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm11 = mem[0],ymm11[1],mem[2,3],ymm11[4],mem[5],ymm11[6],mem[7,8],ymm11[9],mem[10,11],ymm11[12],mem[13],ymm11[14],mem[15]
+; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm11, %ymm11
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm11[3,4,5,6,7]
+; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm4 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm4
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
+; AVX2-FCP-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm4 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm4 = ymm14[0,1],mem[2],ymm14[3],mem[4],ymm14[5,6],mem[7],ymm14[8,9],mem[10],ymm14[11],mem[12],ymm14[13,14],mem[15]
+; AVX2-FCP-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm11 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm11 = mem[0],ymm12[1],mem[2,3],ymm12[4],mem[5],ymm12[6],mem[7,8],ymm12[9],mem[10,11],ymm12[12],mem[13],ymm12[14],mem[15]
; AVX2-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3,4],xmm12[5,6,7]
-; AVX2-FCP-NEXT: vpermd %ymm9, %ymm5, %ymm9
-; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm9, %ymm9
+; AVX2-FCP-NEXT: vpermd %ymm4, %ymm5, %ymm4
+; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm4
; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm11, %xmm11
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2],ymm9[3,4,5,6,7]
-; AVX2-FCP-NEXT: vpermd %ymm8, %ymm0, %ymm8
-; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm8, %ymm8
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7]
-; AVX2-FCP-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm9 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm9 = ymm10[0],mem[1],ymm10[2,3],mem[4],ymm10[5],mem[6],ymm10[7,8],mem[9],ymm10[10,11],mem[12],ymm10[13],mem[14],ymm10[15]
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm9, %xmm11
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3,4],xmm11[5,6,7]
-; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm9, %xmm6
-; AVX2-FCP-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm9 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm9 = ymm13[0,1],mem[2],ymm13[3],mem[4],ymm13[5,6],mem[7],ymm13[8,9],mem[10],ymm13[11],mem[12],ymm13[13,14],mem[15]
-; AVX2-FCP-NEXT: vpermd %ymm9, %ymm5, %ymm5
-; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm5, %ymm5
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2],ymm4[3,4,5,6,7]
+; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm7 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm7, %ymm7
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm7[6,7]
+; AVX2-FCP-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm7 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm7 = mem[0],ymm13[1],mem[2,3],ymm13[4],mem[5],ymm13[6],mem[7,8],ymm13[9],mem[10,11],ymm13[12],mem[13],ymm13[14],mem[15]
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm11
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm11[0,1,2],xmm7[3,4],xmm11[5,6,7]
+; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm6
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FCP-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm7 = ymm7[0,1],mem[2],ymm7[3],mem[4],ymm7[5,6],mem[7],ymm7[8,9],mem[10],ymm7[11],mem[12],ymm7[13,14],mem[15]
+; AVX2-FCP-NEXT: vpermd %ymm7, %ymm5, %ymm5
+; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm5, %ymm5
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3,4,5,6,7]
-; AVX2-FCP-NEXT: vpermd %ymm14, %ymm0, %ymm0
-; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0
+; AVX2-FCP-NEXT: vpermd %ymm9, %ymm10, %ymm6
+; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm6, %ymm0
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7]
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT: vmovaps %ymm4, 64(%rsi)
-; AVX2-FCP-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT: vmovaps %ymm4, (%rsi)
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT: vmovaps %ymm4, 96(%rsi)
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%rsi)
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT: vmovaps %ymm4, 64(%rdx)
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT: vmovaps %ymm4, (%rdx)
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT: vmovaps %ymm4, 96(%rdx)
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%rdx)
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT: vmovaps %ymm4, 64(%rcx)
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT: vmovaps %ymm4, (%rcx)
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT: vmovaps %ymm4, 96(%rcx)
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%rcx)
-; AVX2-FCP-NEXT: vmovdqa %ymm1, 64(%r8)
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT: vmovaps %ymm1, (%r8)
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%r8)
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%r8)
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT: vmovaps %ymm5, 64(%rsi)
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT: vmovaps %ymm5, (%rsi)
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT: vmovaps %ymm5, 96(%rsi)
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT: vmovaps %ymm5, 32(%rsi)
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT: vmovaps %ymm5, 64(%rdx)
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT: vmovaps %ymm5, (%rdx)
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT: vmovaps %ymm5, 96(%rdx)
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT: vmovaps %ymm5, 32(%rdx)
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT: vmovaps %ymm5, 64(%rcx)
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT: vmovaps %ymm5, (%rcx)
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT: vmovaps %ymm5, 96(%rcx)
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT: vmovaps %ymm5, 32(%rcx)
+; AVX2-FCP-NEXT: vmovdqa %ymm3, 64(%r8)
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FCP-NEXT: vmovaps %ymm3, (%r8)
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FCP-NEXT: vmovaps %ymm3, 96(%r8)
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%r8)
; AVX2-FCP-NEXT: vmovdqa %ymm0, 64(%r9)
-; AVX2-FCP-NEXT: vmovdqa %ymm8, (%r9)
-; AVX2-FCP-NEXT: vmovdqa %ymm3, 96(%r9)
+; AVX2-FCP-NEXT: vmovdqa %ymm4, (%r9)
+; AVX2-FCP-NEXT: vmovdqa %ymm1, 96(%r9)
; AVX2-FCP-NEXT: vmovdqa %ymm2, 32(%r9)
; AVX2-FCP-NEXT: addq $1000, %rsp # imm = 0x3E8
; AVX2-FCP-NEXT: vzeroupper
@@ -7898,35 +7966,35 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
;
; AVX512-LABEL: load_i16_stride5_vf64:
; AVX512: # %bb.0:
-; AVX512-NEXT: subq $552, %rsp # imm = 0x228
-; AVX512-NEXT: vmovdqa 384(%rdi), %ymm6
-; AVX512-NEXT: vmovdqa 416(%rdi), %ymm11
-; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm11[1,2],ymm6[3],ymm11[4],ymm6[5],ymm11[6,7],ymm6[8],ymm11[9,10],ymm6[11],ymm11[12],ymm6[13],ymm11[14,15]
-; AVX512-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: subq $344, %rsp # imm = 0x158
+; AVX512-NEXT: vmovdqa 384(%rdi), %ymm9
+; AVX512-NEXT: vmovdqa 416(%rdi), %ymm8
+; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm8[1,2],ymm9[3],ymm8[4],ymm9[5],ymm8[6,7],ymm9[8],ymm8[9,10],ymm9[11],ymm8[12],ymm9[13],ymm8[14,15]
+; AVX512-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6],ymm1[7]
; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,7,16,17,26,27,20,21,30,31,24,25,128,128,128,128,128,128]
; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm2
-; AVX512-NEXT: vmovdqa 352(%rdi), %ymm8
-; AVX512-NEXT: vmovdqa 320(%rdi), %ymm7
-; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5],ymm8[6],ymm7[7,8],ymm8[9],ymm7[10,11],ymm8[12],ymm7[13],ymm8[14],ymm7[15]
-; AVX512-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vmovdqa 352(%rdi), %ymm11
+; AVX512-NEXT: vmovdqa 320(%rdi), %ymm10
+; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7,8],ymm11[9],ymm10[10,11],ymm11[12],ymm10[13],ymm11[14],ymm10[15]
+; AVX512-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm3
; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3],xmm1[4,5],xmm3[6,7]
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u]
; AVX512-NEXT: vpshufb %ymm1, %ymm3, %ymm3
-; AVX512-NEXT: vporq %ymm2, %ymm3, %ymm19
-; AVX512-NEXT: vmovdqa 192(%rdi), %ymm15
-; AVX512-NEXT: vmovdqa 224(%rdi), %ymm13
-; AVX512-NEXT: vmovdqa 176(%rdi), %xmm12
-; AVX512-NEXT: vmovdqa 160(%rdi), %xmm14
+; AVX512-NEXT: vporq %ymm2, %ymm3, %ymm24
+; AVX512-NEXT: vmovdqa 192(%rdi), %ymm13
+; AVX512-NEXT: vmovdqa 224(%rdi), %ymm12
+; AVX512-NEXT: vmovdqa 176(%rdi), %xmm14
+; AVX512-NEXT: vmovdqa64 160(%rdi), %xmm16
; AVX512-NEXT: vmovdqa (%rdi), %ymm4
; AVX512-NEXT: vmovdqa 32(%rdi), %ymm5
-; AVX512-NEXT: vmovdqa 64(%rdi), %ymm10
-; AVX512-NEXT: vmovdqa 96(%rdi), %ymm9
-; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0],ymm9[1,2],ymm10[3],ymm9[4],ymm10[5],ymm9[6,7],ymm10[8],ymm9[9,10],ymm10[11],ymm9[12],ymm10[13],ymm9[14,15]
+; AVX512-NEXT: vmovdqa 64(%rdi), %ymm7
+; AVX512-NEXT: vmovdqa 96(%rdi), %ymm6
+; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0],ymm6[1,2],ymm7[3],ymm6[4],ymm7[5],ymm6[6,7],ymm7[8],ymm6[9,10],ymm7[11],ymm6[12],ymm7[13],ymm6[14,15]
; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6],ymm3[7]
; AVX512-NEXT: vpshufb %ymm0, %ymm2, %ymm0
@@ -7934,64 +8002,100 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7]
; AVX512-NEXT: vpshufb %ymm1, %ymm2, %ymm1
-; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0
-; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm6[1],ymm11[2,3],ymm6[4],ymm11[5],ymm6[6],ymm11[7,8],ymm6[9],ymm11[10,11],ymm6[12],ymm11[13],ymm6[14],ymm11[15]
+; AVX512-NEXT: vporq %ymm0, %ymm1, %ymm23
+; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5],ymm9[6],ymm8[7,8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13],ymm9[14],ymm8[15]
; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,8,9,18,19,28,29,22,23,16,17,26,27,128,128,128,128,128,128]
; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm2
-; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10],ymm7[11],ymm8[12,13],ymm7[14],ymm8[15]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0],ymm10[1],ymm11[2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10],ymm10[11],ymm11[12,13],ymm10[14],ymm11[15]
; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm3
; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6],xmm3[7]
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u]
; AVX512-NEXT: vpshufb %ymm1, %ymm3, %ymm3
; AVX512-NEXT: vpor %ymm2, %ymm3, %ymm2
; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5],ymm10[6],ymm9[7,8],ymm10[9],ymm9[10,11],ymm10[12],ymm9[13],ymm10[14],ymm9[15]
-; AVX512-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vmovdqu %ymm9, (%rsp) # 32-byte Spill
+; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5],ymm7[6],ymm6[7,8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13],ymm7[14],ymm6[15]
; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
; AVX512-NEXT: vpshufb %ymm0, %ymm2, %ymm0
; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15]
-; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6],xmm3[7]
+; AVX512-NEXT: vmovdqa 128(%rdi), %xmm15
; AVX512-NEXT: vpshufb %ymm1, %ymm2, %ymm1
-; AVX512-NEXT: vporq %ymm0, %ymm1, %ymm28
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [6,7,0,1,10,11,4,5,14,15,14,15,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0,1],ymm15[2],ymm13[3],ymm15[4],ymm13[5,6],ymm15[7],ymm13[8,9],ymm15[10],ymm13[11],ymm15[12],ymm13[13,14],ymm15[15]
-; AVX512-NEXT: vmovdqa64 %ymm15, %ymm18
-; AVX512-NEXT: vmovdqa64 %ymm13, %ymm24
+; AVX512-NEXT: vporq %ymm0, %ymm1, %ymm21
+; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm13[1],ymm12[2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7,8],ymm13[9],ymm12[10],ymm13[11],ymm12[12,13],ymm13[14],ymm12[15]
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,14,15,12,13,2,3,12,13,6,7,0,1,10,11,20,21,30,31,28,29,18,19,28,29,22,23,16,17,26,27]
+; AVX512-NEXT: vmovdqa64 %xmm16, %xmm8
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm16[0,1,1,3]
+; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
+; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm14[2],xmm1[3],xmm14[3]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4,5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-NEXT: vmovdqa 144(%rdi), %xmm10
+; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm10[0],xmm15[1],xmm10[2,3]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13],ymm6[14],ymm7[15]
+; AVX512-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7]
+; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15]
+; AVX512-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[4,5,14,15,8,9,2,3,12,13,10,11,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
+; AVX512-NEXT: vmovdqa 256(%rdi), %ymm11
+; AVX512-NEXT: vmovdqa 288(%rdi), %ymm9
+; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0],ymm11[1],ymm9[2,3],ymm11[4],ymm9[5],ymm11[6],ymm9[7,8],ymm11[9],ymm9[10,11],ymm11[12],ymm9[13],ymm11[14],ymm9[15]
+; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7]
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm29 & (zmm1 ^ zmm0))
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[0,1,2,3,8,9,2,3,12,13,6,7,0,1,10,11]
+; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7]
+; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1],ymm13[2],ymm12[3],ymm13[4],ymm12[5,6],ymm13[7],ymm12[8,9],ymm13[10],ymm12[11],ymm13[12],ymm12[13,14],ymm13[15]
+; AVX512-NEXT: vmovdqa64 %ymm12, %ymm22
+; AVX512-NEXT: vmovdqa64 %ymm13, %ymm19
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7]
; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,6,7,8,9,4,5,14,15,8,9,2,3,12,13,22,23,22,23,24,25,20,21,30,31,24,25,18,19,28,29]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0],xmm14[1],xmm12[2,3]
-; AVX512-NEXT: vmovdqa64 %xmm12, %xmm16
-; AVX512-NEXT: vmovdqa64 %xmm14, %xmm30
-; AVX512-NEXT: vpshufb %xmm6, %xmm1, %xmm1
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [6,7,0,1,10,11,4,5,14,15,14,15,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0],xmm8[1],xmm14[2,3]
+; AVX512-NEXT: vmovdqa64 %xmm16, %xmm27
+; AVX512-NEXT: vmovdqa64 %xmm14, %xmm20
+; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512-NEXT: vmovdqa %ymm2, %ymm8
; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4,5,6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT: vmovdqa 144(%rdi), %xmm11
-; AVX512-NEXT: vmovdqa 128(%rdi), %xmm7
-; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0,1],xmm7[2],xmm11[3]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm10[0,1],xmm15[2],xmm10[3]
; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13]
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0],ymm10[1],ymm9[2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7,8],ymm10[9],ymm9[10],ymm10[11],ymm9[12,13],ymm10[14],ymm9[15]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm7[1],ymm6[2],ymm7[3],ymm6[4,5],ymm7[6],ymm6[7,8],ymm7[9],ymm6[10],ymm7[11],ymm6[12,13],ymm7[14],ymm6[15]
; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15]
; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3]
-; AVX512-NEXT: vpshufb %ymm6, %ymm1, %ymm1
+; AVX512-NEXT: vpshufb %ymm8, %ymm1, %ymm1
; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15]
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
-; AVX512-NEXT: vmovdqa 256(%rdi), %ymm12
-; AVX512-NEXT: vmovdqa 288(%rdi), %ymm15
-; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0],ymm15[1],ymm12[2,3],ymm15[4],ymm12[5],ymm15[6],ymm12[7,8],ymm15[9],ymm12[10,11],ymm15[12],ymm12[13],ymm15[14],ymm12[15]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm11[0],ymm9[1],ymm11[2,3],ymm9[4],ymm11[5],ymm9[6],ymm11[7,8],ymm9[9],ymm11[10,11],ymm9[12],ymm11[13],ymm9[14],ymm11[15]
+; AVX512-NEXT: vmovdqa64 %ymm11, %ymm28
+; AVX512-NEXT: vmovdqa64 %ymm9, %ymm16
; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7]
; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (mem & (zmm1 ^ zmm0))
@@ -8002,853 +8106,754 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa 464(%rdi), %xmm8
-; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[3,1,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
-; AVX512-NEXT: vmovdqa 448(%rdi), %xmm3
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vmovdqa %xmm11, %xmm6
-; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[3,1,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
-; AVX512-NEXT: vmovdqa %xmm7, %xmm9
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,2,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0],xmm8[1],xmm3[2,3]
-; AVX512-NEXT: vmovdqa64 %xmm3, %xmm22
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7]
-; AVX512-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa64 %xmm1, %xmm23
-; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm10
-; AVX512-NEXT: vmovdqa 576(%rdi), %ymm1
-; AVX512-NEXT: vmovdqa 608(%rdi), %ymm2
-; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm1[2],ymm2[3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8,9],ymm1[10],ymm2[11],ymm1[12],ymm2[13,14],ymm1[15]
-; AVX512-NEXT: vmovdqa64 %ymm2, %ymm20
-; AVX512-NEXT: vmovdqa64 %ymm1, %ymm17
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7]
-; AVX512-NEXT: vmovdqa 512(%rdi), %ymm5
-; AVX512-NEXT: vmovdqa 544(%rdi), %ymm13
-; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0],ymm5[1],ymm13[2,3],ymm5[4],ymm13[5],ymm5[6],ymm13[7,8],ymm5[9],ymm13[10,11],ymm5[12],ymm13[13],ymm5[14],ymm13[15]
+; AVX512-NEXT: vmovdqa 464(%rdi), %xmm12
+; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm12[3,1,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,1,2,1,4,5,6,7]
+; AVX512-NEXT: vmovdqa 448(%rdi), %xmm9
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,0,1,10,11,8,9,10,11,12,13,14,15]
+; AVX512-NEXT: vpshufb %xmm0, %xmm9, %xmm2
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vpshufb %xmm0, %xmm15, %xmm0
+; AVX512-NEXT: vmovdqa %xmm15, %xmm7
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[3,1,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm31 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512-NEXT: vmovdqa 512(%rdi), %ymm6
+; AVX512-NEXT: vmovdqa 544(%rdi), %ymm5
+; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5],ymm6[6],ymm5[7,8],ymm6[9],ymm5[10,11],ymm6[12],ymm5[13],ymm6[14],ymm5[15]
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5,6,7]
-; AVX512-NEXT: vmovdqa64 496(%rdi), %xmm21
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm21[3,1,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[2,1,2,3,4,5,6,7]
-; AVX512-NEXT: vmovdqa 480(%rdi), %xmm7
-; AVX512-NEXT: vpshufd {{.*#+}} xmm14 = xmm7[0,2,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,3,2,3,4,5,6,7]
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,8,9,2,3,12,13,6,7,16,17,26,27,26,27,30,31,24,25,18,19,28,29,22,23]
-; AVX512-NEXT: vpshufb %ymm14, %ymm2, %ymm2
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [12,13,14,15,4,5,14,15,8,9,2,3,12,13,6,7]
-; AVX512-NEXT: vpshufb %xmm2, %xmm11, %xmm11
-; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5,6,7]
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm27 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm27 & (zmm10 ^ zmm19))
-; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm0
-; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0,1],ymm12[2],ymm15[3],ymm12[4],ymm15[5,6],ymm12[7],ymm15[8,9],ymm12[10],ymm15[11],ymm12[12],ymm15[13,14],ymm12[15]
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm10
-; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm10[3,4],xmm0[5,6,7]
-; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa64 %ymm18, %ymm3
-; AVX512-NEXT: vmovdqa64 %ymm24, %ymm1
-; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5],ymm3[6],ymm1[7,8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13],ymm3[14],ymm1[15]
-; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm10
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm2[1,2],ymm10[3],ymm2[4,5,6,7]
-; AVX512-NEXT: vpshufb %ymm14, %ymm2, %ymm2
-; AVX512-NEXT: vpshufd {{.*#+}} xmm10 = xmm16[3,1,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[2,1,2,3,4,5,6,7]
-; AVX512-NEXT: vmovdqa64 %xmm30, %xmm24
-; AVX512-NEXT: vpshufd {{.*#+}} xmm11 = xmm30[0,2,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,3,2,3,4,5,6,7]
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm2[2,3,4,5,6,7]
-; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0],xmm6[1],xmm9[2,3]
-; AVX512-NEXT: vmovdqa64 %xmm9, %xmm25
-; AVX512-NEXT: vmovdqa64 %xmm23, %xmm4
-; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm2 # 64-byte Folded Reload
-; AVX512-NEXT: # zmm2 = zmm2 ^ (zmm27 & (zmm2 ^ mem))
-; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
-; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa64 %xmm22, %xmm4
-; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm8[2],xmm4[3]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9]
-; AVX512-NEXT: vpshufb %xmm0, %xmm2, %xmm2
-; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-NEXT: vpblendw {{.*#+}} ymm10 = ymm5[0],ymm13[1],ymm5[2,3],ymm13[4],ymm5[5],ymm13[6],ymm5[7,8],ymm13[9],ymm5[10,11],ymm13[12],ymm5[13],ymm13[14],ymm5[15]
-; AVX512-NEXT: vmovdqa64 %ymm13, %ymm26
-; AVX512-NEXT: vmovdqa64 %ymm5, %ymm22
-; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm11
-; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4,5,6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm11 = xmm7[0,3,2,3]
-; AVX512-NEXT: vmovdqa64 %xmm7, %xmm30
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,2,2,3,4,5,6,7]
-; AVX512-NEXT: vpsrlq $48, %xmm21, %xmm13
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm11[0],xmm13[0],xmm11[1],xmm13[1]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm13 = [2,3,12,13,0,1,0,1,10,11,4,5,14,15,8,9,18,19,28,29,16,17,16,17,26,27,20,21,30,31,24,25]
-; AVX512-NEXT: vpshufb %ymm13, %ymm10, %ymm10
-; AVX512-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm10[3,4,5,6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
-; AVX512-NEXT: vmovdqa64 %ymm20, %ymm9
-; AVX512-NEXT: vmovdqa64 %ymm17, %ymm7
-; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm7[0],ymm9[1,2],ymm7[3],ymm9[4],ymm7[5],ymm9[6,7],ymm7[8],ymm9[9,10],ymm7[11],ymm9[12],ymm7[13],ymm9[14,15]
-; AVX512-NEXT: vextracti128 $1, %ymm11, %xmm14
-; AVX512-NEXT: vpblendd {{.*#+}} xmm11 = xmm14[0],xmm11[1],xmm14[2],xmm11[3]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,6,7,0,1,10,11,4,5,14,15,8,9]
-; AVX512-NEXT: vpshufb %xmm14, %xmm11, %xmm11
-; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm11[5,6,7]
-; AVX512-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm2 # 64-byte Folded Reload
-; AVX512-NEXT: # zmm2 = zmm2 ^ (zmm27 & (zmm2 ^ mem))
-; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm2, %zmm2
-; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5],ymm1[6],ymm3[7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13],ymm1[14],ymm3[15]
-; AVX512-NEXT: vmovdqa64 %ymm1, %ymm17
-; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm10
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm10[1],ymm2[2],ymm10[3],ymm2[4,5,6,7]
-; AVX512-NEXT: vpshufb %ymm13, %ymm2, %ymm2
-; AVX512-NEXT: vpshufd {{.*#+}} xmm10 = xmm24[0,3,2,3]
-; AVX512-NEXT: vmovdqa64 %xmm24, %xmm20
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,2,2,3,4,5,6,7]
-; AVX512-NEXT: vmovdqa64 %xmm16, %xmm1
-; AVX512-NEXT: vpsrlq $48, %xmm16, %xmm11
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm2[3,4,5,6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0],ymm15[1,2],ymm12[3],ymm15[4],ymm12[5],ymm15[6,7],ymm12[8],ymm15[9,10],ymm12[11],ymm15[12],ymm12[13],ymm15[14,15]
-; AVX512-NEXT: vmovdqa64 %ymm12, %ymm19
-; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm11
-; AVX512-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3]
-; AVX512-NEXT: vpshufb %xmm14, %xmm10, %xmm10
-; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm10[5,6,7]
-; AVX512-NEXT: vmovdqa64 %xmm25, %xmm5
-; AVX512-NEXT: vpblendd {{.*#+}} xmm10 = xmm5[0,1],xmm6[2],xmm5[3]
-; AVX512-NEXT: vmovdqa64 %xmm6, %xmm23
-; AVX512-NEXT: vpshufb %xmm0, %xmm10, %xmm0
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5,6,7]
+; AVX512-NEXT: vmovdqa64 496(%rdi), %xmm18
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm18[3,1,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
+; AVX512-NEXT: vmovdqa64 480(%rdi), %xmm17
+; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm17[0,2,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,3,2,3,4,5,6,7]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1]
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,8,9,2,3,12,13,6,7,16,17,26,27,26,27,30,31,24,25,18,19,28,29,22,23]
+; AVX512-NEXT: vpshufb %ymm8, %ymm0, %ymm0
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0],xmm12[1],xmm9[2,3]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7]
+; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm26
+; AVX512-NEXT: vmovdqa 576(%rdi), %ymm14
+; AVX512-NEXT: vmovdqa 608(%rdi), %ymm13
+; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0,1],ymm14[2],ymm13[3],ymm14[4],ymm13[5,6],ymm14[7],ymm13[8,9],ymm14[10],ymm13[11],ymm14[12],ymm13[13,14],ymm14[15]
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm11
+; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm11[3,4],xmm0[5,6,7]
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm26 = zmm26 ^ (zmm29 & (zmm26 ^ zmm24))
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm11 = [12,13,14,15,4,5,14,15,8,9,2,3,12,13,6,7]
+; AVX512-NEXT: vpshufb %xmm11, %xmm0, %xmm0
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm27 & (zmm0 ^ zmm28))
-; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vextracti64x4 $1, %zmm26, %ymm15
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5,6,7]
+; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm26, %zmm30
+; AVX512-NEXT: vmovdqa64 %ymm19, %ymm4
+; AVX512-NEXT: vmovdqa64 %ymm22, %ymm1
+; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm4[1],ymm1[2,3],ymm4[4],ymm1[5],ymm4[6],ymm1[7,8],ymm4[9],ymm1[10,11],ymm4[12],ymm1[13],ymm4[14],ymm1[15]
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm15
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm0[1,2],ymm15[3],ymm0[4,5,6,7]
+; AVX512-NEXT: vpshufb %ymm8, %ymm0, %ymm0
+; AVX512-NEXT: vmovdqa64 %xmm20, %xmm26
+; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm20[3,1,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm15 = xmm27[0,2,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,3,2,3,4,5,6,7]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3,4,5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm7[0],xmm10[1],xmm7[2,3]
+; AVX512-NEXT: vmovdqa64 %xmm7, %xmm22
+; AVX512-NEXT: vpshufb %xmm2, %xmm8, %xmm7
+; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm0
+; AVX512-NEXT: vmovdqa64 %ymm28, %ymm3
+; AVX512-NEXT: vmovdqa64 %ymm16, %ymm2
+; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm2[0,1],ymm3[2],ymm2[3],ymm3[4],ymm2[5,6],ymm3[7],ymm2[8,9],ymm3[10],ymm2[11],ymm3[12],ymm2[13,14],ymm3[15]
+; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm8
+; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3,4],xmm7[5,6,7]
+; AVX512-NEXT: vpshufb %xmm11, %xmm7, %xmm7
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm23 = zmm0 ^ (zmm29 & (zmm23 ^ zmm0))
+; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm0
+; AVX512-NEXT: vextracti64x4 $1, %zmm23, %ymm7
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm0[5,6,7]
+; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm23, %zmm28
+; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15]
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm7
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2],ymm7[3],ymm0[4,5,6,7]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm17[0,3,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,2,2,3,4,5,6,7]
+; AVX512-NEXT: vpsrlq $48, %xmm18, %xmm8
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [2,3,12,13,0,1,0,1,10,11,4,5,14,15,8,9,18,19,28,29,16,17,16,17,26,27,20,21,30,31,24,25]
+; AVX512-NEXT: vpshufb %ymm8, %ymm0, %ymm0
+; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm0[3,4,5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm7 = xmm9[0,1],xmm12[2],xmm9[3]
+; AVX512-NEXT: vmovdqa64 %xmm12, %xmm16
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9]
+; AVX512-NEXT: vpshufb %xmm11, %xmm7, %xmm7
+; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm0
+; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm14[0],ymm13[1,2],ymm14[3],ymm13[4],ymm14[5],ymm13[6,7],ymm14[8],ymm13[9,10],ymm14[11],ymm13[12],ymm14[13],ymm13[14,15]
+; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm15
+; AVX512-NEXT: vpblendd {{.*#+}} xmm7 = xmm15[0],xmm7[1],xmm15[2],xmm7[3]
+; AVX512-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm0 # 64-byte Folded Reload
+; AVX512-NEXT: # zmm0 = zmm0 ^ (zmm29 & (zmm0 ^ mem))
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,6,7,0,1,10,11,4,5,14,15,8,9]
+; AVX512-NEXT: vpshufb %xmm15, %xmm7, %xmm7
+; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm12
+; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4],ymm7[5,6,7]
+; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm25
+; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm1[1],ymm4[2,3],ymm1[4],ymm4[5],ymm1[6],ymm4[7,8],ymm1[9],ymm4[10,11],ymm1[12],ymm4[13],ymm1[14],ymm4[15]
+; AVX512-NEXT: vmovdqa64 %ymm1, %ymm20
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm7
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2],ymm7[3],ymm0[4,5,6,7]
+; AVX512-NEXT: vpshufb %ymm8, %ymm0, %ymm0
+; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm27[0,3,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,2,2,3,4,5,6,7]
+; AVX512-NEXT: vpsrlq $48, %xmm26, %xmm8
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm0[3,4,5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-NEXT: vmovdqa64 %xmm22, %xmm1
+; AVX512-NEXT: vpblendd {{.*#+}} xmm7 = xmm1[0,1],xmm10[2],xmm1[3]
+; AVX512-NEXT: vpshufb %xmm11, %xmm7, %xmm7
+; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm0
+; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4],ymm3[5],ymm2[6,7],ymm3[8],ymm2[9,10],ymm3[11],ymm2[12],ymm3[13],ymm2[14,15]
+; AVX512-NEXT: vmovdqa64 %ymm2, %ymm22
+; AVX512-NEXT: vmovdqa64 %ymm3, %ymm23
+; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm8
+; AVX512-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2],xmm7[3]
+; AVX512-NEXT: vpshufb %xmm15, %xmm7, %xmm7
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm21 = zmm0 ^ (zmm29 & (zmm21 ^ zmm0))
+; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm0
+; AVX512-NEXT: vextracti64x4 $1, %zmm21, %ymm7
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm0[5,6,7]
+; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm21, %zmm24
+; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15]
+; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm0[2,3,0,1]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6],ymm0[7]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,14,15,8,9,2,3,12,13,10,11,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Reload
-; AVX512-NEXT: vmovdqa64 %ymm18, %ymm0
-; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3],ymm12[4],ymm0[5],ymm12[6],ymm0[7,8],ymm12[9],ymm0[10,11],ymm12[12],ymm0[13],ymm12[14],ymm0[15]
-; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6],ymm0[7]
-; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Reload
-; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload
-; AVX512-NEXT: vmovdqa64 %ymm24, %ymm0
-; AVX512-NEXT: vmovdqa64 %ymm16, %ymm5
-; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3],ymm5[4],ymm0[5,6],ymm5[7],ymm0[8,9],ymm5[10],ymm0[11],ymm5[12],ymm0[13,14],ymm5[15]
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm10
-; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0,1,2],xmm10[3,4],xmm0[5,6,7]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [4,5,14,15,8,9,2,3,12,13,10,11,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
-; AVX512-NEXT: vpshufb %ymm0, %ymm2, %ymm2
-; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2],ymm2[3,4,5,6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm13 = xmm8[0],xmm4[1],xmm8[2,3]
-; AVX512-NEXT: vmovdqa64 %xmm4, %xmm29
-; AVX512-NEXT: vmovdqa64 %xmm8, %xmm28
-; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
-; AVX512-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (zmm27 & (zmm13 ^ zmm2))
-; AVX512-NEXT: vmovdqa64 %ymm26, %ymm8
-; AVX512-NEXT: vmovdqa64 %ymm22, %ymm4
-; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm4[1],ymm8[2],ymm4[3],ymm8[4,5],ymm4[6],ymm8[7,8],ymm4[9],ymm8[10],ymm4[11],ymm8[12,13],ymm4[14],ymm8[15]
-; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm14
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4,5,6,7]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm14 = [4,5,14,15,12,13,2,3,12,13,6,7,0,1,10,11,20,21,30,31,28,29,18,19,28,29,22,23,16,17,26,27]
-; AVX512-NEXT: vpshufb %ymm14, %ymm2, %ymm2
-; AVX512-NEXT: vmovdqa64 %xmm30, %xmm6
-; AVX512-NEXT: vpshufd {{.*#+}} xmm11 = xmm30[0,1,1,3]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,7,6,7]
-; AVX512-NEXT: vmovdqa64 %xmm21, %xmm5
-; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm11[2],xmm21[2],xmm11[3],xmm21[3]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm2[3,4,5,6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm9[0],ymm7[1],ymm9[2,3],ymm7[4],ymm9[5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10,11],ymm7[12],ymm9[13],ymm7[14],ymm9[15]
-; AVX512-NEXT: vmovdqa64 %ymm7, %ymm21
-; AVX512-NEXT: vextracti128 $1, %ymm11, %xmm10
-; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm11[3,4],xmm10[5,6,7]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,8,9,2,3,12,13,6,7,0,1,10,11]
-; AVX512-NEXT: vpshufb %xmm11, %xmm10, %xmm10
-; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm10[5,6,7]
-; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm13, %zmm2
-; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa64 %ymm17, %ymm2
-; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15]
-; AVX512-NEXT: vmovdqa64 %ymm3, %ymm22
-; AVX512-NEXT: vmovdqa64 %ymm17, %ymm26
-; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm10
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm10[1],ymm2[2,3,4,5,6,7]
-; AVX512-NEXT: vpshufb %ymm14, %ymm2, %ymm2
-; AVX512-NEXT: vpshufd {{.*#+}} xmm10 = xmm20[0,1,1,3]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,7,6,7]
-; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm10[2],xmm1[2],xmm10[3],xmm1[3]
-; AVX512-NEXT: vmovdqa64 %xmm1, %xmm17
-; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm2[3,4,5,6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-NEXT: vmovdqa64 %ymm19, %ymm1
-; AVX512-NEXT: vpblendw {{.*#+}} ymm10 = ymm15[0],ymm1[1],ymm15[2,3],ymm1[4],ymm15[5],ymm1[6],ymm15[7,8],ymm1[9],ymm15[10,11],ymm1[12],ymm15[13],ymm1[14],ymm15[15]
-; AVX512-NEXT: vmovdqa64 %ymm15, %ymm31
-; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm13
-; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm13[0,1,2],xmm10[3,4],xmm13[5,6,7]
-; AVX512-NEXT: vpshufb %xmm11, %xmm10, %xmm10
-; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm10[5,6,7]
-; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX512-NEXT: vpblendw $82, (%rsp), %ymm1, %ymm10 # 32-byte Folded Reload
-; AVX512-NEXT: # ymm10 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5],mem[6],ymm1[7,8],mem[9],ymm1[10,11],mem[12],ymm1[13],mem[14],ymm1[15]
-; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6],ymm10[7]
-; AVX512-NEXT: vpshufb %ymm0, %ymm10, %ymm0
-; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX512-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload
-; AVX512-NEXT: # ymm10 = ymm1[0,1],mem[2],ymm1[3],mem[4],ymm1[5,6],mem[7],ymm1[8,9],mem[10],ymm1[11],mem[12],ymm1[13,14],mem[15]
-; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm11
-; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm11[3,4],xmm10[5,6,7]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7]
-; AVX512-NEXT: vmovdqa64 %xmm25, %xmm1
-; AVX512-NEXT: vmovdqa64 %xmm23, %xmm3
-; AVX512-NEXT: vpblendd {{.*#+}} xmm10 = xmm3[0],xmm1[1],xmm3[2,3]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
-; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm27 & (zmm10 ^ zmm0))
-; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm10, %zmm23
-; AVX512-NEXT: vmovdqa64 %ymm18, %ymm13
-; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm13[1],ymm12[2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7,8],ymm13[9],ymm12[10],ymm13[11],ymm12[12,13],ymm13[14],ymm12[15]
-; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm0[2,3,0,1]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4],ymm0[5],ymm10[6],ymm0[7]
-; AVX512-NEXT: vmovdqa64 %ymm16, %ymm14
-; AVX512-NEXT: vmovdqa64 %ymm24, %ymm15
-; AVX512-NEXT: vpblendw {{.*#+}} ymm10 = ymm14[0],ymm15[1,2],ymm14[3],ymm15[4],ymm14[5],ymm15[6,7],ymm14[8],ymm15[9,10],ymm14[11],ymm15[12],ymm14[13],ymm15[14,15]
-; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm11
-; AVX512-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15]
+; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm15[0,1],ymm12[2],ymm15[3],ymm12[4],ymm15[5,6],ymm12[7],ymm15[8,9],ymm12[10],ymm15[11],ymm12[12],ymm15[13,14],ymm12[15]
+; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm8
+; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3,4],xmm7[5,6,7]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3,4,5,6,7]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7,8],ymm6[9],ymm5[10],ymm6[11],ymm5[12,13],ymm6[14],ymm5[15]
+; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm8
+; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3,4,5,6,7]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[4,5,14,15,12,13,2,3,12,13,6,7,0,1,10,11,20,21,30,31,28,29,18,19,28,29,22,23,16,17,26,27]
+; AVX512-NEXT: vmovdqa64 %xmm17, %xmm4
+; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm17[0,1,1,3]
+; AVX512-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,7]
+; AVX512-NEXT: vmovdqa64 %xmm18, %xmm3
+; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm8[2],xmm18[2],xmm8[3],xmm18[3]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm7[3,4,5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
+; AVX512-NEXT: vmovdqa64 %xmm16, %xmm2
+; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm2[0],xmm9[1],xmm2[2,3]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
+; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm29 & (zmm7 ^ zmm0))
+; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5],ymm14[6],ymm13[7,8],ymm14[9],ymm13[10,11],ymm14[12],ymm13[13],ymm14[14],ymm13[15]
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm8
+; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1,2],xmm0[3,4],xmm8[5,6,7]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,2,3,12,13,6,7,0,1,10,11]
+; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512-NEXT: vextracti64x4 $1, %zmm7, %ymm8
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm0[5,6,7]
+; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm21
+; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7,8],ymm11[9],ymm10[10],ymm11[11],ymm10[12,13],ymm11[14],ymm10[15]
+; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm0[2,3,0,1]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4],ymm0[5],ymm7[6],ymm0[7]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm12[0],ymm15[1,2],ymm12[3],ymm15[4],ymm12[5],ymm15[6,7],ymm12[8],ymm15[9,10],ymm12[11],ymm15[12],ymm12[13],ymm15[14,15]
+; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm8
+; AVX512-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2],xmm7[3]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15]
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,0,1,10,11,4,5,14,15,14,15,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7]
-; AVX512-NEXT: vmovdqa %ymm4, %ymm3
-; AVX512-NEXT: vmovdqa %ymm8, %ymm4
-; AVX512-NEXT: vpblendw {{.*#+}} ymm10 = ymm8[0,1],ymm3[2],ymm8[3],ymm3[4],ymm8[5,6],ymm3[7],ymm8[8,9],ymm3[10],ymm8[11],ymm3[12],ymm8[13,14],ymm3[15]
-; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm11
-; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4,5,6,7]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[6,7,6,7,8,9,4,5,14,15,8,9,2,3,12,13,22,23,22,23,24,25,20,21,30,31,24,25,18,19,28,29]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm11 = xmm5[0],xmm6[1],xmm5[2,3]
-; AVX512-NEXT: vmovdqa64 %xmm30, %xmm7
-; AVX512-NEXT: vmovdqa %xmm5, %xmm6
-; AVX512-NEXT: vpshufb %xmm1, %xmm11, %xmm11
-; AVX512-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm10[3,4,5,6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
-; AVX512-NEXT: vmovdqa64 %xmm29, %xmm1
-; AVX512-NEXT: vmovdqa64 %xmm28, %xmm2
-; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm2[0,1],xmm1[2],xmm2[3]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3,4,5,6,7]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0,1],ymm6[2],ymm5[3],ymm6[4],ymm5[5,6],ymm6[7],ymm5[8,9],ymm6[10],ymm5[11],ymm6[12],ymm5[13,14],ymm6[15]
+; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm8
+; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm8[2],ymm7[3,4,5,6,7]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[6,7,6,7,8,9,4,5,14,15,8,9,2,3,12,13,22,23,22,23,24,25,20,21,30,31,24,25,18,19,28,29]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm3[0],xmm4[1],xmm3[2,3]
+; AVX512-NEXT: vpshufb %xmm1, %xmm8, %xmm8
+; AVX512-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm7[3,4,5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm2[0,1],xmm9[2],xmm2[3]
; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13]
; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm0))
-; AVX512-NEXT: vmovdqa %ymm9, %ymm2
-; AVX512-NEXT: vmovdqa64 %ymm21, %ymm10
-; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13],ymm9[14],ymm10[15]
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm9
-; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm9[1,2,3],xmm0[4,5],xmm9[6,7]
+; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm0))
+; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5],ymm13[6],ymm14[7,8],ymm13[9],ymm14[10,11],ymm13[12],ymm14[13],ymm13[14],ymm14[15]
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm8
+; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm8[1,2,3],xmm0[4,5],xmm8[6,7]
; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,4,5,14,15,8,9,2,3,12,13]
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT: vextracti64x4 $1, %zmm8, %ymm9
-; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1,2,3,4,5,6,7],ymm9[8],ymm0[9,10,11,12,13,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm25
-; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1],ymm13[2],ymm12[3],ymm13[4],ymm12[5,6],ymm13[7],ymm12[8,9],ymm13[10],ymm12[11],ymm13[12],ymm12[13,14],ymm13[15]
-; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm0[2,3,0,1]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4],ymm0[5,6],ymm8[7]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5],ymm14[6],ymm15[7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13],ymm14[14],ymm15[15]
-; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm9
-; AVX512-NEXT: vpblendw {{.*#+}} xmm11 = xmm9[0,1,2],xmm8[3,4],xmm9[5,6,7]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [8,9,2,3,12,13,6,7,4,5,6,7,4,5,14,15,24,25,18,19,28,29,22,23,20,21,22,23,20,21,30,31]
-; AVX512-NEXT: vpshufb %ymm9, %ymm0, %ymm0
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm8 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
-; AVX512-NEXT: vpshufb %xmm8, %xmm11, %xmm11
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0],ymm4[1,2],ymm3[3],ymm4[4],ymm3[5],ymm4[6,7],ymm3[8],ymm4[9,10],ymm3[11],ymm4[12],ymm3[13],ymm4[14,15]
-; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm5
-; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3,4,5,6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm6[0,1],xmm7[2],xmm6[3]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [8,9,14,15,4,5,6,7,0,1,10,11,4,5,14,15,24,25,30,31,20,21,22,23,16,17,26,27,20,21,30,31]
-; AVX512-NEXT: vpshufb %ymm3, %ymm4, %ymm4
-; AVX512-NEXT: vpshufb %xmm9, %xmm1, %xmm1
-; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3,4,5,6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
-; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload
-; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm1
+; AVX512-NEXT: vextracti64x4 $1, %zmm7, %ymm8
+; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1,2,3,4,5,6,7],ymm8[8],ymm0[9,10,11,12,13,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm8
+; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1],ymm11[2],ymm10[3],ymm11[4],ymm10[5,6],ymm11[7],ymm10[8,9],ymm11[10],ymm10[11],ymm11[12],ymm10[13,14],ymm11[15]
+; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm0[2,3,0,1]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4],ymm0[5,6],ymm7[7]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm15[0],ymm12[1],ymm15[2,3],ymm12[4],ymm15[5],ymm12[6],ymm15[7,8],ymm12[9],ymm15[10,11],ymm12[12],ymm15[13],ymm12[14],ymm15[15]
+; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm9
+; AVX512-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm7[3,4],xmm9[5,6,7]
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [8,9,2,3,12,13,6,7,4,5,6,7,4,5,14,15,24,25,18,19,28,29,22,23,20,21,22,23,20,21,30,31]
+; AVX512-NEXT: vpshufb %ymm7, %ymm0, %ymm0
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm10 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
+; AVX512-NEXT: vpshufb %xmm10, %xmm9, %xmm9
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3],ymm5[4],ymm6[5],ymm5[6,7],ymm6[8],ymm5[9,10],ymm6[11],ymm5[12],ymm6[13],ymm5[14,15]
+; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3,4,5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm4[2],xmm3[3]
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,14,15,4,5,6,7,0,1,10,11,4,5,14,15,24,25,30,31,20,21,22,23,16,17,26,27,20,21,30,31]
+; AVX512-NEXT: vpshufb %ymm4, %ymm5, %ymm5
+; AVX512-NEXT: vpshufb %xmm7, %xmm2, %xmm2
+; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm5[3,4,5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
+; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload
+; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2
; AVX512-NEXT: movb $7, %al
; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k1}
-; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm10[1],ymm2[2],ymm10[3],ymm2[4,5],ymm10[6],ymm2[7,8],ymm10[9],ymm2[10],ymm10[11],ymm2[12,13],ymm10[14],ymm2[15]
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm5
-; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm5[2,3],xmm0[4,5,6],xmm5[7]
+; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k1}
+; AVX512-NEXT: vextracti64x4 $1, %zmm2, %ymm5
+; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4,5],ymm14[6],ymm13[7,8],ymm14[9],ymm13[10],ymm14[11],ymm13[12,13],ymm14[14],ymm13[15]
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7]
; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,2,3,12,13,6,7,0,1,10,11,4,5,14,15]
-; AVX512-NEXT: vpshufb %xmm0, %xmm5, %xmm5
-; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0],ymm5[1,2,3,4,5,6,7],ymm4[8],ymm5[9,10,11,12,13,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
-; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1
-; AVX512-NEXT: vmovdqa64 %ymm22, %ymm2
-; AVX512-NEXT: vmovdqa64 %ymm26, %ymm4
-; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4],ymm2[5],ymm4[6,7],ymm2[8],ymm4[9,10],ymm2[11],ymm4[12],ymm2[13],ymm4[14,15]
-; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm5
-; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3,4,5,6,7]
-; AVX512-NEXT: vpshufb %ymm3, %ymm4, %ymm3
-; AVX512-NEXT: vmovdqa64 %xmm20, %xmm2
-; AVX512-NEXT: vmovdqa64 %xmm17, %xmm4
-; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm2[2],xmm4[3]
-; AVX512-NEXT: vpshufb %xmm9, %xmm4, %xmm4
-; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm3[3,4,5,6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
-; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload
-; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3
-; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512-NEXT: vpblendw $107, (%rsp), %ymm2, %ymm4 # 32-byte Folded Reload
-; AVX512-NEXT: # ymm4 = mem[0,1],ymm2[2],mem[3],ymm2[4],mem[5,6],ymm2[7],mem[8,9],ymm2[10],mem[11],ymm2[12],mem[13,14],ymm2[15]
-; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6],ymm5[7]
-; AVX512-NEXT: vpshufb %ymm9, %ymm4, %ymm4
-; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload
-; AVX512-NEXT: # ymm5 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5],mem[6],ymm2[7,8],mem[9],ymm2[10,11],mem[12],ymm2[13],mem[14],ymm2[15]
-; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm6
-; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3,4],xmm6[5,6,7]
-; AVX512-NEXT: vpshufb %xmm8, %xmm5, %xmm5
-; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7]
-; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm3 {%k1}
-; AVX512-NEXT: vmovdqa64 %ymm19, %ymm2
-; AVX512-NEXT: vmovdqa64 %ymm31, %ymm4
-; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7,8],ymm2[9],ymm4[10],ymm2[11],ymm4[12,13],ymm2[14],ymm4[15]
+; AVX512-NEXT: vpshufb %xmm0, %xmm1, %xmm1
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1,2,3,4,5,6,7],ymm5[8],ymm1[9,10,11,12,13,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
+; AVX512-NEXT: vmovdqa64 %ymm20, %ymm2
+; AVX512-NEXT: vmovdqa64 %ymm19, %ymm3
+; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4],ymm3[5],ymm2[6,7],ymm3[8],ymm2[9,10],ymm3[11],ymm2[12],ymm3[13],ymm2[14,15]
+; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3,4,5,6,7]
+; AVX512-NEXT: vpshufb %ymm4, %ymm2, %ymm2
+; AVX512-NEXT: vmovdqa64 %xmm27, %xmm3
+; AVX512-NEXT: vmovdqa64 %xmm26, %xmm4
+; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3]
+; AVX512-NEXT: vpshufb %xmm7, %xmm3, %xmm3
+; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3,4,5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-NEXT: vinserti32x4 $1, %xmm31, %ymm0, %ymm3
+; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX512-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
+; AVX512-NEXT: # ymm3 = mem[0,1],ymm3[2],mem[3],ymm3[4],mem[5,6],ymm3[7],mem[8,9],ymm3[10],mem[11],ymm3[12],mem[13,14],ymm3[15]
+; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6],ymm4[7]
+; AVX512-NEXT: vpshufb %ymm7, %ymm3, %ymm3
+; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX512-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
+; AVX512-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3],mem[4],ymm4[5],mem[6],ymm4[7,8],mem[9],ymm4[10,11],mem[12],ymm4[13],mem[14],ymm4[15]
; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm5
-; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5,6],xmm5[7]
-; AVX512-NEXT: vpshufb %xmm0, %xmm4, %xmm0
-; AVX512-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7]
+; AVX512-NEXT: vpshufb %xmm10, %xmm4, %xmm4
+; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7]
+; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm2 {%k1}
+; AVX512-NEXT: vmovdqa64 %ymm22, %ymm3
+; AVX512-NEXT: vmovdqa64 %ymm23, %ymm4
+; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15]
+; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6],xmm4[7]
+; AVX512-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX512-NEXT: vextracti64x4 $1, %zmm2, %ymm3
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4,5,6,7],ymm4[8],ymm0[9,10,11,12,13,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0
-; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-NEXT: vmovaps %zmm2, (%rsi)
-; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-NEXT: vmovaps %zmm2, 64(%rsi)
-; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-NEXT: vmovaps %zmm2, 64(%rdx)
-; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-NEXT: vmovaps %zmm2, (%rdx)
+; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512-NEXT: vmovdqa64 %zmm28, (%rsi)
+; AVX512-NEXT: vmovdqa64 %zmm30, 64(%rsi)
+; AVX512-NEXT: vmovdqa64 %zmm25, 64(%rdx)
+; AVX512-NEXT: vmovdqa64 %zmm24, (%rdx)
+; AVX512-NEXT: vmovdqa64 %zmm21, 64(%rcx)
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-NEXT: vmovaps %zmm2, 64(%rcx)
-; AVX512-NEXT: vmovdqa64 %zmm23, (%rcx)
-; AVX512-NEXT: vmovdqa64 %zmm25, 64(%r8)
+; AVX512-NEXT: vmovaps %zmm2, (%rcx)
+; AVX512-NEXT: vmovdqa64 %zmm8, 64(%r8)
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512-NEXT: vmovaps %zmm2, (%r8)
; AVX512-NEXT: vmovdqa64 %zmm1, 64(%r9)
; AVX512-NEXT: vmovdqa64 %zmm0, (%r9)
-; AVX512-NEXT: addq $552, %rsp # imm = 0x228
+; AVX512-NEXT: addq $344, %rsp # imm = 0x158
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: load_i16_stride5_vf64:
; AVX512-FCP: # %bb.0:
-; AVX512-FCP-NEXT: subq $552, %rsp # imm = 0x228
+; AVX512-FCP-NEXT: subq $648, %rsp # imm = 0x288
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vmovdqa 496(%rdi), %xmm1
-; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm1
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vmovdqa 480(%rdi), %xmm2
-; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; AVX512-FCP-NEXT: vmovdqa 512(%rdi), %ymm10
-; AVX512-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vmovdqa 544(%rdi), %ymm11
-; AVX512-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vmovdqa 576(%rdi), %ymm7
-; AVX512-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vmovdqa 608(%rdi), %ymm8
-; AVX512-FCP-NEXT: vmovdqu %ymm8, (%rsp) # 32-byte Spill
-; AVX512-FCP-NEXT: vmovdqa 352(%rdi), %ymm4
-; AVX512-FCP-NEXT: vmovdqa 320(%rdi), %ymm5
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm28
-; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm30
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3],xmm1[4,5],xmm4[6,7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm4
-; AVX512-FCP-NEXT: vmovdqa 384(%rdi), %ymm6
-; AVX512-FCP-NEXT: vmovdqa 416(%rdi), %ymm9
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm9[1,2],ymm6[3],ymm9[4],ymm6[5],ymm9[6,7],ymm6[8],ymm9[9,10],ymm6[11],ymm9[12],ymm6[13],ymm9[14,15]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm9, %ymm26
-; AVX512-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm27
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,3,0,2,4,6,1,3]
-; AVX512-FCP-NEXT: vpermd %ymm5, %ymm6, %ymm5
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,20,21,128,128,128,128,128,128]
-; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm5, %ymm5
-; AVX512-FCP-NEXT: vpor %ymm5, %ymm4, %ymm4
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0,1],ymm7[2],ymm8[3],ymm7[4],ymm8[5,6],ymm7[7],ymm8[8,9],ymm7[10],ymm8[11],ymm7[12],ymm8[13,14],ymm7[15]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm7
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3,4],xmm5[5,6,7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [2,4,7,1,4,6,0,0]
-; AVX512-FCP-NEXT: vpermd %ymm7, %ymm10, %ymm7
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,8,9,14,15,0,1,6,7,16,17,22,23,20,21,22,23,24,25,30,31,16,17,22,23]
-; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm7, %ymm7
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [8,9,3,2,4,5,7,6]
-; AVX512-FCP-NEXT: vpermt2d %ymm2, %ymm17, %ymm7
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [12,13,14,15,4,5,14,15,8,9,2,3,12,13,6,7]
-; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm5
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6,7]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [0,3,1,3,0,3,5,7]
-; AVX512-FCP-NEXT: vmovdqa 448(%rdi), %ymm7
-; AVX512-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpermd %ymm7, %ymm18, %ymm7
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27]
-; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm7, %ymm7
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm16 & (zmm7 ^ zmm4))
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm4
-; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa 176(%rdi), %xmm4
-; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm0
-; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm31
-; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %xmm15
-; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm15, %xmm3
-; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
-; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm11
-; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm7
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm11[0],ymm7[1,2],ymm11[3],ymm7[4],ymm11[5],ymm7[6,7],ymm11[8],ymm7[9,10],ymm11[11],ymm7[12],ymm11[13],ymm7[14,15]
-; AVX512-FCP-NEXT: vpermd %ymm3, %ymm6, %ymm3
-; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm3, %ymm3
-; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm12
-; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm6
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm12[0],ymm6[1],ymm12[2,3],ymm6[4],ymm12[5],ymm6[6],ymm12[7,8],ymm6[9],ymm12[10,11],ymm6[12],ymm12[13],ymm6[14],ymm12[15]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4,5],xmm5[6,7]
-; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm1
-; AVX512-FCP-NEXT: vpor %ymm3, %ymm1, %ymm1
-; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm8
-; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %ymm13
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm13[0],ymm8[1],ymm13[2,3],ymm8[4],ymm13[5],ymm8[6],ymm13[7,8],ymm8[9],ymm13[10,11],ymm8[12],ymm13[13],ymm8[14],ymm13[15]
-; AVX512-FCP-NEXT: vpermd %ymm3, %ymm10, %ymm3
-; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm3
-; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %ymm5
-; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %ymm9
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm9[0,1],ymm5[2],ymm9[3],ymm5[4],ymm9[5,6],ymm5[7],ymm9[8,9],ymm5[10],ymm9[11],ymm5[12],ymm9[13,14],ymm5[15]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm25
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm10
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm10[3,4],xmm4[5,6,7]
-; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm2
-; AVX512-FCP-NEXT: vpermt2d %ymm0, %ymm17, %ymm3
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %ymm23
-; AVX512-FCP-NEXT: vpermd %ymm23, %ymm18, %ymm2
-; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm16 & (zmm2 ^ zmm1))
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
-; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4,5],ymm12[6],ymm6[7,8],ymm12[9],ymm6[10],ymm12[11],ymm6[12,13],ymm12[14],ymm6[15]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm12, %ymm21
-; AVX512-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm29
-; AVX512-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [2,0,0,0,4,7,1,6]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm11[1],ymm7[2,3],ymm11[4],ymm7[5],ymm11[6],ymm7[7,8],ymm11[9],ymm7[10,11],ymm11[12],ymm7[13],ymm11[14],ymm7[15]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm11, %ymm19
-; AVX512-FCP-NEXT: vmovdqa64 %ymm7, %ymm22
-; AVX512-FCP-NEXT: vpermd %ymm1, %ymm17, %ymm1
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,20,21,26,27,16,17,30,31,128,128,128,128,128,128]
+; AVX512-FCP-NEXT: vmovdqa 496(%rdi), %xmm2
+; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm1
+; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm28
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vmovdqa 480(%rdi), %xmm4
+; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm3
+; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm29
+; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; AVX512-FCP-NEXT: vmovdqa 512(%rdi), %ymm4
+; AVX512-FCP-NEXT: vmovdqa 544(%rdi), %ymm5
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm17
+; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm27
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [2,4,7,1,4,6,0,0]
+; AVX512-FCP-NEXT: vpermd %ymm3, %ymm20, %ymm3
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,8,9,14,15,0,1,6,7,16,17,22,23,20,21,22,23,24,25,30,31,16,17,22,23]
+; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [8,9,3,2,4,5,7,6]
+; AVX512-FCP-NEXT: vpermt2d %ymm1, %ymm14, %ymm3
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,3,1,3,0,3,5,7]
+; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %ymm21
+; AVX512-FCP-NEXT: vpermd %ymm21, %ymm15, %ymm1
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27]
; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm1
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vpor %ymm1, %ymm0, %ymm10
-; AVX512-FCP-NEXT: vpsrlq $48, %xmm31, %xmm0
-; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm15, %xmm1
-; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [0,2,5,7,4,7,0,0]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm13[1],ymm8[2,3],ymm13[4],ymm8[5],ymm13[6],ymm8[7,8],ymm13[9],ymm8[10,11],ymm13[12],ymm8[13],ymm13[14],ymm8[15]
-; AVX512-FCP-NEXT: vmovdqa %ymm8, %ymm5
-; AVX512-FCP-NEXT: vpermd %ymm0, %ymm20, %ymm4
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,4,5,4,5,0,1,6,7,8,9,14,15,4,5,18,19,20,21,20,21,16,17,22,23,24,25,30,31,20,21]
-; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm4, %ymm4
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3,4,5,6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm4[4,5,6,7]
-; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm24 = [1,4,6,3,1,4,6,3]
-; AVX512-FCP-NEXT: # ymm24 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
+; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa 352(%rdi), %ymm10
+; AVX512-FCP-NEXT: vmovdqa 320(%rdi), %ymm9
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5],ymm10[6],ymm9[7,8],ymm10[9],ymm9[10,11],ymm10[12],ymm9[13],ymm10[14],ymm9[15]
; AVX512-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm12
-; AVX512-FCP-NEXT: vmovdqu64 %ymm25, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0],ymm9[1,2],ymm12[3],ymm9[4],ymm12[5],ymm9[6,7],ymm12[8],ymm9[9,10],ymm12[11],ymm9[12],ymm12[13],ymm9[14,15]
-; AVX512-FCP-NEXT: vpermd %ymm1, %ymm24, %ymm0
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,2,3,4,5,10,11,0,1,14,15,8,9,16,17,18,19,18,19,20,21,26,27,16,17,30,31,24,25]
-; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm25 = [1,3,2,3,1,3,6,7]
-; AVX512-FCP-NEXT: vpermd %ymm23, %ymm25, %ymm4
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25]
-; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm4, %ymm4
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm16 & (zmm4 ^ zmm10))
-; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm10
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm0
+; AVX512-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4,5],xmm3[6,7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1
+; AVX512-FCP-NEXT: vmovdqa 384(%rdi), %ymm7
+; AVX512-FCP-NEXT: vmovdqa 416(%rdi), %ymm8
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm8[1,2],ymm7[3],ymm8[4],ymm7[5],ymm8[6,7],ymm7[8],ymm8[9,10],ymm7[11],ymm8[12],ymm7[13],ymm8[14,15]
+; AVX512-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [1,3,0,2,4,6,1,3]
+; AVX512-FCP-NEXT: vpermd %ymm5, %ymm16, %ymm11
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,20,21,128,128,128,128,128,128]
+; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm11, %ymm11
+; AVX512-FCP-NEXT: vpor %ymm1, %ymm11, %ymm1
+; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa 176(%rdi), %xmm1
+; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm18
+; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %xmm12
+; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm12, %xmm2
+; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm13
+; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %ymm11
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm11[0],ymm13[1],ymm11[2,3],ymm13[4],ymm11[5],ymm13[6],ymm11[7,8],ymm13[9],ymm11[10,11],ymm13[12],ymm11[13],ymm13[14],ymm11[15]
+; AVX512-FCP-NEXT: vpermd %ymm2, %ymm20, %ymm2
+; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2
+; AVX512-FCP-NEXT: vpermt2d %ymm0, %ymm14, %ymm2
+; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %ymm20
+; AVX512-FCP-NEXT: vpermd %ymm20, %ymm15, %ymm0
+; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm30
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm18
-; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm28
-; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm0
-; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm4
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4,5],ymm0[6],ymm4[7,8],ymm0[9],ymm4[10],ymm0[11],ymm4[12,13],ymm0[14],ymm4[15]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5,6],xmm4[7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm7
-; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm4
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm7[1],ymm4[2,3],ymm7[4],ymm4[5],ymm7[6],ymm4[7,8],ymm7[9],ymm4[10,11],ymm7[12],ymm4[13],ymm7[14],ymm4[15]
-; AVX512-FCP-NEXT: vpermd %ymm4, %ymm17, %ymm4
-; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm4
-; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vpor %ymm4, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm4
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0],ymm4[1],ymm11[2,3],ymm4[4],ymm11[5],ymm4[6],ymm11[7,8],ymm4[9],ymm11[10,11],ymm4[12],ymm11[13],ymm4[14],ymm11[15]
-; AVX512-FCP-NEXT: vpermd %ymm4, %ymm20, %ymm4
-; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm4, %ymm3
-; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm6, %xmm2
-; AVX512-FCP-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm27 # 16-byte Reload
-; AVX512-FCP-NEXT: vpsrlq $48, %xmm27, %xmm4
-; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4,5,6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqu (%rsp), %ymm8 # 32-byte Reload
-; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm8[1,2],ymm4[3],ymm8[4],ymm4[5],ymm8[6,7],ymm4[8],ymm8[9,10],ymm4[11],ymm8[12],ymm4[13],ymm8[14,15]
-; AVX512-FCP-NEXT: vpermd %ymm3, %ymm24, %ymm3
-; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm1
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Reload
-; AVX512-FCP-NEXT: vpermd %ymm17, %ymm25, %ymm2
-; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm10 & (zmm2 ^ zmm0))
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0
+; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm6
+; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm14
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm14[1,2],ymm6[3],ymm14[4],ymm6[5],ymm14[6,7],ymm6[8],ymm14[9,10],ymm6[11],ymm14[12],ymm6[13],ymm14[14,15]
+; AVX512-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpermd %ymm0, %ymm16, %ymm0
+; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm5
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm5[1],ymm1[2,3],ymm5[4],ymm1[5],ymm5[6],ymm1[7,8],ymm5[9],ymm1[10,11],ymm5[12],ymm1[13],ymm5[14],ymm1[15]
+; AVX512-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4,5],xmm4[6,7]
+; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX512-FCP-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm12[1],ymm9[2,3],ymm12[4],ymm9[5],ymm12[6],ymm9[7,8],ymm12[9],ymm9[10,11],ymm12[12],ymm9[13],ymm12[14],ymm9[15]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm1[1],ymm5[2],ymm1[3],ymm5[4,5],ymm1[6],ymm5[7,8],ymm1[9],ymm5[10],ymm1[11],ymm5[12,13],ymm1[14],ymm5[15]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6],xmm2[7]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [2,0,0,0,4,7,1,6]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0],ymm6[1],ymm14[2,3],ymm6[4],ymm14[5],ymm6[6],ymm14[7,8],ymm6[9],ymm14[10,11],ymm6[12],ymm14[13],ymm6[14],ymm14[15]
+; AVX512-FCP-NEXT: vpermd %ymm0, %ymm16, %ymm0
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,20,21,26,27,16,17,30,31,128,128,128,128,128,128]
+; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm4
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX512-FCP-NEXT: vporq %ymm4, %ymm2, %ymm22
+; AVX512-FCP-NEXT: vmovdqa64 %xmm18, %xmm6
+; AVX512-FCP-NEXT: vpsrlq $48, %xmm18, %xmm2
+; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm12, %xmm4
+; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm25 = [0,2,5,7,4,7,0,0]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm13[0],ymm11[1],ymm13[2,3],ymm11[4],ymm13[5],ymm11[6],ymm13[7,8],ymm11[9],ymm13[10,11],ymm11[12],ymm13[13],ymm11[14],ymm13[15]
+; AVX512-FCP-NEXT: vpermd %ymm4, %ymm25, %ymm0
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,4,5,4,5,0,1,6,7,8,9,14,15,4,5,18,19,20,21,20,21,16,17,22,23,24,25,30,31,20,21]
+; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4,5,6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm26 = [1,3,2,3,1,3,6,7]
+; AVX512-FCP-NEXT: vpermd %ymm20, %ymm26, %ymm0
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25]
+; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm24
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm9[1],ymm10[2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10],ymm9[11],ymm10[12,13],ymm9[14],ymm10[15]
; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm15[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm15, %xmm20
-; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm31[2],xmm0[3],xmm31[3]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm24 = [0,3,5,2,5,7,0,0]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm13[0],ymm5[1],ymm13[2],ymm5[3],ymm13[4,5],ymm5[6],ymm13[7,8],ymm5[9],ymm13[10],ymm5[11],ymm13[12,13],ymm5[14],ymm13[15]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm30
-; AVX512-FCP-NEXT: vmovdqa64 %ymm13, %ymm26
-; AVX512-FCP-NEXT: vpermd %ymm2, %ymm24, %ymm3
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,6,7,2,3,2,3,4,5,10,11,0,1,14,15,16,17,22,23,18,19,18,19,20,21,26,27,16,17,30,31]
-; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm3, %ymm3
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3,4,5,6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm3[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,8,9,2,3,12,13,6,7,0,1,10,11]
-; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2,3,4],ymm1[5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm0
-; AVX512-FCP-NEXT: vmovdqa64 %ymm29, %ymm1
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0,1,2],xmm3[3,4],xmm1[5,6,7]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm25 = [0,2,0,0,5,7,2,4]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm0
-; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm1
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm29
-; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm21
-; AVX512-FCP-NEXT: vpermd %ymm1, %ymm25, %ymm3
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,6,7,8,9,14,15,4,5,6,7,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23]
-; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm0
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm5
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10,11],ymm7[12],ymm8[13],ymm7[14],ymm8[15]
+; AVX512-FCP-NEXT: vpermd %ymm1, %ymm16, %ymm1
+; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm1
+; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vporq %ymm1, %ymm0, %ymm18
+; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm8
+; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm14
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0],ymm8[1],ymm14[2,3],ymm8[4],ymm14[5],ymm8[6],ymm14[7,8],ymm8[9],ymm14[10,11],ymm8[12],ymm14[13],ymm8[14],ymm14[15]
+; AVX512-FCP-NEXT: vpermd %ymm0, %ymm25, %ymm0
+; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vmovdqa64 %xmm29, %xmm10
+; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm10, %xmm1
+; AVX512-FCP-NEXT: vmovdqa64 %xmm28, %xmm9
+; AVX512-FCP-NEXT: vpsrlq $48, %xmm28, %xmm3
+; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4,5,6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT: vpermd %ymm21, %ymm26, %ymm1
+; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm1, %ymm1
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm25
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm0 = [4,5,14,15,4,5,14,15,4,5,14,15,4,5,14,15]
+; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm1
+; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3]
+; AVX512-FCP-NEXT: vmovdqa %xmm6, %xmm7
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [0,3,5,2,5,7,0,0]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm11[0],ymm13[1],ymm11[2],ymm13[3],ymm11[4,5],ymm13[6],ymm11[7,8],ymm13[9],ymm11[10],ymm13[11],ymm11[12,13],ymm13[14],ymm11[15]
+; AVX512-FCP-NEXT: vpermd %ymm3, %ymm16, %ymm5
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,2,3,2,3,4,5,10,11,0,1,14,15,16,17,22,23,18,19,18,19,20,21,26,27,16,17,30,31]
+; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm5, %ymm5
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[3,4,5,6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,4,6,0,1,4,6,0]
; AVX512-FCP-NEXT: # ymm5 = mem[0,1,0,1]
-; AVX512-FCP-NEXT: vpermd %ymm23, %ymm5, %ymm14
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27]
-; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm14, %ymm14
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm15 & (zmm14 ^ zmm0))
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm14, %zmm0
+; AVX512-FCP-NEXT: vpermd %ymm20, %ymm5, %ymm2
+; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm4
+; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm6
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm14[1],ymm8[2],ymm14[3],ymm8[4,5],ymm14[6],ymm8[7,8],ymm14[9],ymm8[10],ymm14[11],ymm8[12,13],ymm14[14],ymm8[15]
+; AVX512-FCP-NEXT: vmovdqu64 %ymm27, (%rsp) # 32-byte Spill
+; AVX512-FCP-NEXT: vmovdqu64 %ymm17, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpermd %ymm1, %ymm16, %ymm1
+; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1
+; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm10, %xmm0
+; AVX512-FCP-NEXT: vmovdqa64 %xmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm28[2],xmm0[3],xmm28[3]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5,6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT: vpermd %ymm21, %ymm5, %ymm1
+; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm4[1],ymm8[2,3],ymm4[4],ymm8[5],ymm4[6],ymm8[7,8],ymm4[9],ymm8[10,11],ymm4[12],ymm8[13],ymm4[14],ymm8[15]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm10
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3,4],xmm10[5,6,7]
-; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm8
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm11[1],ymm8[2],ymm11[3],ymm8[4,5],ymm11[6],ymm8[7,8],ymm11[9],ymm8[10],ymm11[11],ymm8[12,13],ymm11[14],ymm8[15]
-; AVX512-FCP-NEXT: vmovdqa %ymm11, %ymm13
-; AVX512-FCP-NEXT: vpermd %ymm2, %ymm24, %ymm2
-; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm2
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u]
-; AVX512-FCP-NEXT: vmovdqa %xmm6, %xmm14
-; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm4[2],xmm27[2],xmm4[3],xmm27[3]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm6
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm2[3,4,5,6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm11
-; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm12
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0,1],ymm11[2],ymm12[3],ymm11[4],ymm12[5,6],ymm11[7],ymm12[8,9],ymm11[10],ymm12[11],ymm11[12],ymm12[13,14],ymm11[15]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3,4],xmm2[5,6,7]
-; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512-FCP-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm3 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm3 = ymm7[0],mem[1],ymm7[2,3],mem[4],ymm7[5],mem[6],ymm7[7,8],mem[9],ymm7[10,11],mem[12],ymm7[13],mem[14],ymm7[15]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm7, %ymm16
-; AVX512-FCP-NEXT: vpermd %ymm3, %ymm25, %ymm3
-; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm1
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
-; AVX512-FCP-NEXT: vpermd %ymm17, %ymm5, %ymm2
-; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm2
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm15 & (zmm2 ^ zmm1))
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm28
-; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm15
-; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm7
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm15[0],xmm7[1],xmm15[2,3]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0],xmm12[1],xmm7[2,3]
; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm0 = [6,7,0,1,10,11,0,0,0,0,0,0,0,0,0,0]
-; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm2
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [1,3,6,0,5,0,0,0]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm9
-; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm10
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0,1],ymm9[2],ymm10[3],ymm9[4],ymm10[5,6],ymm9[7],ymm10[8,9],ymm9[10],ymm10[11],ymm9[12],ymm10[13,14],ymm9[15]
-; AVX512-FCP-NEXT: vpermd %ymm1, %ymm18, %ymm4
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,2,3,4,5,0,1,6,7,8,9,14,15,4,5,18,19,18,19,20,21,16,17,22,23,24,25,30,31,20,21]
-; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm4
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3,4,5,6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm1
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,3,6,0,5,0,0,0]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm11[0,1],ymm13[2],ymm11[3],ymm13[4],ymm11[5,6],ymm13[7],ymm11[8,9],ymm13[10],ymm11[11],ymm13[12],ymm11[13,14],ymm13[15]
+; AVX512-FCP-NEXT: vpermd %ymm3, %ymm2, %ymm4
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,2,3,4,5,0,1,6,7,8,9,14,15,4,5,18,19,18,19,20,21,16,17,22,23,24,25,30,31,20,21]
+; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm4, %ymm4
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3,4,5,6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [2,4,7,0,2,4,7,0]
; AVX512-FCP-NEXT: # ymm4 = mem[0,1,0,1]
-; AVX512-FCP-NEXT: vpermd %ymm23, %ymm4, %ymm5
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25]
-; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm5, %ymm5
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm25
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0],xmm14[1],xmm6[2,3]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm22
-; AVX512-FCP-NEXT: vmovdqa64 %xmm14, %xmm31
-; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm0
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1],ymm13[2],ymm8[3],ymm13[4],ymm8[5,6],ymm13[7],ymm8[8,9],ymm13[10],ymm8[11],ymm13[12],ymm8[13,14],ymm13[15]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm13, %ymm20
-; AVX512-FCP-NEXT: vmovdqa64 %ymm8, %ymm30
-; AVX512-FCP-NEXT: vpermd %ymm2, %ymm18, %ymm2
-; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1
+; AVX512-FCP-NEXT: vpermd %ymm20, %ymm4, %ymm5
+; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm5, %ymm5
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm26
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0],xmm10[1],xmm9[2,3]
+; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0,1],ymm14[2],ymm8[3],ymm14[4],ymm8[5,6],ymm14[7],ymm8[8,9],ymm14[10],ymm8[11],ymm14[12],ymm8[13,14],ymm14[15]
+; AVX512-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5,6,7]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT: vpermd %ymm17, %ymm4, %ymm1
-; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm24
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1],xmm7[2],xmm15[3]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0],ymm10[1,2],ymm9[3],ymm10[4],ymm9[5],ymm10[6,7],ymm9[8],ymm10[9,10],ymm9[11],ymm10[12],ymm9[13],ymm10[14,15]
-; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm2 = [8,9,2,3,12,13,0,0,0,0,0,0,0,0,0,0]
-; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm3
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm27 = [1,4,6,3,6,0,0,0]
-; AVX512-FCP-NEXT: vpermd %ymm1, %ymm27, %ymm1
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,2,3,4,5,2,3,4,5,10,11,0,1,14,15,16,17,18,19,20,21,18,19,20,21,26,27,16,17,30,31]
-; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm1
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm1[3,4,5,6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm26 = [0,2,1,3,0,2,5,7]
-; AVX512-FCP-NEXT: vpermd %ymm23, %ymm26, %ymm3
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
-; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3
-; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm19
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1
-; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm14[1,2],ymm4[3],ymm14[4],ymm4[5],ymm14[6,7],ymm4[8],ymm14[9,10],ymm4[11],ymm14[12],ymm4[13],ymm14[14,15]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm18
+; AVX512-FCP-NEXT: vpermd %ymm21, %ymm4, %ymm1
+; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm1, %ymm1
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm30
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0,1],xmm12[2],xmm7[3]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0],ymm11[1,2],ymm13[3],ymm11[4],ymm13[5],ymm11[6,7],ymm13[8],ymm11[9,10],ymm13[11],ymm11[12],ymm13[13],ymm11[14,15]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,4,6,3,6,0,0,0]
+; AVX512-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,2,3,4,5,10,11,0,1,14,15,16,17,18,19,20,21,18,19,20,21,26,27,16,17,30,31]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5,6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,1,3,0,2,5,7]
+; AVX512-FCP-NEXT: vpermd %ymm20, %ymm1, %ymm1
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm29
+; AVX512-FCP-NEXT: vmovdqa 576(%rdi), %ymm13
+; AVX512-FCP-NEXT: vmovdqa 608(%rdi), %ymm2
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm13[2],ymm2[3],ymm13[4],ymm2[5,6],ymm13[7],ymm2[8,9],ymm13[10],ymm2[11],ymm13[12],ymm2[13,14],ymm13[15]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7]
+; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %ymm12
+; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %ymm11
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm11[0,1],ymm12[2],ymm11[3],ymm12[4],ymm11[5,6],ymm12[7],ymm11[8,9],ymm12[10],ymm11[11],ymm12[12],ymm11[13,14],ymm12[15]
; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,3,0,0,5,0,2,7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm15
-; AVX512-FCP-NEXT: vmovdqa64 %ymm29, %ymm10
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0],ymm15[1],ymm10[2],ymm15[3],ymm10[4,5],ymm15[6],ymm10[7,8],ymm15[9],ymm10[10],ymm15[11],ymm10[12,13],ymm15[14],ymm10[15]
-; AVX512-FCP-NEXT: vpermd %ymm5, %ymm4, %ymm5
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [2,3,4,5,10,11,0,1,14,15,14,15,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21]
-; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm5
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm3
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3,4,5,6,7]
-; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} zmm5 = [18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0,18446744073709486080,18446744073709551615]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm25 = zmm25 ^ (zmm5 & (zmm25 ^ zmm3))
-; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX512-FCP-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5],mem[6],ymm3[7,8],mem[9],ymm3[10,11],mem[12],ymm3[13],mem[14],ymm3[15]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm8
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm3[0],xmm8[1,2,3],xmm3[4,5],xmm8[6,7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,0,1,10,11,4,5,14,15,8,9,2,3,12,13]
-; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm8
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512-FCP-NEXT: vextracti64x4 $1, %zmm25, %ymm9
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3,4,5,6,7],ymm9[8],ymm8[9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm25, %zmm21
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0],ymm12[1,2],ymm11[3],ymm12[4],ymm11[5],ymm12[6,7],ymm11[8],ymm12[9,10],ymm11[11],ymm12[12],ymm11[13],ymm12[14,15]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm12, %ymm23
-; AVX512-FCP-NEXT: vmovdqa64 %ymm11, %ymm25
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2],xmm8[3]
-; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm8, %xmm7
-; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm12
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm13[0],ymm12[1],ymm13[2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7,8],ymm12[9],ymm13[10],ymm12[11],ymm13[12,13],ymm12[14],ymm13[15]
-; AVX512-FCP-NEXT: vpermd %ymm8, %ymm4, %ymm4
-; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm4
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3,4,5,6,7]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm24 = zmm24 ^ (zmm5 & (zmm24 ^ zmm4))
-; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm4
-; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm5
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3]
-; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm2
-; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm4
-; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm5
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15]
-; AVX512-FCP-NEXT: vpermd %ymm4, %ymm27, %ymm4
-; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm0
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4,5,6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-FCP-NEXT: vpermd %ymm17, %ymm26, %ymm2
-; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm4
-; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
-; AVX512-FCP-NEXT: vmovdqu (%rsp), %ymm9 # 32-byte Reload
-; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm11[0],ymm9[1],ymm11[2,3],ymm9[4],ymm11[5],ymm9[6],ymm11[7,8],ymm9[9],ymm11[10,11],ymm9[12],ymm11[13],ymm9[14],ymm11[15]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4,5],xmm4[6,7]
-; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512-FCP-NEXT: vextracti64x4 $1, %zmm24, %ymm3
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6,7],ymm3[8],ymm2[9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm24, %zmm2
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm10[0,1],ymm15[2],ymm10[3],ymm15[4],ymm10[5,6],ymm15[7],ymm10[8,9],ymm15[10],ymm10[11],ymm15[12],ymm10[13,14],ymm15[15]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,3,0,0,6,0,3,5]
-; AVX512-FCP-NEXT: vpermd %ymm3, %ymm4, %ymm3
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,6,7,8,9,14,15,0,1,6,7,0,1,6,7,16,17,22,23,24,25,30,31,16,17,22,23,16,17,22,23]
-; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm3
-; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm6
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm14[0],ymm6[1],ymm14[2,3],ymm6[4],ymm14[5],ymm6[6],ymm14[7,8],ymm6[9],ymm14[10,11],ymm6[12],ymm14[13],ymm6[14],ymm14[15]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3,4],xmm7[5,6,7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
-; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm6
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3,4,5,6,7]
-; AVX512-FCP-NEXT: movb $7, %al
-; AVX512-FCP-NEXT: kmovw %eax, %k1
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm1 {%k1}
-; AVX512-FCP-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3,4],xmm3[5,6,7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [12,13,14,15,4,5,14,15,8,9,2,3,12,13,6,7]
+; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm0
+; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm0
+; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
+; AVX512-FCP-NEXT: vpandnq %zmm24, %zmm27, %zmm17
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm22 = (zmm22 & zmm27) | zmm17
+; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,4,6,3,1,4,6,3]
+; AVX512-FCP-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm12[0],ymm11[1,2],ymm12[3],ymm11[4],ymm12[5],ymm11[6,7],ymm12[8],ymm11[9,10],ymm12[11],ymm11[12],ymm12[13],ymm11[14,15]
+; AVX512-FCP-NEXT: vpermd %ymm7, %ymm5, %ymm7
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,2,3,2,3,4,5,10,11,0,1,14,15,8,9,16,17,18,19,18,19,20,21,26,27,16,17,30,31,24,25]
+; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm7, %ymm7
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm28 = [8,9,10,11,12,21,22,23]
+; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm28, %zmm17
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm13[0],ymm2[1,2],ymm13[3],ymm2[4],ymm13[5],ymm2[6,7],ymm13[8],ymm2[9,10],ymm13[11],ymm2[12],ymm13[13],ymm2[14,15]
+; AVX512-FCP-NEXT: vmovdqa %ymm2, %ymm3
+; AVX512-FCP-NEXT: vpermd %ymm7, %ymm5, %ymm5
+; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm0
+; AVX512-FCP-NEXT: vpandnq %zmm25, %zmm27, %zmm24
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm18 = (zmm18 & zmm27) | zmm24
+; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm28, %zmm24
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1],ymm10[2],ymm9[3],ymm10[4],ymm9[5,6],ymm10[7],ymm9[8,9],ymm10[10],ymm9[11],ymm10[12],ymm9[13,14],ymm10[15]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm5
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[3,4],xmm0[5,6,7]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [0,2,0,0,5,7,2,4]
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5],ymm14[6],ymm15[7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13],ymm14[14],ymm15[15]
+; AVX512-FCP-NEXT: vpermd %ymm7, %ymm16, %ymm7
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,6,7,8,9,14,15,4,5,6,7,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23]
+; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm7, %ymm7
+; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm19
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm7[3,4,5,6,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5],ymm12[6],ymm11[7,8],ymm12[9],ymm11[10,11],ymm12[12],ymm11[13],ymm12[14],ymm11[15]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4],xmm8[5,6,7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,8,9,2,3,12,13,6,7,0,1,10,11]
+; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm7
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512-FCP-NEXT: vpandnq %zmm6, %zmm27, %zmm25
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = (zmm0 & zmm27) | zmm25
+; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm28, %zmm25
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm25, %zmm0, %zmm0
+; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX512-FCP-NEXT: vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm6 = mem[0],ymm6[1],mem[2],ymm6[3],mem[4,5],ymm6[6],mem[7,8],ymm6[9],mem[10],ymm6[11],mem[12,13],ymm6[14],mem[15]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm8
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3],xmm6[4,5,6],xmm8[7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [6,7,2,3,12,13,6,7,0,1,10,11,4,5,14,15]
-; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm6, %xmm6
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0],ymm6[1,2,3,4,5,6,7],ymm3[8],ymm6[9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
-; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm3
-; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm6
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3],ymm6[4],ymm3[5],ymm6[6],ymm3[7,8],ymm6[9],ymm3[10,11],ymm6[12],ymm3[13],ymm6[14],ymm3[15]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm6
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3,4],xmm6[5,6,7]
-; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm3
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm13[0,1],ymm12[2],ymm13[3],ymm12[4],ymm13[5,6],ymm12[7],ymm13[8,9],ymm12[10],ymm13[11],ymm12[12],ymm13[13,14],ymm12[15]
-; AVX512-FCP-NEXT: vpermd %ymm6, %ymm4, %ymm4
-; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm4
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 {%k1}
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0],ymm11[1],ymm9[2],ymm11[3],ymm9[4,5],ymm11[6],ymm9[7,8],ymm11[9],ymm9[10],ymm11[11],ymm9[12,13],ymm11[14],ymm9[15]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6],xmm4[7]
-; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm3
-; AVX512-FCP-NEXT: vextracti64x4 $1, %zmm0, %ymm4
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4,5,6,7],ymm4[8],ymm3[9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1],ymm6[2],ymm5[3],ymm6[4],ymm5[5,6],ymm6[7],ymm5[8,9],ymm6[10],ymm5[11],ymm6[12],ymm5[13,14],ymm6[15]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm7
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm7[3,4],xmm0[5,6,7]
+; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5],ymm4[6],ymm2[7,8],ymm4[9],ymm2[10,11],ymm4[12],ymm2[13],ymm4[14],ymm2[15]
+; AVX512-FCP-NEXT: vpermd %ymm1, %ymm16, %ymm1
+; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm7
+; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm1
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm13[1],ymm3[2,3],ymm13[4],ymm3[5],ymm13[6],ymm3[7,8],ymm13[9],ymm3[10,11],ymm13[12],ymm3[13],ymm13[14],ymm3[15]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm19
+; AVX512-FCP-NEXT: vmovdqa64 %ymm13, %ymm16
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4],xmm3[5,6,7]
+; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm1
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512-FCP-NEXT: vpandnq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = (zmm0 & zmm27) | zmm3
+; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm28, %zmm3
; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,3,0,0,5,0,2,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0],ymm15[1],ymm14[2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7,8],ymm15[9],ymm14[10],ymm15[11],ymm14[12,13],ymm15[14],ymm14[15]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm14, %ymm20
+; AVX512-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm1
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,4,5,10,11,0,1,14,15,14,15,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21]
+; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm13 = [4,1,6,3]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm10[0],ymm9[1,2],ymm10[3],ymm9[4],ymm10[5],ymm9[6,7],ymm10[8],ymm9[9,10],ymm10[11],ymm9[12],ymm10[13],ymm9[14,15]
+; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm14
+; AVX512-FCP-NEXT: vpermd %ymm7, %ymm13, %ymm7
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm7
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3,4,5,6,7]
+; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} zmm7 = [18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0,18446744073709486080,18446744073709551615]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm26 = zmm26 ^ (zmm7 & (zmm26 ^ zmm1))
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm5[1,2],ymm6[3],ymm5[4],ymm6[5],ymm5[6,7],ymm6[8],ymm5[9,10],ymm6[11],ymm5[12],ymm6[13],ymm5[14,15]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm23
+; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm25
+; AVX512-FCP-NEXT: vpermd %ymm1, %ymm13, %ymm1
+; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm1
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7,8],ymm2[9],ymm4[10],ymm2[11],ymm4[12,13],ymm2[14],ymm4[15]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm31
+; AVX512-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm30 = zmm30 ^ (zmm7 & (zmm30 ^ zmm0))
+; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-FCP-NEXT: vpblendd $11, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-FCP-NEXT: # xmm0 = mem[0,1],xmm0[2],mem[3]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload
+; AVX512-FCP-NEXT: vpblendw $214, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: # ymm1 = ymm1[0],mem[1,2],ymm1[3],mem[4],ymm1[5],mem[6,7],ymm1[8],mem[9,10],ymm1[11],mem[12],ymm1[13],mem[14,15]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,4,6,3,6,0,0,0]
+; AVX512-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,2,3,4,5,10,11,0,1,14,15,16,17,18,19,20,21,18,19,20,21,26,27,16,17,30,31]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5,6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,1,3,0,2,5,7]
+; AVX512-FCP-NEXT: vpermd %ymm21, %ymm1, %ymm1
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm8
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5],ymm11[6],ymm12[7,8],ymm11[9],ymm12[10,11],ymm11[12],ymm12[13],ymm11[14],ymm12[15]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4,5],xmm1[6,7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,0,1,10,11,4,5,14,15,8,9,2,3,12,13]
+; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vextracti64x4 $1, %zmm26, %ymm3
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm26, %zmm21
+; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm13
+; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm7
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0],ymm7[1],ymm13[2,3],ymm7[4],ymm13[5],ymm7[6],ymm13[7,8],ymm7[9],ymm13[10,11],ymm7[12],ymm13[13],ymm7[14],ymm13[15]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4,5],xmm3[6,7]
+; AVX512-FCP-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 16-byte Folded Reload
+; AVX512-FCP-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload
+; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX512-FCP-NEXT: vextracti64x4 $1, %zmm30, %ymm1
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm30, %zmm0
+; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm1
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm15[2],ymm1[3],ymm15[4],ymm1[5,6],ymm15[7],ymm1[8,9],ymm15[10],ymm1[11],ymm15[12],ymm1[13,14],ymm15[15]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [1,3,0,0,6,0,3,5]
+; AVX512-FCP-NEXT: vpermd %ymm1, %ymm16, %ymm1
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,6,7,8,9,14,15,0,1,6,7,0,1,6,7,16,17,22,23,24,25,30,31,16,17,22,23,16,17,22,23]
+; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm1
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm9[0],ymm14[1],ymm9[2,3],ymm14[4],ymm9[5],ymm14[6],ymm9[7,8],ymm14[9],ymm9[10,11],ymm14[12],ymm9[13],ymm14[14],ymm9[15]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3,4],xmm15[5,6,7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
+; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm14, %xmm14
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2],ymm1[3,4,5,6,7]
+; AVX512-FCP-NEXT: movb $7, %al
+; AVX512-FCP-NEXT: kmovw %eax, %k1
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm29 {%k1}
+; AVX512-FCP-NEXT: vextracti64x4 $1, %zmm29, %ymm1
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7,8],ymm12[9],ymm11[10],ymm12[11],ymm11[12,13],ymm12[14],ymm11[15]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm6
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm6[2,3],xmm2[4,5,6],xmm6[7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [6,7,2,3,12,13,6,7,0,1,10,11,4,5,14,15]
+; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm2
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm29, %zmm1
+; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm9
+; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm2
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0],ymm2[1],ymm9[2,3],ymm2[4],ymm9[5],ymm2[6],ymm9[7,8],ymm2[9],ymm9[10,11],ymm2[12],ymm9[13],ymm2[14],ymm9[15]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm14
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3,4],xmm14[5,6,7]
+; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm2, %xmm2
+; AVX512-FCP-NEXT: vmovdqa64 %ymm31, %ymm9
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm4[0,1],ymm9[2],ymm4[3],ymm9[4],ymm4[5,6],ymm9[7],ymm4[8,9],ymm9[10],ymm4[11],ymm9[12],ymm4[13,14],ymm9[15]
+; AVX512-FCP-NEXT: vpermd %ymm14, %ymm16, %ymm9
+; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm9
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm9[3,4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm8 {%k1}
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0],ymm13[1],ymm7[2],ymm13[3],ymm7[4,5],ymm13[6],ymm7[7,8],ymm13[9],ymm7[10],ymm13[11],ymm7[12,13],ymm13[14],ymm7[15]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm9
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm9[2,3],xmm2[4,5,6],xmm9[7]
+; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm2
+; AVX512-FCP-NEXT: vextracti64x4 $1, %zmm8, %ymm6
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm2[1,2,3,4,5,6,7],ymm6[8],ymm2[9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm8, %zmm2
+; AVX512-FCP-NEXT: vpandnq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm6 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = (zmm4 & zmm27) | zmm6
+; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm28, %zmm6
+; AVX512-FCP-NEXT: vpandnq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = (zmm8 & zmm27) | zmm3
+; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm28, %zmm3
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm5
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm8, %zmm3
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm17, %zmm22, %zmm4
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm24, %zmm18, %zmm6
+; AVX512-FCP-NEXT: vmovdqa64 %zmm3, (%rsi)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 64(%rsi)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 64(%rdx)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rdx)
; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovaps %zmm3, (%rsi)
-; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovaps %zmm3, 64(%rsi)
-; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovaps %zmm3, 64(%rdx)
-; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovaps %zmm3, (%rdx)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm28, 64(%rcx)
+; AVX512-FCP-NEXT: vmovaps %zmm3, 64(%rcx)
; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512-FCP-NEXT: vmovaps %zmm3, (%rcx)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 64(%r8)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 64(%r8)
; AVX512-FCP-NEXT: vmovdqa64 %zmm21, (%r8)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 64(%r9)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 64(%r9)
; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%r9)
-; AVX512-FCP-NEXT: addq $552, %rsp # imm = 0x228
+; AVX512-FCP-NEXT: addq $648, %rsp # imm = 0x288
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: load_i16_stride5_vf64:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: subq $552, %rsp # imm = 0x228
-; AVX512DQ-NEXT: vmovdqa 384(%rdi), %ymm6
-; AVX512DQ-NEXT: vmovdqa 416(%rdi), %ymm11
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm11[1,2],ymm6[3],ymm11[4],ymm6[5],ymm11[6,7],ymm6[8],ymm11[9,10],ymm6[11],ymm11[12],ymm6[13],ymm11[14,15]
-; AVX512DQ-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: subq $344, %rsp # imm = 0x158
+; AVX512DQ-NEXT: vmovdqa 384(%rdi), %ymm9
+; AVX512DQ-NEXT: vmovdqa 416(%rdi), %ymm8
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm8[1,2],ymm9[3],ymm8[4],ymm9[5],ymm8[6,7],ymm9[8],ymm8[9,10],ymm9[11],ymm8[12],ymm9[13],ymm8[14,15]
+; AVX512DQ-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6],ymm1[7]
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,7,16,17,26,27,20,21,30,31,24,25,128,128,128,128,128,128]
; AVX512DQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2
-; AVX512DQ-NEXT: vmovdqa 352(%rdi), %ymm8
-; AVX512DQ-NEXT: vmovdqa 320(%rdi), %ymm7
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5],ymm8[6],ymm7[7,8],ymm8[9],ymm7[10,11],ymm8[12],ymm7[13],ymm8[14],ymm7[15]
-; AVX512DQ-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vmovdqa 352(%rdi), %ymm11
+; AVX512DQ-NEXT: vmovdqa 320(%rdi), %ymm10
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7,8],ymm11[9],ymm10[10,11],ymm11[12],ymm10[13],ymm11[14],ymm10[15]
+; AVX512DQ-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm3
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3],xmm1[4,5],xmm3[6,7]
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpshufb %ymm1, %ymm3, %ymm3
-; AVX512DQ-NEXT: vporq %ymm2, %ymm3, %ymm19
-; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm15
-; AVX512DQ-NEXT: vmovdqa 224(%rdi), %ymm13
-; AVX512DQ-NEXT: vmovdqa 176(%rdi), %xmm12
-; AVX512DQ-NEXT: vmovdqa 160(%rdi), %xmm14
+; AVX512DQ-NEXT: vporq %ymm2, %ymm3, %ymm24
+; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm13
+; AVX512DQ-NEXT: vmovdqa 224(%rdi), %ymm12
+; AVX512DQ-NEXT: vmovdqa 176(%rdi), %xmm14
+; AVX512DQ-NEXT: vmovdqa64 160(%rdi), %xmm16
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm4
; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm5
-; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm10
-; AVX512DQ-NEXT: vmovdqa 96(%rdi), %ymm9
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0],ymm9[1,2],ymm10[3],ymm9[4],ymm10[5],ymm9[6,7],ymm10[8],ymm9[9,10],ymm10[11],ymm9[12],ymm10[13],ymm9[14,15]
+; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm7
+; AVX512DQ-NEXT: vmovdqa 96(%rdi), %ymm6
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0],ymm6[1,2],ymm7[3],ymm6[4],ymm7[5],ymm6[6,7],ymm7[8],ymm6[9,10],ymm7[11],ymm6[12],ymm7[13],ymm6[14,15]
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6],ymm3[7]
; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm0
@@ -8856,64 +8861,100 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7]
; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm1
-; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0
-; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm6[1],ymm11[2,3],ymm6[4],ymm11[5],ymm6[6],ymm11[7,8],ymm6[9],ymm11[10,11],ymm6[12],ymm11[13],ymm6[14],ymm11[15]
+; AVX512DQ-NEXT: vporq %ymm0, %ymm1, %ymm23
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5],ymm9[6],ymm8[7,8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13],ymm9[14],ymm8[15]
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,8,9,18,19,28,29,22,23,16,17,26,27,128,128,128,128,128,128]
; AVX512DQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10],ymm7[11],ymm8[12,13],ymm7[14],ymm8[15]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0],ymm10[1],ymm11[2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10],ymm10[11],ymm11[12,13],ymm10[14],ymm11[15]
; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm3
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6],xmm3[7]
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpshufb %ymm1, %ymm3, %ymm3
; AVX512DQ-NEXT: vpor %ymm2, %ymm3, %ymm2
; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5],ymm10[6],ymm9[7,8],ymm10[9],ymm9[10,11],ymm10[12],ymm9[13],ymm10[14],ymm9[15]
-; AVX512DQ-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vmovdqu %ymm9, (%rsp) # 32-byte Spill
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5],ymm7[6],ymm6[7,8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13],ymm7[14],ymm6[15]
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm0
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15]
-; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6],xmm3[7]
+; AVX512DQ-NEXT: vmovdqa 128(%rdi), %xmm15
; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm1
-; AVX512DQ-NEXT: vporq %ymm0, %ymm1, %ymm28
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [6,7,0,1,10,11,4,5,14,15,14,15,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0,1],ymm15[2],ymm13[3],ymm15[4],ymm13[5,6],ymm15[7],ymm13[8,9],ymm15[10],ymm13[11],ymm15[12],ymm13[13,14],ymm15[15]
-; AVX512DQ-NEXT: vmovdqa64 %ymm15, %ymm18
-; AVX512DQ-NEXT: vmovdqa64 %ymm13, %ymm24
+; AVX512DQ-NEXT: vporq %ymm0, %ymm1, %ymm21
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm13[1],ymm12[2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7,8],ymm13[9],ymm12[10],ymm13[11],ymm12[12,13],ymm13[14],ymm12[15]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,14,15,12,13,2,3,12,13,6,7,0,1,10,11,20,21,30,31,28,29,18,19,28,29,22,23,16,17,26,27]
+; AVX512DQ-NEXT: vmovdqa64 %xmm16, %xmm8
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm16[0,1,1,3]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
+; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm14[2],xmm1[3],xmm14[3]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4,5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-NEXT: vmovdqa 144(%rdi), %xmm10
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm10[0],xmm15[1],xmm10[2,3]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13],ymm6[14],ymm7[15]
+; AVX512DQ-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7]
+; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15]
+; AVX512DQ-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[4,5,14,15,8,9,2,3,12,13,10,11,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
+; AVX512DQ-NEXT: vmovdqa 256(%rdi), %ymm11
+; AVX512DQ-NEXT: vmovdqa 288(%rdi), %ymm9
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0],ymm11[1],ymm9[2,3],ymm11[4],ymm9[5],ymm11[6],ymm9[7,8],ymm11[9],ymm9[10,11],ymm11[12],ymm9[13],ymm11[14],ymm9[15]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7]
+; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm29 & (zmm1 ^ zmm0))
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[0,1,2,3,8,9,2,3,12,13,6,7,0,1,10,11]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1],ymm13[2],ymm12[3],ymm13[4],ymm12[5,6],ymm13[7],ymm12[8,9],ymm13[10],ymm12[11],ymm13[12],ymm12[13,14],ymm13[15]
+; AVX512DQ-NEXT: vmovdqa64 %ymm12, %ymm22
+; AVX512DQ-NEXT: vmovdqa64 %ymm13, %ymm19
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7]
; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,6,7,8,9,4,5,14,15,8,9,2,3,12,13,22,23,22,23,24,25,20,21,30,31,24,25,18,19,28,29]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0],xmm14[1],xmm12[2,3]
-; AVX512DQ-NEXT: vmovdqa64 %xmm12, %xmm16
-; AVX512DQ-NEXT: vmovdqa64 %xmm14, %xmm30
-; AVX512DQ-NEXT: vpshufb %xmm6, %xmm1, %xmm1
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [6,7,0,1,10,11,4,5,14,15,14,15,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0],xmm8[1],xmm14[2,3]
+; AVX512DQ-NEXT: vmovdqa64 %xmm16, %xmm27
+; AVX512DQ-NEXT: vmovdqa64 %xmm14, %xmm20
+; AVX512DQ-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm8
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4,5,6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa 144(%rdi), %xmm11
-; AVX512DQ-NEXT: vmovdqa 128(%rdi), %xmm7
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0,1],xmm7[2],xmm11[3]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm10[0,1],xmm15[2],xmm10[3]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13]
; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0],ymm10[1],ymm9[2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7,8],ymm10[9],ymm9[10],ymm10[11],ymm9[12,13],ymm10[14],ymm9[15]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm7[1],ymm6[2],ymm7[3],ymm6[4,5],ymm7[6],ymm6[7,8],ymm7[9],ymm6[10],ymm7[11],ymm6[12,13],ymm7[14],ymm6[15]
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15]
; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3]
-; AVX512DQ-NEXT: vpshufb %ymm6, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpshufb %ymm8, %ymm1, %ymm1
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa 256(%rdi), %ymm12
-; AVX512DQ-NEXT: vmovdqa 288(%rdi), %ymm15
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0],ymm15[1],ymm12[2,3],ymm15[4],ymm12[5],ymm15[6],ymm12[7,8],ymm15[9],ymm12[10,11],ymm15[12],ymm12[13],ymm15[14],ymm12[15]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm11[0],ymm9[1],ymm11[2,3],ymm9[4],ymm11[5],ymm9[6],ymm11[7,8],ymm9[9],ymm11[10,11],ymm9[12],ymm11[13],ymm9[14],ymm11[15]
+; AVX512DQ-NEXT: vmovdqa64 %ymm11, %ymm28
+; AVX512DQ-NEXT: vmovdqa64 %ymm9, %ymm16
; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7]
; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (mem & (zmm1 ^ zmm0))
@@ -8924,819 +8965,720 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vmovdqa 464(%rdi), %xmm8
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[3,1,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa 448(%rdi), %xmm3
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT: vmovdqa %xmm11, %xmm6
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[3,1,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa %xmm7, %xmm9
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,2,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7]
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0],xmm8[1],xmm3[2,3]
-; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm22
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7]
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm23
-; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm10
-; AVX512DQ-NEXT: vmovdqa 576(%rdi), %ymm1
-; AVX512DQ-NEXT: vmovdqa 608(%rdi), %ymm2
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm1[2],ymm2[3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8,9],ymm1[10],ymm2[11],ymm1[12],ymm2[13,14],ymm1[15]
-; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm20
-; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm17
-; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7]
-; AVX512DQ-NEXT: vmovdqa 512(%rdi), %ymm5
-; AVX512DQ-NEXT: vmovdqa 544(%rdi), %ymm13
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0],ymm5[1],ymm13[2,3],ymm5[4],ymm13[5],ymm5[6],ymm13[7,8],ymm5[9],ymm13[10,11],ymm5[12],ymm13[13],ymm5[14],ymm13[15]
+; AVX512DQ-NEXT: vmovdqa 464(%rdi), %xmm12
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm12[3,1,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,1,2,1,4,5,6,7]
+; AVX512DQ-NEXT: vmovdqa 448(%rdi), %xmm9
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,0,1,10,11,8,9,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vpshufb %xmm0, %xmm9, %xmm2
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT: vpshufb %xmm0, %xmm15, %xmm0
+; AVX512DQ-NEXT: vmovdqa %xmm15, %xmm7
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[3,1,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7]
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm31 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX512DQ-NEXT: vmovdqa 512(%rdi), %ymm6
+; AVX512DQ-NEXT: vmovdqa 544(%rdi), %ymm5
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5],ymm6[6],ymm5[7,8],ymm6[9],ymm5[10,11],ymm6[12],ymm5[13],ymm6[14],ymm5[15]
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa64 496(%rdi), %xmm21
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm21[3,1,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[2,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa 480(%rdi), %xmm7
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm14 = xmm7[0,2,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,3,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,8,9,2,3,12,13,6,7,16,17,26,27,26,27,30,31,24,25,18,19,28,29,22,23]
-; AVX512DQ-NEXT: vpshufb %ymm14, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm2 = [12,13,14,15,4,5,14,15,8,9,2,3,12,13,6,7]
-; AVX512DQ-NEXT: vpshufb %xmm2, %xmm11, %xmm11
-; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5,6,7]
-; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm27 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm27 & (zmm10 ^ zmm19))
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm0
-; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0,1],ymm12[2],ymm15[3],ymm12[4],ymm15[5,6],ymm12[7],ymm15[8,9],ymm12[10],ymm15[11],ymm12[12],ymm15[13,14],ymm12[15]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm10
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm10[3,4],xmm0[5,6,7]
-; AVX512DQ-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512DQ-NEXT: vmovdqa64 %ymm18, %ymm3
-; AVX512DQ-NEXT: vmovdqa64 %ymm24, %ymm1
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5],ymm3[6],ymm1[7,8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13],ymm3[14],ymm1[15]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm10
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm2[1,2],ymm10[3],ymm2[4,5,6,7]
-; AVX512DQ-NEXT: vpshufb %ymm14, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm10 = xmm16[3,1,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[2,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa64 %xmm30, %xmm24
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm11 = xmm30[0,2,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,3,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm2[2,3,4,5,6,7]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0],xmm6[1],xmm9[2,3]
-; AVX512DQ-NEXT: vmovdqa64 %xmm9, %xmm25
-; AVX512DQ-NEXT: vmovdqa64 %xmm23, %xmm4
-; AVX512DQ-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm2 # 64-byte Folded Reload
-; AVX512DQ-NEXT: # zmm2 = zmm2 ^ (zmm27 & (zmm2 ^ mem))
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
-; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm4
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm8[2],xmm4[3]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9]
-; AVX512DQ-NEXT: vpshufb %xmm0, %xmm2, %xmm2
-; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm10 = ymm5[0],ymm13[1],ymm5[2,3],ymm13[4],ymm5[5],ymm13[6],ymm5[7,8],ymm13[9],ymm5[10,11],ymm13[12],ymm5[13],ymm13[14],ymm5[15]
-; AVX512DQ-NEXT: vmovdqa64 %ymm13, %ymm26
-; AVX512DQ-NEXT: vmovdqa64 %ymm5, %ymm22
-; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm11
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4,5,6,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm11 = xmm7[0,3,2,3]
-; AVX512DQ-NEXT: vmovdqa64 %xmm7, %xmm30
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,2,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vpsrlq $48, %xmm21, %xmm13
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm11[0],xmm13[0],xmm11[1],xmm13[1]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm13 = [2,3,12,13,0,1,0,1,10,11,4,5,14,15,8,9,18,19,28,29,16,17,16,17,26,27,20,21,30,31,24,25]
-; AVX512DQ-NEXT: vpshufb %ymm13, %ymm10, %ymm10
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm10[3,4,5,6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm9
-; AVX512DQ-NEXT: vmovdqa64 %ymm17, %ymm7
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm7[0],ymm9[1,2],ymm7[3],ymm9[4],ymm7[5],ymm9[6,7],ymm7[8],ymm9[9,10],ymm7[11],ymm9[12],ymm7[13],ymm9[14,15]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm11, %xmm14
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm11 = xmm14[0],xmm11[1],xmm14[2],xmm11[3]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,6,7,0,1,10,11,4,5,14,15,8,9]
-; AVX512DQ-NEXT: vpshufb %xmm14, %xmm11, %xmm11
-; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm11[5,6,7]
-; AVX512DQ-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm2 # 64-byte Folded Reload
-; AVX512DQ-NEXT: # zmm2 = zmm2 ^ (zmm27 & (zmm2 ^ mem))
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm2, %zmm2
-; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5],ymm1[6],ymm3[7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13],ymm1[14],ymm3[15]
-; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm17
-; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm10
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm10[1],ymm2[2],ymm10[3],ymm2[4,5,6,7]
-; AVX512DQ-NEXT: vpshufb %ymm13, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm10 = xmm24[0,3,2,3]
-; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm20
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,2,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa64 %xmm16, %xmm1
-; AVX512DQ-NEXT: vpsrlq $48, %xmm16, %xmm11
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm2[3,4,5,6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0],ymm15[1,2],ymm12[3],ymm15[4],ymm12[5],ymm15[6,7],ymm12[8],ymm15[9,10],ymm12[11],ymm15[12],ymm12[13],ymm15[14,15]
-; AVX512DQ-NEXT: vmovdqa64 %ymm12, %ymm19
-; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm11
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3]
-; AVX512DQ-NEXT: vpshufb %xmm14, %xmm10, %xmm10
-; AVX512DQ-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm10[5,6,7]
-; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm5
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm10 = xmm5[0,1],xmm6[2],xmm5[3]
-; AVX512DQ-NEXT: vmovdqa64 %xmm6, %xmm23
-; AVX512DQ-NEXT: vpshufb %xmm0, %xmm10, %xmm0
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5,6,7]
+; AVX512DQ-NEXT: vmovdqa64 496(%rdi), %xmm18
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm18[3,1,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vmovdqa64 480(%rdi), %xmm17
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm17[0,2,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,3,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,8,9,2,3,12,13,6,7,16,17,26,27,26,27,30,31,24,25,18,19,28,29,22,23]
+; AVX512DQ-NEXT: vpshufb %ymm8, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0],xmm12[1],xmm9[2,3]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7]
+; AVX512DQ-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm26
+; AVX512DQ-NEXT: vmovdqa 576(%rdi), %ymm14
+; AVX512DQ-NEXT: vmovdqa 608(%rdi), %ymm13
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0,1],ymm14[2],ymm13[3],ymm14[4],ymm13[5,6],ymm14[7],ymm13[8,9],ymm14[10],ymm13[11],ymm14[12],ymm13[13,14],ymm14[15]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm11
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm11[3,4],xmm0[5,6,7]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm26 = zmm26 ^ (zmm29 & (zmm26 ^ zmm24))
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm11 = [12,13,14,15,4,5,14,15,8,9,2,3,12,13,6,7]
+; AVX512DQ-NEXT: vpshufb %xmm11, %xmm0, %xmm0
; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm27 & (zmm0 ^ zmm28))
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
-; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm26, %ymm15
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5,6,7]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm26, %zmm30
+; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm4
+; AVX512DQ-NEXT: vmovdqa64 %ymm22, %ymm1
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm4[1],ymm1[2,3],ymm4[4],ymm1[5],ymm4[6],ymm1[7,8],ymm4[9],ymm1[10,11],ymm4[12],ymm1[13],ymm4[14],ymm1[15]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm15
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm0[1,2],ymm15[3],ymm0[4,5,6,7]
+; AVX512DQ-NEXT: vpshufb %ymm8, %ymm0, %ymm0
+; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm26
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm20[3,1,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm15 = xmm27[0,2,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,3,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm8 = xmm7[0],xmm10[1],xmm7[2,3]
+; AVX512DQ-NEXT: vmovdqa64 %xmm7, %xmm22
+; AVX512DQ-NEXT: vpshufb %xmm2, %xmm8, %xmm7
+; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm0
+; AVX512DQ-NEXT: vmovdqa64 %ymm28, %ymm3
+; AVX512DQ-NEXT: vmovdqa64 %ymm16, %ymm2
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm2[0,1],ymm3[2],ymm2[3],ymm3[4],ymm2[5,6],ymm3[7],ymm2[8,9],ymm3[10],ymm2[11],ymm3[12],ymm2[13,14],ymm3[15]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm8
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3,4],xmm7[5,6,7]
+; AVX512DQ-NEXT: vpshufb %xmm11, %xmm7, %xmm7
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm23 = zmm0 ^ (zmm29 & (zmm23 ^ zmm0))
+; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm0
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm23, %ymm7
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm0[5,6,7]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm23, %zmm28
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm7
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2],ymm7[3],ymm0[4,5,6,7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm17[0,3,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,2,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpsrlq $48, %xmm18, %xmm8
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [2,3,12,13,0,1,0,1,10,11,4,5,14,15,8,9,18,19,28,29,16,17,16,17,26,27,20,21,30,31,24,25]
+; AVX512DQ-NEXT: vpshufb %ymm8, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm0[3,4,5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm7 = xmm9[0,1],xmm12[2],xmm9[3]
+; AVX512DQ-NEXT: vmovdqa64 %xmm12, %xmm16
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9]
+; AVX512DQ-NEXT: vpshufb %xmm11, %xmm7, %xmm7
+; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm0
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm14[0],ymm13[1,2],ymm14[3],ymm13[4],ymm14[5],ymm13[6,7],ymm14[8],ymm13[9,10],ymm14[11],ymm13[12],ymm14[13],ymm13[14,15]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm15
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm7 = xmm15[0],xmm7[1],xmm15[2],xmm7[3]
+; AVX512DQ-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm0 # 64-byte Folded Reload
+; AVX512DQ-NEXT: # zmm0 = zmm0 ^ (zmm29 & (zmm0 ^ mem))
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,6,7,0,1,10,11,4,5,14,15,8,9]
+; AVX512DQ-NEXT: vpshufb %xmm15, %xmm7, %xmm7
+; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm12
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4],ymm7[5,6,7]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm25
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm1[1],ymm4[2,3],ymm1[4],ymm4[5],ymm1[6],ymm4[7,8],ymm1[9],ymm4[10,11],ymm1[12],ymm4[13],ymm1[14],ymm4[15]
+; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm20
+; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm7
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2],ymm7[3],ymm0[4,5,6,7]
+; AVX512DQ-NEXT: vpshufb %ymm8, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm27[0,3,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,2,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpsrlq $48, %xmm26, %xmm8
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm0[3,4,5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm1
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm7 = xmm1[0,1],xmm10[2],xmm1[3]
+; AVX512DQ-NEXT: vpshufb %xmm11, %xmm7, %xmm7
+; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm0
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4],ymm3[5],ymm2[6,7],ymm3[8],ymm2[9,10],ymm3[11],ymm2[12],ymm3[13],ymm2[14,15]
+; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm22
+; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm23
+; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm8
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2],xmm7[3]
+; AVX512DQ-NEXT: vpshufb %xmm15, %xmm7, %xmm7
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm21 = zmm0 ^ (zmm29 & (zmm21 ^ zmm0))
+; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm0
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm21, %ymm7
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm0[5,6,7]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm21, %zmm24
+; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm0[2,3,0,1]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6],ymm0[7]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,14,15,8,9,2,3,12,13,10,11,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Reload
-; AVX512DQ-NEXT: vmovdqa64 %ymm18, %ymm0
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3],ymm12[4],ymm0[5],ymm12[6],ymm0[7,8],ymm12[9],ymm0[10,11],ymm12[12],ymm0[13],ymm12[14],ymm0[15]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6],ymm0[7]
-; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Reload
-; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload
-; AVX512DQ-NEXT: vmovdqa64 %ymm24, %ymm0
-; AVX512DQ-NEXT: vmovdqa64 %ymm16, %ymm5
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3],ymm5[4],ymm0[5,6],ymm5[7],ymm0[8,9],ymm5[10],ymm0[11],ymm5[12],ymm0[13,14],ymm5[15]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm10
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0,1,2],xmm10[3,4],xmm0[5,6,7]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [4,5,14,15,8,9,2,3,12,13,10,11,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
-; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2],ymm2[3,4,5,6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm13 = xmm8[0],xmm4[1],xmm8[2,3]
-; AVX512DQ-NEXT: vmovdqa64 %xmm4, %xmm29
-; AVX512DQ-NEXT: vmovdqa64 %xmm8, %xmm28
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (zmm27 & (zmm13 ^ zmm2))
-; AVX512DQ-NEXT: vmovdqa64 %ymm26, %ymm8
-; AVX512DQ-NEXT: vmovdqa64 %ymm22, %ymm4
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm4[1],ymm8[2],ymm4[3],ymm8[4,5],ymm4[6],ymm8[7,8],ymm4[9],ymm8[10],ymm4[11],ymm8[12,13],ymm4[14],ymm8[15]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm14
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm14 = [4,5,14,15,12,13,2,3,12,13,6,7,0,1,10,11,20,21,30,31,28,29,18,19,28,29,22,23,16,17,26,27]
-; AVX512DQ-NEXT: vpshufb %ymm14, %ymm2, %ymm2
-; AVX512DQ-NEXT: vmovdqa64 %xmm30, %xmm6
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm11 = xmm30[0,1,1,3]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,7,6,7]
-; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm5
-; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm11[2],xmm21[2],xmm11[3],xmm21[3]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm2[3,4,5,6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm9[0],ymm7[1],ymm9[2,3],ymm7[4],ymm9[5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10,11],ymm7[12],ymm9[13],ymm7[14],ymm9[15]
-; AVX512DQ-NEXT: vmovdqa64 %ymm7, %ymm21
-; AVX512DQ-NEXT: vextracti128 $1, %ymm11, %xmm10
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm11[3,4],xmm10[5,6,7]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,8,9,2,3,12,13,6,7,0,1,10,11]
-; AVX512DQ-NEXT: vpshufb %xmm11, %xmm10, %xmm10
-; AVX512DQ-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm10[5,6,7]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm13, %zmm2
-; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vmovdqa64 %ymm17, %ymm2
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15]
-; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm22
-; AVX512DQ-NEXT: vmovdqa64 %ymm17, %ymm26
-; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm10
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm10[1],ymm2[2,3,4,5,6,7]
-; AVX512DQ-NEXT: vpshufb %ymm14, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm10 = xmm20[0,1,1,3]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,7,6,7]
-; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm10[2],xmm1[2],xmm10[3],xmm1[3]
-; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm17
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm2[3,4,5,6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm1
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm10 = ymm15[0],ymm1[1],ymm15[2,3],ymm1[4],ymm15[5],ymm1[6],ymm15[7,8],ymm1[9],ymm15[10,11],ymm1[12],ymm15[13],ymm1[14],ymm15[15]
-; AVX512DQ-NEXT: vmovdqa64 %ymm15, %ymm31
-; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm13
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = xmm13[0,1,2],xmm10[3,4],xmm13[5,6,7]
-; AVX512DQ-NEXT: vpshufb %xmm11, %xmm10, %xmm10
-; AVX512DQ-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm10[5,6,7]
-; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX512DQ-NEXT: vpblendw $82, (%rsp), %ymm1, %ymm10 # 32-byte Folded Reload
-; AVX512DQ-NEXT: # ymm10 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5],mem[6],ymm1[7,8],mem[9],ymm1[10,11],mem[12],ymm1[13],mem[14],ymm1[15]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6],ymm10[7]
-; AVX512DQ-NEXT: vpshufb %ymm0, %ymm10, %ymm0
-; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX512DQ-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload
-; AVX512DQ-NEXT: # ymm10 = ymm1[0,1],mem[2],ymm1[3],mem[4],ymm1[5,6],mem[7],ymm1[8,9],mem[10],ymm1[11],mem[12],ymm1[13,14],mem[15]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm11
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm11[3,4],xmm10[5,6,7]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm1
-; AVX512DQ-NEXT: vmovdqa64 %xmm23, %xmm3
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm10 = xmm3[0],xmm1[1],xmm3[2,3]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm27 & (zmm10 ^ zmm0))
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm10, %zmm23
-; AVX512DQ-NEXT: vmovdqa64 %ymm18, %ymm13
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm13[1],ymm12[2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7,8],ymm13[9],ymm12[10],ymm13[11],ymm12[12,13],ymm13[14],ymm12[15]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm0[2,3,0,1]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4],ymm0[5],ymm10[6],ymm0[7]
-; AVX512DQ-NEXT: vmovdqa64 %ymm16, %ymm14
-; AVX512DQ-NEXT: vmovdqa64 %ymm24, %ymm15
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm10 = ymm14[0],ymm15[1,2],ymm14[3],ymm15[4],ymm14[5],ymm15[6,7],ymm14[8],ymm15[9,10],ymm14[11],ymm15[12],ymm14[13],ymm15[14,15]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm11
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15]
+; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm15[0,1],ymm12[2],ymm15[3],ymm12[4],ymm15[5,6],ymm12[7],ymm15[8,9],ymm12[10],ymm15[11],ymm12[12],ymm15[13,14],ymm12[15]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm8
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3,4],xmm7[5,6,7]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3,4,5,6,7]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7,8],ymm6[9],ymm5[10],ymm6[11],ymm5[12,13],ymm6[14],ymm5[15]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm8
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[4,5,14,15,12,13,2,3,12,13,6,7,0,1,10,11,20,21,30,31,28,29,18,19,28,29,22,23,16,17,26,27]
+; AVX512DQ-NEXT: vmovdqa64 %xmm17, %xmm4
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm17[0,1,1,3]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,7,6,7]
+; AVX512DQ-NEXT: vmovdqa64 %xmm18, %xmm3
+; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm8[2],xmm18[2],xmm8[3],xmm18[3]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm7[3,4,5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQ-NEXT: vmovdqa64 %xmm16, %xmm2
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm8 = xmm2[0],xmm9[1],xmm2[2,3]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm29 & (zmm7 ^ zmm0))
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5],ymm14[6],ymm13[7,8],ymm14[9],ymm13[10,11],ymm14[12],ymm13[13],ymm14[14],ymm13[15]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm8
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1,2],xmm0[3,4],xmm8[5,6,7]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,2,3,12,13,6,7,0,1,10,11]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm7, %ymm8
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm0[5,6,7]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm21
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7,8],ymm11[9],ymm10[10],ymm11[11],ymm10[12,13],ymm11[14],ymm10[15]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm0[2,3,0,1]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4],ymm0[5],ymm7[6],ymm0[7]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm12[0],ymm15[1,2],ymm12[3],ymm15[4],ymm12[5],ymm15[6,7],ymm12[8],ymm15[9,10],ymm12[11],ymm15[12],ymm12[13],ymm15[14,15]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm8
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2],xmm7[3]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15]
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,0,1,10,11,4,5,14,15,14,15,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
; AVX512DQ-NEXT: vpshufb %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm3
-; AVX512DQ-NEXT: vmovdqa %ymm8, %ymm4
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm10 = ymm8[0,1],ymm3[2],ymm8[3],ymm3[4],ymm8[5,6],ymm3[7],ymm8[8,9],ymm3[10],ymm8[11],ymm3[12],ymm8[13,14],ymm3[15]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm11
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4,5,6,7]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[6,7,6,7,8,9,4,5,14,15,8,9,2,3,12,13,22,23,22,23,24,25,20,21,30,31,24,25,18,19,28,29]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm11 = xmm5[0],xmm6[1],xmm5[2,3]
-; AVX512DQ-NEXT: vmovdqa64 %xmm30, %xmm7
-; AVX512DQ-NEXT: vmovdqa %xmm5, %xmm6
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm11, %xmm11
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm10[3,4,5,6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa64 %xmm29, %xmm1
-; AVX512DQ-NEXT: vmovdqa64 %xmm28, %xmm2
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm8 = xmm2[0,1],xmm1[2],xmm2[3]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3,4,5,6,7]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0,1],ymm6[2],ymm5[3],ymm6[4],ymm5[5,6],ymm6[7],ymm5[8,9],ymm6[10],ymm5[11],ymm6[12],ymm5[13,14],ymm6[15]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm8
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm8[2],ymm7[3,4,5,6,7]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[6,7,6,7,8,9,4,5,14,15,8,9,2,3,12,13,22,23,22,23,24,25,20,21,30,31,24,25,18,19,28,29]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm8 = xmm3[0],xmm4[1],xmm3[2,3]
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm8, %xmm8
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm7[3,4,5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm8 = xmm2[0,1],xmm9[2],xmm2[3]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13]
; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm0))
-; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm2
-; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm10
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13],ymm9[14],ymm10[15]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm9
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm9[1,2,3],xmm0[4,5],xmm9[6,7]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm0))
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5],ymm13[6],ymm14[7,8],ymm13[9],ymm14[10,11],ymm13[12],ymm14[13],ymm13[14],ymm14[15]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm8
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm8[1,2,3],xmm0[4,5],xmm8[6,7]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,4,5,14,15,8,9,2,3,12,13]
; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm8, %ymm9
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1,2,3,4,5,6,7],ymm9[8],ymm0[9,10,11,12,13,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm25
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1],ymm13[2],ymm12[3],ymm13[4],ymm12[5,6],ymm13[7],ymm12[8,9],ymm13[10],ymm12[11],ymm13[12],ymm12[13,14],ymm13[15]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm0[2,3,0,1]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4],ymm0[5,6],ymm8[7]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5],ymm14[6],ymm15[7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13],ymm14[14],ymm15[15]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm9
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm9[0,1,2],xmm8[3,4],xmm9[5,6,7]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [8,9,2,3,12,13,6,7,4,5,6,7,4,5,14,15,24,25,18,19,28,29,22,23,20,21,22,23,20,21,30,31]
-; AVX512DQ-NEXT: vpshufb %ymm9, %ymm0, %ymm0
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm8 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
-; AVX512DQ-NEXT: vpshufb %xmm8, %xmm11, %xmm11
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0],ymm4[1,2],ymm3[3],ymm4[4],ymm3[5],ymm4[6,7],ymm3[8],ymm4[9,10],ymm3[11],ymm4[12],ymm3[13],ymm4[14,15]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm5
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3,4,5,6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm6[0,1],xmm7[2],xmm6[3]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [8,9,14,15,4,5,6,7,0,1,10,11,4,5,14,15,24,25,30,31,20,21,22,23,16,17,26,27,20,21,30,31]
-; AVX512DQ-NEXT: vpshufb %ymm3, %ymm4, %ymm4
-; AVX512DQ-NEXT: vpshufb %xmm9, %xmm1, %xmm1
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3,4,5,6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm1
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm7, %ymm8
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1,2,3,4,5,6,7],ymm8[8],ymm0[9,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm7, %zmm8
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1],ymm11[2],ymm10[3],ymm11[4],ymm10[5,6],ymm11[7],ymm10[8,9],ymm11[10],ymm10[11],ymm11[12],ymm10[13,14],ymm11[15]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm0[2,3,0,1]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4],ymm0[5,6],ymm7[7]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm15[0],ymm12[1],ymm15[2,3],ymm12[4],ymm15[5],ymm12[6],ymm15[7,8],ymm12[9],ymm15[10,11],ymm12[12],ymm15[13],ymm12[14],ymm15[15]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm9
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm7[3,4],xmm9[5,6,7]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [8,9,2,3,12,13,6,7,4,5,6,7,4,5,14,15,24,25,18,19,28,29,22,23,20,21,22,23,20,21,30,31]
+; AVX512DQ-NEXT: vpshufb %ymm7, %ymm0, %ymm0
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm10 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
+; AVX512DQ-NEXT: vpshufb %xmm10, %xmm9, %xmm9
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3],ymm5[4],ymm6[5],ymm5[6,7],ymm6[8],ymm5[9,10],ymm6[11],ymm5[12],ymm6[13],ymm5[14,15]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3,4,5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm4[2],xmm3[3]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,14,15,4,5,6,7,0,1,10,11,4,5,14,15,24,25,30,31,20,21,22,23,16,17,26,27,20,21,30,31]
+; AVX512DQ-NEXT: vpshufb %ymm4, %ymm5, %ymm5
+; AVX512DQ-NEXT: vpshufb %xmm7, %xmm2, %xmm2
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm5[3,4,5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQ-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm2
; AVX512DQ-NEXT: movb $7, %al
; AVX512DQ-NEXT: kmovw %eax, %k1
-; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k1}
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm10[1],ymm2[2],ymm10[3],ymm2[4,5],ymm10[6],ymm2[7,8],ymm10[9],ymm2[10],ymm10[11],ymm2[12,13],ymm10[14],ymm2[15]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm5
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm5[2,3],xmm0[4,5,6],xmm5[7]
+; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm2 {%k1}
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm5
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0],ymm14[1],ymm13[2],ymm14[3],ymm13[4,5],ymm14[6],ymm13[7,8],ymm14[9],ymm13[10],ymm14[11],ymm13[12,13],ymm14[14],ymm13[15]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7]
; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,2,3,12,13,6,7,0,1,10,11,4,5,14,15]
-; AVX512DQ-NEXT: vpshufb %xmm0, %xmm5, %xmm5
-; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0],ymm5[1,2,3,4,5,6,7],ymm4[8],ymm5[9,10,11,12,13,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1
-; AVX512DQ-NEXT: vmovdqa64 %ymm22, %ymm2
-; AVX512DQ-NEXT: vmovdqa64 %ymm26, %ymm4
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4],ymm2[5],ymm4[6,7],ymm2[8],ymm4[9,10],ymm2[11],ymm4[12],ymm2[13],ymm4[14,15]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm5
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3,4,5,6,7]
-; AVX512DQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3
-; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm2
-; AVX512DQ-NEXT: vmovdqa64 %xmm17, %xmm4
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm2[2],xmm4[3]
-; AVX512DQ-NEXT: vpshufb %xmm9, %xmm4, %xmm4
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm3[3,4,5,6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3
-; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512DQ-NEXT: vpblendw $107, (%rsp), %ymm2, %ymm4 # 32-byte Folded Reload
-; AVX512DQ-NEXT: # ymm4 = mem[0,1],ymm2[2],mem[3],ymm2[4],mem[5,6],ymm2[7],mem[8,9],ymm2[10],mem[11],ymm2[12],mem[13,14],ymm2[15]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6],ymm5[7]
-; AVX512DQ-NEXT: vpshufb %ymm9, %ymm4, %ymm4
-; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512DQ-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload
-; AVX512DQ-NEXT: # ymm5 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5],mem[6],ymm2[7,8],mem[9],ymm2[10,11],mem[12],ymm2[13],mem[14],ymm2[15]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm6
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3,4],xmm6[5,6,7]
-; AVX512DQ-NEXT: vpshufb %xmm8, %xmm5, %xmm5
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7]
-; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm3 {%k1}
-; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm2
-; AVX512DQ-NEXT: vmovdqa64 %ymm31, %ymm4
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7,8],ymm2[9],ymm4[10],ymm2[11],ymm4[12,13],ymm2[14],ymm4[15]
+; AVX512DQ-NEXT: vpshufb %xmm0, %xmm1, %xmm1
+; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1,2,3,4,5,6,7],ymm5[8],ymm1[9,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
+; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm2
+; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm3
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4],ymm3[5],ymm2[6,7],ymm3[8],ymm2[9,10],ymm3[11],ymm2[12],ymm3[13],ymm2[14,15]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3,4,5,6,7]
+; AVX512DQ-NEXT: vpshufb %ymm4, %ymm2, %ymm2
+; AVX512DQ-NEXT: vmovdqa64 %xmm27, %xmm3
+; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm4
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3]
+; AVX512DQ-NEXT: vpshufb %xmm7, %xmm3, %xmm3
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3,4,5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-NEXT: vinserti32x4 $1, %xmm31, %ymm0, %ymm3
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX512DQ-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
+; AVX512DQ-NEXT: # ymm3 = mem[0,1],ymm3[2],mem[3],ymm3[4],mem[5,6],ymm3[7],mem[8,9],ymm3[10],mem[11],ymm3[12],mem[13,14],ymm3[15]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6],ymm4[7]
+; AVX512DQ-NEXT: vpshufb %ymm7, %ymm3, %ymm3
+; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX512DQ-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
+; AVX512DQ-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3],mem[4],ymm4[5],mem[6],ymm4[7,8],mem[9],ymm4[10,11],mem[12],ymm4[13],mem[14],ymm4[15]
; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm5
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5,6],xmm5[7]
-; AVX512DQ-NEXT: vpshufb %xmm0, %xmm4, %xmm0
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm3, %ymm4
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7]
+; AVX512DQ-NEXT: vpshufb %xmm10, %xmm4, %xmm4
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7]
+; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm2 {%k1}
+; AVX512DQ-NEXT: vmovdqa64 %ymm22, %ymm3
+; AVX512DQ-NEXT: vmovdqa64 %ymm23, %ymm4
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6],xmm4[7]
+; AVX512DQ-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm3
; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4,5,6,7],ymm4[8],ymm0[9,10,11,12,13,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0
-; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-NEXT: vmovaps %zmm2, (%rsi)
-; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-NEXT: vmovaps %zmm2, 64(%rsi)
-; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-NEXT: vmovaps %zmm2, 64(%rdx)
-; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-NEXT: vmovaps %zmm2, (%rdx)
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512DQ-NEXT: vmovdqa64 %zmm28, (%rsi)
+; AVX512DQ-NEXT: vmovdqa64 %zmm30, 64(%rsi)
+; AVX512DQ-NEXT: vmovdqa64 %zmm25, 64(%rdx)
+; AVX512DQ-NEXT: vmovdqa64 %zmm24, (%rdx)
+; AVX512DQ-NEXT: vmovdqa64 %zmm21, 64(%rcx)
; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-NEXT: vmovaps %zmm2, 64(%rcx)
-; AVX512DQ-NEXT: vmovdqa64 %zmm23, (%rcx)
-; AVX512DQ-NEXT: vmovdqa64 %zmm25, 64(%r8)
+; AVX512DQ-NEXT: vmovaps %zmm2, (%rcx)
+; AVX512DQ-NEXT: vmovdqa64 %zmm8, 64(%r8)
; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512DQ-NEXT: vmovaps %zmm2, (%r8)
; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%r9)
; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%r9)
-; AVX512DQ-NEXT: addq $552, %rsp # imm = 0x228
+; AVX512DQ-NEXT: addq $344, %rsp # imm = 0x158
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: load_i16_stride5_vf64:
; AVX512DQ-FCP: # %bb.0:
-; AVX512DQ-FCP-NEXT: subq $552, %rsp # imm = 0x228
+; AVX512DQ-FCP-NEXT: subq $648, %rsp # imm = 0x288
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vmovdqa 496(%rdi), %xmm1
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm1
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vmovdqa 480(%rdi), %xmm2
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; AVX512DQ-FCP-NEXT: vmovdqa 512(%rdi), %ymm10
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa 544(%rdi), %ymm11
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa 576(%rdi), %ymm7
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa 608(%rdi), %ymm8
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm8, (%rsp) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa 352(%rdi), %ymm4
-; AVX512DQ-FCP-NEXT: vmovdqa 320(%rdi), %ymm5
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm28
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm30
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3],xmm1[4,5],xmm4[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm4
-; AVX512DQ-FCP-NEXT: vmovdqa 384(%rdi), %ymm6
-; AVX512DQ-FCP-NEXT: vmovdqa 416(%rdi), %ymm9
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm9[1,2],ymm6[3],ymm9[4],ymm6[5],ymm9[6,7],ymm6[8],ymm9[9,10],ymm6[11],ymm9[12],ymm6[13],ymm9[14,15]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm9, %ymm26
+; AVX512DQ-FCP-NEXT: vmovdqa 496(%rdi), %xmm2
+; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm1
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm28
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vmovdqa 480(%rdi), %xmm4
+; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm3
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm29
+; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; AVX512DQ-FCP-NEXT: vmovdqa 512(%rdi), %ymm4
+; AVX512DQ-FCP-NEXT: vmovdqa 544(%rdi), %ymm5
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm17
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm27
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [2,4,7,1,4,6,0,0]
+; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm20, %ymm3
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,8,9,14,15,0,1,6,7,16,17,22,23,20,21,22,23,24,25,30,31,16,17,22,23]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [8,9,3,2,4,5,7,6]
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm1, %ymm14, %ymm3
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,3,1,3,0,3,5,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %ymm21
+; AVX512DQ-FCP-NEXT: vpermd %ymm21, %ymm15, %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa 352(%rdi), %ymm10
+; AVX512DQ-FCP-NEXT: vmovdqa 320(%rdi), %ymm9
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5],ymm10[6],ymm9[7,8],ymm10[9],ymm9[10,11],ymm10[12],ymm9[13],ymm10[14],ymm9[15]
; AVX512DQ-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm27
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,3,0,2,4,6,1,3]
-; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm6, %ymm5
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,20,21,128,128,128,128,128,128]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm5, %ymm5
-; AVX512DQ-FCP-NEXT: vpor %ymm5, %ymm4, %ymm4
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0,1],ymm7[2],ymm8[3],ymm7[4],ymm8[5,6],ymm7[7],ymm8[8,9],ymm7[10],ymm8[11],ymm7[12],ymm8[13,14],ymm7[15]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm7
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3,4],xmm5[5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [2,4,7,1,4,6,0,0]
-; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm10, %ymm7
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,8,9,14,15,0,1,6,7,16,17,22,23,20,21,22,23,24,25,30,31,16,17,22,23]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm7, %ymm7
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [8,9,3,2,4,5,7,6]
-; AVX512DQ-FCP-NEXT: vpermt2d %ymm2, %ymm17, %ymm7
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [12,13,14,15,4,5,14,15,8,9,2,3,12,13,6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm5
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6,7]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [0,3,1,3,0,3,5,7]
-; AVX512DQ-FCP-NEXT: vmovdqa 448(%rdi), %ymm7
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4,5],xmm3[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqa 384(%rdi), %ymm7
+; AVX512DQ-FCP-NEXT: vmovdqa 416(%rdi), %ymm8
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm8[1,2],ymm7[3],ymm8[4],ymm7[5],ymm8[6,7],ymm7[8],ymm8[9,10],ymm7[11],ymm8[12],ymm7[13],ymm8[14,15]
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm18, %ymm7
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm7, %ymm7
-; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm16 & (zmm7 ^ zmm4))
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm4
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa 176(%rdi), %xmm4
-; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm31
-; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %xmm15
-; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm15, %xmm3
-; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
-; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm11
-; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm7
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm11[0],ymm7[1,2],ymm11[3],ymm7[4],ymm11[5],ymm7[6,7],ymm11[8],ymm7[9,10],ymm11[11],ymm7[12],ymm11[13],ymm7[14,15]
-; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm6, %ymm3
-; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm3, %ymm3
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm12
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm6
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm12[0],ymm6[1],ymm12[2,3],ymm6[4],ymm12[5],ymm6[6],ymm12[7,8],ymm6[9],ymm12[10,11],ymm6[12],ymm12[13],ymm6[14],ymm12[15]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4,5],xmm5[6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm1
-; AVX512DQ-FCP-NEXT: vpor %ymm3, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm8
-; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %ymm13
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm13[0],ymm8[1],ymm13[2,3],ymm8[4],ymm13[5],ymm8[6],ymm13[7,8],ymm8[9],ymm13[10,11],ymm8[12],ymm13[13],ymm8[14],ymm13[15]
-; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm10, %ymm3
-; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm3
-; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %ymm5
-; AVX512DQ-FCP-NEXT: vmovdqa 288(%rdi), %ymm9
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm9[0,1],ymm5[2],ymm9[3],ymm5[4],ymm9[5,6],ymm5[7],ymm9[8,9],ymm5[10],ymm9[11],ymm5[12],ymm9[13,14],ymm5[15]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm25
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm10
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm10[3,4],xmm4[5,6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm2
-; AVX512DQ-FCP-NEXT: vpermt2d %ymm0, %ymm17, %ymm3
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %ymm23
-; AVX512DQ-FCP-NEXT: vpermd %ymm23, %ymm18, %ymm2
-; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm16 & (zmm2 ^ zmm1))
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [1,3,0,2,4,6,1,3]
+; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm16, %ymm11
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,20,21,128,128,128,128,128,128]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm11, %ymm11
+; AVX512DQ-FCP-NEXT: vpor %ymm1, %ymm11, %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa 176(%rdi), %xmm1
+; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm18
+; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %xmm12
+; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm12, %xmm2
+; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm13
+; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %ymm11
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm11[0],ymm13[1],ymm11[2,3],ymm13[4],ymm11[5],ymm13[6],ymm11[7,8],ymm13[9],ymm11[10,11],ymm13[12],ymm11[13],ymm13[14],ymm11[15]
+; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm20, %ymm2
+; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm0, %ymm14, %ymm2
+; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %ymm20
+; AVX512DQ-FCP-NEXT: vpermd %ymm20, %ymm15, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm30
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4,5],ymm12[6],ymm6[7,8],ymm12[9],ymm6[10],ymm12[11],ymm6[12,13],ymm12[14],ymm6[15]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm12, %ymm21
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm29
+; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm6
+; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm14
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm14[1,2],ymm6[3],ymm14[4],ymm6[5],ymm14[6,7],ymm6[8],ymm14[9,10],ymm6[11],ymm14[12],ymm6[13],ymm14[14,15]
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [2,0,0,0,4,7,1,6]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm11[1],ymm7[2,3],ymm11[4],ymm7[5],ymm11[6],ymm7[7,8],ymm11[9],ymm7[10,11],ymm11[12],ymm7[13],ymm11[14],ymm7[15]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm11, %ymm19
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm7, %ymm22
-; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm17, %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,20,21,26,27,16,17,30,31,128,128,128,128,128,128]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vpor %ymm1, %ymm0, %ymm10
-; AVX512DQ-FCP-NEXT: vpsrlq $48, %xmm31, %xmm0
-; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm15, %xmm1
-; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [0,2,5,7,4,7,0,0]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm13[1],ymm8[2,3],ymm13[4],ymm8[5],ymm13[6],ymm8[7,8],ymm13[9],ymm8[10,11],ymm13[12],ymm8[13],ymm13[14],ymm8[15]
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, %ymm5
-; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm20, %ymm4
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,4,5,4,5,0,1,6,7,8,9,14,15,4,5,18,19,20,21,20,21,16,17,22,23,24,25,30,31,20,21]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm4, %ymm4
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm24 = [1,4,6,3,1,4,6,3]
-; AVX512DQ-FCP-NEXT: # ymm24 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm12
-; AVX512DQ-FCP-NEXT: vmovdqu64 %ymm25, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0],ymm9[1,2],ymm12[3],ymm9[4],ymm12[5],ymm9[6,7],ymm12[8],ymm9[9,10],ymm12[11],ymm9[12],ymm12[13],ymm9[14,15]
-; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm24, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,2,3,4,5,10,11,0,1,14,15,8,9,16,17,18,19,18,19,20,21,26,27,16,17,30,31,24,25]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm25 = [1,3,2,3,1,3,6,7]
-; AVX512DQ-FCP-NEXT: vpermd %ymm23, %ymm25, %ymm4
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm4, %ymm4
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm16 & (zmm4 ^ zmm10))
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm10
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm0
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm18
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm28
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm4
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4,5],ymm0[6],ymm4[7,8],ymm0[9],ymm4[10],ymm0[11],ymm4[12,13],ymm0[14],ymm4[15]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5,6],xmm4[7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm7
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm4
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm7[1],ymm4[2,3],ymm7[4],ymm4[5],ymm7[6],ymm4[7,8],ymm7[9],ymm4[10,11],ymm7[12],ymm4[13],ymm7[14],ymm4[15]
-; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm17, %ymm4
-; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm4
-; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vpor %ymm4, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm4
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0],ymm4[1],ymm11[2,3],ymm4[4],ymm11[5],ymm4[6],ymm11[7,8],ymm4[9],ymm11[10,11],ymm4[12],ymm11[13],ymm4[14],ymm11[15]
-; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm20, %ymm4
-; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm4, %ymm3
-; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm6, %xmm2
-; AVX512DQ-FCP-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm27 # 16-byte Reload
-; AVX512DQ-FCP-NEXT: vpsrlq $48, %xmm27, %xmm4
-; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqu (%rsp), %ymm8 # 32-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm8[1,2],ymm4[3],ymm8[4],ymm4[5],ymm8[6,7],ymm4[8],ymm8[9,10],ymm4[11],ymm8[12],ymm4[13],ymm8[14,15]
-; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm24, %ymm3
-; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm1
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Reload
-; AVX512DQ-FCP-NEXT: vpermd %ymm17, %ymm25, %ymm2
-; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm10 & (zmm2 ^ zmm0))
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0
+; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm16, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm5
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm5[1],ymm1[2,3],ymm5[4],ymm1[5],ymm5[6],ymm1[7,8],ymm5[9],ymm1[10,11],ymm5[12],ymm1[13],ymm5[14],ymm1[15]
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4,5],xmm4[6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm12[1],ymm9[2,3],ymm12[4],ymm9[5],ymm12[6],ymm9[7,8],ymm12[9],ymm9[10,11],ymm12[12],ymm9[13],ymm12[14],ymm9[15]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm1[1],ymm5[2],ymm1[3],ymm5[4,5],ymm1[6],ymm5[7,8],ymm1[9],ymm5[10],ymm1[11],ymm5[12,13],ymm1[14],ymm5[15]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6],xmm2[7]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [2,0,0,0,4,7,1,6]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0],ymm6[1],ymm14[2,3],ymm6[4],ymm14[5],ymm6[6],ymm14[7,8],ymm6[9],ymm14[10,11],ymm6[12],ymm14[13],ymm6[14],ymm14[15]
+; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm16, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,20,21,26,27,16,17,30,31,128,128,128,128,128,128]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm4
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT: vporq %ymm4, %ymm2, %ymm22
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm18, %xmm6
+; AVX512DQ-FCP-NEXT: vpsrlq $48, %xmm18, %xmm2
+; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm12, %xmm4
+; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm25 = [0,2,5,7,4,7,0,0]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm13[0],ymm11[1],ymm13[2,3],ymm11[4],ymm13[5],ymm11[6],ymm13[7,8],ymm11[9],ymm13[10,11],ymm11[12],ymm13[13],ymm11[14],ymm13[15]
+; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm25, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,4,5,4,5,0,1,6,7,8,9,14,15,4,5,18,19,20,21,20,21,16,17,22,23,24,25,30,31,20,21]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm26 = [1,3,2,3,1,3,6,7]
+; AVX512DQ-FCP-NEXT: vpermd %ymm20, %ymm26, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm24
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm9[1],ymm10[2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10],ymm9[11],ymm10[12,13],ymm9[14],ymm10[15]
; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm15[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm15, %xmm20
-; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm31[2],xmm0[3],xmm31[3]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm24 = [0,3,5,2,5,7,0,0]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm13[0],ymm5[1],ymm13[2],ymm5[3],ymm13[4,5],ymm5[6],ymm13[7,8],ymm5[9],ymm13[10],ymm5[11],ymm13[12,13],ymm5[14],ymm13[15]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm30
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm13, %ymm26
-; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm24, %ymm3
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,6,7,2,3,2,3,4,5,10,11,0,1,14,15,16,17,22,23,18,19,18,19,20,21,26,27,16,17,30,31]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm3, %ymm3
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,8,9,2,3,12,13,6,7,0,1,10,11]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2,3,4],ymm1[5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm1
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0,1,2],xmm3[3,4],xmm1[5,6,7]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm25 = [0,2,0,0,5,7,2,4]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm1
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm29
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm21
-; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm25, %ymm3
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,6,7,8,9,14,15,4,5,6,7,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm5
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10,11],ymm7[12],ymm8[13],ymm7[14],ymm8[15]
+; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm16, %ymm1
+; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vporq %ymm1, %ymm0, %ymm18
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm8
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm14
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0],ymm8[1],ymm14[2,3],ymm8[4],ymm14[5],ymm8[6],ymm14[7,8],ymm8[9],ymm14[10,11],ymm8[12],ymm14[13],ymm8[14],ymm14[15]
+; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm25, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm29, %xmm10
+; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm10, %xmm1
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm28, %xmm9
+; AVX512DQ-FCP-NEXT: vpsrlq $48, %xmm28, %xmm3
+; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpermd %ymm21, %ymm26, %ymm1
+; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm25
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm0 = [4,5,14,15,4,5,14,15,4,5,14,15,4,5,14,15]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm1
+; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3]
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, %xmm7
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [0,3,5,2,5,7,0,0]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm11[0],ymm13[1],ymm11[2],ymm13[3],ymm11[4,5],ymm13[6],ymm11[7,8],ymm13[9],ymm11[10],ymm13[11],ymm11[12,13],ymm13[14],ymm11[15]
+; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm16, %ymm5
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,2,3,2,3,4,5,10,11,0,1,14,15,16,17,22,23,18,19,18,19,20,21,26,27,16,17,30,31]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm5, %ymm5
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,4,6,0,1,4,6,0]
; AVX512DQ-FCP-NEXT: # ymm5 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT: vpermd %ymm23, %ymm5, %ymm14
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm14, %ymm14
-; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm15 & (zmm14 ^ zmm0))
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm14, %zmm0
+; AVX512DQ-FCP-NEXT: vpermd %ymm20, %ymm5, %ymm2
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm4
+; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm6
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm14[1],ymm8[2],ymm14[3],ymm8[4,5],ymm14[6],ymm8[7,8],ymm14[9],ymm8[10],ymm14[11],ymm8[12,13],ymm14[14],ymm8[15]
+; AVX512DQ-FCP-NEXT: vmovdqu64 %ymm27, (%rsp) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqu64 %ymm17, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm16, %ymm1
+; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm10, %xmm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm28[2],xmm0[3],xmm28[3]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpermd %ymm21, %ymm5, %ymm1
+; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm4[1],ymm8[2,3],ymm4[4],ymm8[5],ymm4[6],ymm8[7,8],ymm4[9],ymm8[10,11],ymm4[12],ymm8[13],ymm4[14],ymm8[15]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm10
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3,4],xmm10[5,6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm8
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm11[1],ymm8[2],ymm11[3],ymm8[4,5],ymm11[6],ymm8[7,8],ymm11[9],ymm8[10],ymm11[11],ymm8[12,13],ymm11[14],ymm8[15]
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm11, %ymm13
-; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm24, %ymm2
-; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, %xmm14
-; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm4[2],xmm27[2],xmm4[3],xmm27[3]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm6
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm2[3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm11
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm12
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0,1],ymm11[2],ymm12[3],ymm11[4],ymm12[5,6],ymm11[7],ymm12[8,9],ymm11[10],ymm12[11],ymm11[12],ymm12[13,14],ymm11[15]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3,4],xmm2[5,6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm3 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm3 = ymm7[0],mem[1],ymm7[2,3],mem[4],ymm7[5],mem[6],ymm7[7,8],mem[9],ymm7[10,11],mem[12],ymm7[13],mem[14],ymm7[15]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm7, %ymm16
-; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm25, %ymm3
-; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm1
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpermd %ymm17, %ymm5, %ymm2
-; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm15 & (zmm2 ^ zmm1))
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm28
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm15
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm7
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm15[0],xmm7[1],xmm15[2,3]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0],xmm12[1],xmm7[2,3]
; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm0 = [6,7,0,1,10,11,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm2
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [1,3,6,0,5,0,0,0]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm9
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm10
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0,1],ymm9[2],ymm10[3],ymm9[4],ymm10[5,6],ymm9[7],ymm10[8,9],ymm9[10],ymm10[11],ymm9[12],ymm10[13,14],ymm9[15]
-; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm18, %ymm4
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,2,3,4,5,0,1,6,7,8,9,14,15,4,5,18,19,18,19,20,21,16,17,22,23,24,25,30,31,20,21]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm4
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,3,6,0,5,0,0,0]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm11[0,1],ymm13[2],ymm11[3],ymm13[4],ymm11[5,6],ymm13[7],ymm11[8,9],ymm13[10],ymm11[11],ymm13[12],ymm11[13,14],ymm13[15]
+; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm2, %ymm4
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,2,3,4,5,0,1,6,7,8,9,14,15,4,5,18,19,18,19,20,21,16,17,22,23,24,25,30,31,20,21]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm4, %ymm4
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [2,4,7,0,2,4,7,0]
; AVX512DQ-FCP-NEXT: # ymm4 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT: vpermd %ymm23, %ymm4, %ymm5
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm5, %ymm5
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm25
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0],xmm14[1],xmm6[2,3]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm22
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm14, %xmm31
-; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm0
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1],ymm13[2],ymm8[3],ymm13[4],ymm8[5,6],ymm13[7],ymm8[8,9],ymm13[10],ymm8[11],ymm13[12],ymm8[13,14],ymm13[15]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm13, %ymm20
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm8, %ymm30
-; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm18, %ymm2
-; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1
+; AVX512DQ-FCP-NEXT: vpermd %ymm20, %ymm4, %ymm5
+; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm5, %ymm5
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm26
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0],xmm10[1],xmm9[2,3]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0,1],ymm14[2],ymm8[3],ymm14[4],ymm8[5,6],ymm14[7],ymm8[8,9],ymm14[10],ymm8[11],ymm14[12],ymm8[13,14],ymm14[15]
+; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5,6,7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpermd %ymm17, %ymm4, %ymm1
-; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm24
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1],xmm7[2],xmm15[3]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0],ymm10[1,2],ymm9[3],ymm10[4],ymm9[5],ymm10[6,7],ymm9[8],ymm10[9,10],ymm9[11],ymm10[12],ymm9[13],ymm10[14,15]
-; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm2 = [8,9,2,3,12,13,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm3
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm27 = [1,4,6,3,6,0,0,0]
-; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm27, %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,2,3,4,5,2,3,4,5,10,11,0,1,14,15,16,17,18,19,20,21,18,19,20,21,26,27,16,17,30,31]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm1[3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm26 = [0,2,1,3,0,2,5,7]
-; AVX512DQ-FCP-NEXT: vpermd %ymm23, %ymm26, %ymm3
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm19
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm14[1,2],ymm4[3],ymm14[4],ymm4[5],ymm14[6,7],ymm4[8],ymm14[9,10],ymm4[11],ymm14[12],ymm4[13],ymm14[14,15]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm18
+; AVX512DQ-FCP-NEXT: vpermd %ymm21, %ymm4, %ymm1
+; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm30
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0,1],xmm12[2],xmm7[3]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0],ymm11[1,2],ymm13[3],ymm11[4],ymm13[5],ymm11[6,7],ymm13[8],ymm11[9,10],ymm13[11],ymm11[12],ymm13[13],ymm11[14,15]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,4,6,3,6,0,0,0]
+; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,2,3,4,5,10,11,0,1,14,15,16,17,18,19,20,21,18,19,20,21,26,27,16,17,30,31]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,1,3,0,2,5,7]
+; AVX512DQ-FCP-NEXT: vpermd %ymm20, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm29
+; AVX512DQ-FCP-NEXT: vmovdqa 576(%rdi), %ymm13
+; AVX512DQ-FCP-NEXT: vmovdqa 608(%rdi), %ymm2
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm13[2],ymm2[3],ymm13[4],ymm2[5,6],ymm13[7],ymm2[8,9],ymm13[10],ymm2[11],ymm13[12],ymm2[13,14],ymm13[15]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %ymm12
+; AVX512DQ-FCP-NEXT: vmovdqa 288(%rdi), %ymm11
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm11[0,1],ymm12[2],ymm11[3],ymm12[4],ymm11[5,6],ymm12[7],ymm11[8,9],ymm12[10],ymm11[11],ymm12[12],ymm11[13,14],ymm12[15]
; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,3,0,0,5,0,2,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm15
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm10
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0],ymm15[1],ymm10[2],ymm15[3],ymm10[4,5],ymm15[6],ymm10[7,8],ymm15[9],ymm10[10],ymm15[11],ymm10[12,13],ymm15[14],ymm10[15]
-; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm4, %ymm5
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [2,3,4,5,10,11,0,1,14,15,14,15,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm5
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} zmm5 = [18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0,18446744073709486080,18446744073709551615]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm25 = zmm25 ^ (zmm5 & (zmm25 ^ zmm3))
-; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX512DQ-FCP-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5],mem[6],ymm3[7,8],mem[9],ymm3[10,11],mem[12],ymm3[13],mem[14],ymm3[15]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm8
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm3[0],xmm8[1,2,3],xmm3[4,5],xmm8[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,0,1,10,11,4,5,14,15,8,9,2,3,12,13]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm8
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512DQ-FCP-NEXT: vextracti64x4 $1, %zmm25, %ymm9
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3,4,5,6,7],ymm9[8],ymm8[9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm25, %zmm21
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0],ymm12[1,2],ymm11[3],ymm12[4],ymm11[5],ymm12[6,7],ymm11[8],ymm12[9,10],ymm11[11],ymm12[12],ymm11[13],ymm12[14,15]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm12, %ymm23
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm11, %ymm25
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2],xmm8[3]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm8, %xmm7
-; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm12
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm13[0],ymm12[1],ymm13[2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7,8],ymm12[9],ymm13[10],ymm12[11],ymm13[12,13],ymm12[14],ymm13[15]
-; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm4, %ymm4
-; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm4
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm24 = zmm24 ^ (zmm5 & (zmm24 ^ zmm4))
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm4
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm5
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm2
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm4
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm5
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15]
-; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm27, %ymm4
-; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm0
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpermd %ymm17, %ymm26, %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm4
-; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
-; AVX512DQ-FCP-NEXT: vmovdqu (%rsp), %ymm9 # 32-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm11[0],ymm9[1],ymm11[2,3],ymm9[4],ymm11[5],ymm9[6],ymm11[7,8],ymm9[9],ymm11[10,11],ymm9[12],ymm11[13],ymm9[14],ymm11[15]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4,5],xmm4[6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT: vextracti64x4 $1, %zmm24, %ymm3
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6,7],ymm3[8],ymm2[9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm24, %zmm2
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm10[0,1],ymm15[2],ymm10[3],ymm15[4],ymm10[5,6],ymm15[7],ymm10[8,9],ymm15[10],ymm10[11],ymm15[12],ymm10[13,14],ymm15[15]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,3,0,0,6,0,3,5]
-; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm4, %ymm3
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,6,7,8,9,14,15,0,1,6,7,0,1,6,7,16,17,22,23,24,25,30,31,16,17,22,23,16,17,22,23]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm3
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm6
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm14[0],ymm6[1],ymm14[2,3],ymm6[4],ymm14[5],ymm6[6],ymm14[7,8],ymm6[9],ymm14[10,11],ymm6[12],ymm14[13],ymm6[14],ymm14[15]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3,4],xmm7[5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm6
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: movb $7, %al
-; AVX512DQ-FCP-NEXT: kmovw %eax, %k1
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm1 {%k1}
-; AVX512DQ-FCP-NEXT: vextracti64x4 $1, %zmm1, %ymm3
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3,4],xmm3[5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [12,13,14,15,4,5,14,15,8,9,2,3,12,13,6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm0
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm0
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm27 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT: vpandnq %zmm24, %zmm27, %zmm17
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm22 = (zmm22 & zmm27) | zmm17
+; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,4,6,3,1,4,6,3]
+; AVX512DQ-FCP-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm12[0],ymm11[1,2],ymm12[3],ymm11[4],ymm12[5],ymm11[6,7],ymm12[8],ymm11[9,10],ymm12[11],ymm11[12],ymm12[13],ymm11[14,15]
+; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm5, %ymm7
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,2,3,2,3,4,5,10,11,0,1,14,15,8,9,16,17,18,19,18,19,20,21,26,27,16,17,30,31,24,25]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm7, %ymm7
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm28 = [8,9,10,11,12,21,22,23]
+; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm28, %zmm17
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm13[0],ymm2[1,2],ymm13[3],ymm2[4],ymm13[5],ymm2[6,7],ymm13[8],ymm2[9,10],ymm13[11],ymm2[12],ymm13[13],ymm2[14,15]
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, %ymm3
+; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm5, %ymm5
+; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm0
+; AVX512DQ-FCP-NEXT: vpandnq %zmm25, %zmm27, %zmm24
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm18 = (zmm18 & zmm27) | zmm24
+; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm28, %zmm24
+; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1],ymm10[2],ymm9[3],ymm10[4],ymm9[5,6],ymm10[7],ymm9[8,9],ymm10[10],ymm9[11],ymm10[12],ymm9[13,14],ymm10[15]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm5
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[3,4],xmm0[5,6,7]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [0,2,0,0,5,7,2,4]
+; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5],ymm14[6],ymm15[7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13],ymm14[14],ymm15[15]
+; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm16, %ymm7
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,6,7,8,9,14,15,4,5,6,7,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm7, %ymm7
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm19
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm7[3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5],ymm12[6],ymm11[7,8],ymm12[9],ymm11[10,11],ymm12[12],ymm11[13],ymm12[14],ymm11[15]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4],xmm8[5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,8,9,2,3,12,13,6,7,0,1,10,11]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm7
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512DQ-FCP-NEXT: vpandnq %zmm6, %zmm27, %zmm25
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = (zmm0 & zmm27) | zmm25
+; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm28, %zmm25
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm25, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX512DQ-FCP-NEXT: vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm6 = mem[0],ymm6[1],mem[2],ymm6[3],mem[4,5],ymm6[6],mem[7,8],ymm6[9],mem[10],ymm6[11],mem[12,13],ymm6[14],mem[15]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm8
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3],xmm6[4,5,6],xmm8[7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [6,7,2,3,12,13,6,7,0,1,10,11,4,5,14,15]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm6, %xmm6
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0],ymm6[1,2,3,4,5,6,7],ymm3[8],ymm6[9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm3
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm6
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3],ymm6[4],ymm3[5],ymm6[6],ymm3[7,8],ymm6[9],ymm3[10,11],ymm6[12],ymm3[13],ymm6[14],ymm3[15]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm6
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3,4],xmm6[5,6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm13[0,1],ymm12[2],ymm13[3],ymm12[4],ymm13[5,6],ymm12[7],ymm13[8,9],ymm12[10],ymm13[11],ymm12[12],ymm13[13,14],ymm12[15]
-; AVX512DQ-FCP-NEXT: vpermd %ymm6, %ymm4, %ymm4
-; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm4
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 {%k1}
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0],ymm11[1],ymm9[2],ymm11[3],ymm9[4,5],ymm11[6],ymm9[7,8],ymm11[9],ymm9[10],ymm11[11],ymm9[12,13],ymm11[14],ymm9[15]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6],xmm4[7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT: vextracti64x4 $1, %zmm0, %ymm4
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4,5,6,7],ymm4[8],ymm3[9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1],ymm6[2],ymm5[3],ymm6[4],ymm5[5,6],ymm6[7],ymm5[8,9],ymm6[10],ymm5[11],ymm6[12],ymm5[13,14],ymm6[15]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm7
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm7[3,4],xmm0[5,6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5],ymm4[6],ymm2[7,8],ymm4[9],ymm2[10,11],ymm4[12],ymm2[13],ymm4[14],ymm2[15]
+; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm16, %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm7
+; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm13[1],ymm3[2,3],ymm13[4],ymm3[5],ymm13[6],ymm3[7,8],ymm13[9],ymm3[10,11],ymm13[12],ymm3[13],ymm13[14],ymm3[15]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm19
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm13, %ymm16
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4],xmm3[5,6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-FCP-NEXT: vpandnq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = (zmm0 & zmm27) | zmm3
+; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm28, %zmm3
; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,3,0,0,5,0,2,7]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0],ymm15[1],ymm14[2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7,8],ymm15[9],ymm14[10],ymm15[11],ymm14[12,13],ymm15[14],ymm14[15]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm14, %ymm20
+; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,4,5,10,11,0,1,14,15,14,15,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm13 = [4,1,6,3]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm10[0],ymm9[1,2],ymm10[3],ymm9[4],ymm10[5],ymm9[6,7],ymm10[8],ymm9[9,10],ymm10[11],ymm9[12],ymm10[13],ymm9[14,15]
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm14
+; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm13, %ymm7
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm7
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} zmm7 = [18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0,18446744073709486080,18446744073709551615]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm26 = zmm26 ^ (zmm7 & (zmm26 ^ zmm1))
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0],ymm5[1,2],ymm6[3],ymm5[4],ymm6[5],ymm5[6,7],ymm6[8],ymm5[9,10],ymm6[11],ymm5[12],ymm6[13],ymm5[14,15]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm23
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm25
+; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm13, %ymm1
+; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7,8],ymm2[9],ymm4[10],ymm2[11],ymm4[12,13],ymm2[14],ymm4[15]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm31
+; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm30 = zmm30 ^ (zmm7 & (zmm30 ^ zmm0))
+; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512DQ-FCP-NEXT: vpblendd $11, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # xmm0 = mem[0,1],xmm0[2],mem[3]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vpblendw $214, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # ymm1 = ymm1[0],mem[1,2],ymm1[3],mem[4],ymm1[5],mem[6,7],ymm1[8],mem[9,10],ymm1[11],mem[12],ymm1[13],mem[14,15]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,4,6,3,6,0,0,0]
+; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm1
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,2,3,4,5,10,11,0,1,14,15,16,17,18,19,20,21,18,19,20,21,26,27,16,17,30,31]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,1,3,0,2,5,7]
+; AVX512DQ-FCP-NEXT: vpermd %ymm21, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm8
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5],ymm11[6],ymm12[7,8],ymm11[9],ymm12[10,11],ymm11[12],ymm12[13],ymm11[14],ymm12[15]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4,5],xmm1[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,0,1,10,11,4,5,14,15,8,9,2,3,12,13]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vextracti64x4 $1, %zmm26, %ymm3
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm26, %zmm21
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm13
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm7
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0],ymm7[1],ymm13[2,3],ymm7[4],ymm13[5],ymm7[6],ymm13[7,8],ymm7[9],ymm13[10,11],ymm7[12],ymm13[13],ymm7[14],ymm13[15]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4,5],xmm3[6,7]
+; AVX512DQ-FCP-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT: vextracti64x4 $1, %zmm30, %ymm1
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm30, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm1
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm15[2],ymm1[3],ymm15[4],ymm1[5,6],ymm15[7],ymm1[8,9],ymm15[10],ymm1[11],ymm15[12],ymm1[13,14],ymm15[15]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [1,3,0,0,6,0,3,5]
+; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm16, %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,6,7,8,9,14,15,0,1,6,7,0,1,6,7,16,17,22,23,24,25,30,31,16,17,22,23,16,17,22,23]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm9[0],ymm14[1],ymm9[2,3],ymm14[4],ymm9[5],ymm14[6],ymm9[7,8],ymm14[9],ymm9[10,11],ymm14[12],ymm9[13],ymm14[14],ymm9[15]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3,4],xmm15[5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm14, %xmm14
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2],ymm1[3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: movb $7, %al
+; AVX512DQ-FCP-NEXT: kmovw %eax, %k1
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm29 {%k1}
+; AVX512DQ-FCP-NEXT: vextracti64x4 $1, %zmm29, %ymm1
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm11[0],ymm12[1],ymm11[2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7,8],ymm12[9],ymm11[10],ymm12[11],ymm11[12,13],ymm12[14],ymm11[15]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm6
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm6[2,3],xmm2[4,5,6],xmm6[7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [6,7,2,3,12,13,6,7,0,1,10,11,4,5,14,15]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm29, %zmm1
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm9
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm2
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0],ymm2[1],ymm9[2,3],ymm2[4],ymm9[5],ymm2[6],ymm9[7,8],ymm2[9],ymm9[10,11],ymm2[12],ymm9[13],ymm2[14],ymm9[15]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm14
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3,4],xmm14[5,6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm9
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm4[0,1],ymm9[2],ymm4[3],ymm9[4],ymm4[5,6],ymm9[7],ymm4[8,9],ymm9[10],ymm4[11],ymm9[12],ymm4[13,14],ymm9[15]
+; AVX512DQ-FCP-NEXT: vpermd %ymm14, %ymm16, %ymm9
+; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm9
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm9[3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm8 {%k1}
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0],ymm13[1],ymm7[2],ymm13[3],ymm7[4,5],ymm13[6],ymm7[7,8],ymm13[9],ymm7[10],ymm13[11],ymm7[12,13],ymm13[14],ymm7[15]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm9
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm9[2,3],xmm2[4,5,6],xmm9[7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT: vextracti64x4 $1, %zmm8, %ymm6
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm2[1,2,3,4,5,6,7],ymm6[8],ymm2[9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm8, %zmm2
+; AVX512DQ-FCP-NEXT: vpandnq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm6 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = (zmm4 & zmm27) | zmm6
+; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm28, %zmm6
+; AVX512DQ-FCP-NEXT: vpandnq {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm3 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = (zmm8 & zmm27) | zmm3
+; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm28, %zmm3
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm5
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm8, %zmm3
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm17, %zmm22, %zmm4
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm24, %zmm18, %zmm6
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%rsi)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 64(%rsi)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 64(%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%rdx)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovaps %zmm3, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovaps %zmm3, 64(%rsi)
-; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovaps %zmm3, 64(%rdx)
-; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovaps %zmm3, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, 64(%rcx)
+; AVX512DQ-FCP-NEXT: vmovaps %zmm3, 64(%rcx)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm3, (%rcx)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 64(%r8)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 64(%r8)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, (%r8)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 64(%r9)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 64(%r9)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%r9)
-; AVX512DQ-FCP-NEXT: addq $552, %rsp # imm = 0x228
+; AVX512DQ-FCP-NEXT: addq $648, %rsp # imm = 0x288
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll
index feb75b21d5c8d..5e75ffa465b61 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll
@@ -483,28 +483,29 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX2-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpsrld $16, %xmm1, %xmm4
-; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,1,0,3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpsrld $16, %xmm1, %xmm3
+; AVX2-NEXT: vmovdqa (%rdi), %xmm4
+; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,1,0,3]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7]
-; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm2[3],xmm4[4,5,6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0,1],xmm2[2,3]
+; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm5[2],xmm3[2],xmm5[3],xmm3[3]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3],xmm3[4,5,6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm4[0,1],xmm2[2,3]
; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0],xmm5[1,2],xmm1[3]
; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vmovq %xmm3, (%rsi)
-; AVX2-NEXT: vmovq %xmm4, (%rdx)
+; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vmovq %xmm0, (%rsi)
+; AVX2-NEXT: vmovq %xmm3, (%rdx)
; AVX2-NEXT: vmovq %xmm6, (%rcx)
; AVX2-NEXT: vmovq %xmm5, (%r8)
-; AVX2-NEXT: vmovq %xmm1, (%r9)
-; AVX2-NEXT: vmovq %xmm0, (%rax)
+; AVX2-NEXT: vmovq %xmm2, (%r9)
+; AVX2-NEXT: vmovq %xmm1, (%rax)
; AVX2-NEXT: retq
;
; AVX2-FP-LABEL: load_i16_stride6_vf4:
@@ -513,27 +514,28 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpsrld $16, %xmm1, %xmm4
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15]
-; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm2[3],xmm4[4,5,6,7]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0,1],xmm2[2,3]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vpsrld $16, %xmm1, %xmm3
+; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm4
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15]
+; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm5[2],xmm3[2],xmm5[3],xmm3[3]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3],xmm3[4,5,6,7]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm4[0,1],xmm2[2,3]
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0],xmm5[1,2],xmm1[3]
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vmovq %xmm3, (%rsi)
-; AVX2-FP-NEXT: vmovq %xmm4, (%rdx)
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vmovq %xmm0, (%rsi)
+; AVX2-FP-NEXT: vmovq %xmm3, (%rdx)
; AVX2-FP-NEXT: vmovq %xmm6, (%rcx)
; AVX2-FP-NEXT: vmovq %xmm5, (%r8)
-; AVX2-FP-NEXT: vmovq %xmm1, (%r9)
-; AVX2-FP-NEXT: vmovq %xmm0, (%rax)
+; AVX2-FP-NEXT: vmovq %xmm2, (%r9)
+; AVX2-FP-NEXT: vmovq %xmm1, (%rax)
; AVX2-FP-NEXT: retq
;
; AVX2-FCP-LABEL: load_i16_stride6_vf4:
@@ -542,152 +544,143 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpsrld $16, %xmm1, %xmm4
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15]
-; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm2[3],xmm4[4,5,6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0,1],xmm2[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpsrld $16, %xmm1, %xmm3
+; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm4
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15]
+; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm5[2],xmm3[2],xmm5[3],xmm3[3]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3],xmm3[4,5,6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm4[0,1],xmm2[2,3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0],xmm5[1,2],xmm1[3]
; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vmovq %xmm3, (%rsi)
-; AVX2-FCP-NEXT: vmovq %xmm4, (%rdx)
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vmovq %xmm0, (%rsi)
+; AVX2-FCP-NEXT: vmovq %xmm3, (%rdx)
; AVX2-FCP-NEXT: vmovq %xmm6, (%rcx)
; AVX2-FCP-NEXT: vmovq %xmm5, (%r8)
-; AVX2-FCP-NEXT: vmovq %xmm1, (%r9)
-; AVX2-FCP-NEXT: vmovq %xmm0, (%rax)
+; AVX2-FCP-NEXT: vmovq %xmm2, (%r9)
+; AVX2-FCP-NEXT: vmovq %xmm1, (%rax)
; AVX2-FCP-NEXT: retq
;
; AVX512-LABEL: load_i16_stride6_vf4:
; AVX512: # %bb.0:
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpsrld $16, %xmm1, %xmm1
-; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7]
-; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
-; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7]
-; AVX512-NEXT: vmovdqa64 (%rdi), %zmm2
-; AVX512-NEXT: vpermd %zmm2, %zmm1, %zmm1
-; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,9,6,3]
+; AVX512-NEXT: vmovdqa64 (%rdi), %zmm1
+; AVX512-NEXT: vpermd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpsrld $16, 16(%rdi), %xmm2
+; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = mem[0,1,0,3]
+; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7]
+; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],mem[3],xmm2[4,5,6,7]
+; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm3 = [4,1,10,7]
+; AVX512-NEXT: vpermd %zmm1, %zmm3, %zmm3
+; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm5 = [8,5,2,11]
-; AVX512-NEXT: vpermd %zmm2, %zmm5, %zmm2
-; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vmovq %xmm3, (%rsi)
-; AVX512-NEXT: vmovq %xmm0, (%rdx)
+; AVX512-NEXT: vpermd %zmm1, %zmm5, %zmm1
+; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vmovq %xmm0, (%rsi)
+; AVX512-NEXT: vmovq %xmm2, (%rdx)
; AVX512-NEXT: vmovq %xmm4, (%rcx)
-; AVX512-NEXT: vmovq %xmm1, (%r8)
+; AVX512-NEXT: vmovq %xmm3, (%r8)
; AVX512-NEXT: vmovq %xmm5, (%r9)
-; AVX512-NEXT: vmovq %xmm2, (%rax)
+; AVX512-NEXT: vmovq %xmm1, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: load_i16_stride6_vf4:
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpsrld $16, %xmm1, %xmm1
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15]
-; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7]
-; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm2
-; AVX512-FCP-NEXT: vpermd %zmm2, %zmm1, %zmm1
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,9,6,3]
+; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
+; AVX512-FCP-NEXT: vpermd %zmm1, %zmm0, %zmm0
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpsrld $16, 16(%rdi), %xmm2
+; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm3
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15]
+; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],mem[3],xmm2[4,5,6,7]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [4,1,10,7]
+; AVX512-FCP-NEXT: vpermd %zmm1, %zmm3, %zmm3
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [8,5,2,11]
-; AVX512-FCP-NEXT: vpermd %zmm2, %zmm5, %zmm2
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi)
-; AVX512-FCP-NEXT: vmovq %xmm0, (%rdx)
+; AVX512-FCP-NEXT: vpermd %zmm1, %zmm5, %zmm1
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vmovq %xmm0, (%rsi)
+; AVX512-FCP-NEXT: vmovq %xmm2, (%rdx)
; AVX512-FCP-NEXT: vmovq %xmm4, (%rcx)
-; AVX512-FCP-NEXT: vmovq %xmm1, (%r8)
+; AVX512-FCP-NEXT: vmovq %xmm3, (%r8)
; AVX512-FCP-NEXT: vmovq %xmm5, (%r9)
-; AVX512-FCP-NEXT: vmovq %xmm2, (%rax)
+; AVX512-FCP-NEXT: vmovq %xmm1, (%rax)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: load_i16_stride6_vf4:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpsrld $16, %xmm1, %xmm1
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7]
-; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7]
-; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm2
-; AVX512DQ-NEXT: vpermd %zmm2, %zmm1, %zmm1
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,9,6,3]
+; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm1
+; AVX512DQ-NEXT: vpermd %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpsrld $16, 16(%rdi), %xmm2
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = mem[0,1,0,3]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,7]
+; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],mem[3],xmm2[4,5,6,7]
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm3 = [4,1,10,7]
+; AVX512DQ-NEXT: vpermd %zmm1, %zmm3, %zmm3
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm5 = [8,5,2,11]
-; AVX512DQ-NEXT: vpermd %zmm2, %zmm5, %zmm2
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vmovq %xmm3, (%rsi)
-; AVX512DQ-NEXT: vmovq %xmm0, (%rdx)
+; AVX512DQ-NEXT: vpermd %zmm1, %zmm5, %zmm1
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vmovq %xmm0, (%rsi)
+; AVX512DQ-NEXT: vmovq %xmm2, (%rdx)
; AVX512DQ-NEXT: vmovq %xmm4, (%rcx)
-; AVX512DQ-NEXT: vmovq %xmm1, (%r8)
+; AVX512DQ-NEXT: vmovq %xmm3, (%r8)
; AVX512DQ-NEXT: vmovq %xmm5, (%r9)
-; AVX512DQ-NEXT: vmovq %xmm2, (%rax)
+; AVX512DQ-NEXT: vmovq %xmm1, (%rax)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: load_i16_stride6_vf4:
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpsrld $16, %xmm1, %xmm1
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,1,10,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm2
-; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm1, %zmm1
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,9,6,3]
+; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
+; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,8,9,4,5,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpsrld $16, 16(%rdi), %xmm2
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm3
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,2,3,14,15,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],mem[3],xmm2[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [4,1,10,7]
+; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm3, %zmm3
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[4,5,0,1,12,13,8,9,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[6,7,2,3,14,15,10,11,u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [8,5,2,11]
-; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm5, %zmm2
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rdx)
+; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm5, %zmm1
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[8,9,4,5,0,1,12,13,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,6,7,2,3,14,15,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rsi)
+; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rdx)
; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%rcx)
-; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%r8)
+; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%r8)
; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%r9)
-; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rax)
+; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%rax)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
@@ -1028,16 +1021,16 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm6
; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,1]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm9 = xmm6[0,0,2,3,4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,3,3]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,6,6,6]
; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2],xmm8[3],xmm9[4,5,6,7]
; AVX2-NEXT: vpblendd {{.*#+}} xmm9 = xmm1[0,1],xmm0[2],xmm1[3]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,u,0,1,12,13,8,9]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[0,1,2,3,4,5,6,7,8,9,0,1,12,13,8,9]
; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm10[5,6,7]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,3,4,5,6,7]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7]
; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3],xmm6[4,5,6,7]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u,u,2,3,14,15,10,11]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[0,1,2,3,4,5,6,7,14,15,2,3,14,15,10,11]
; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
@@ -1045,13 +1038,13 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = xmm3[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3],xmm7[4],xmm9[5,6,7]
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,4,5,4,5,6,7,0,1,4,5,0,1,12,13]
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2,3,4],xmm1[5,6,7]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10,11,u,u,2,3,14,15,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15]
; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,6,7,2,3,14,15]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,2,3,4,5,6,7,6,7,6,7,2,3,14,15]
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5,6,7]
; AVX2-NEXT: vmovdqa %xmm2, (%rsi)
; AVX2-NEXT: vmovdqa %xmm5, (%rdx)
@@ -1092,15 +1085,15 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[2,1,2,0,4,5,6,7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm6
; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,1]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13]
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2],xmm8[3],xmm9[4,5,6,7]
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm9 = xmm4[0,1],xmm3[2],xmm4[3]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,u,0,1,12,13,8,9]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[0,1,2,3,4,5,6,7,8,9,0,1,12,13,8,9]
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm10[5,6,7]
; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15]
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3],xmm6[4,5,6,7]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u,u,2,3,14,15,10,11]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[0,1,2,3,4,5,6,7,14,15,2,3,14,15,10,11]
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm2
@@ -1108,13 +1101,13 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3],xmm7[4],xmm9[5,6,7]
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[0,1,4,5,4,5,6,7,0,1,4,5,0,1,12,13]
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2,3,4],xmm4[5,6,7]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
; AVX2-FP-NEXT: vpshufb %xmm7, %xmm2, %xmm2
; AVX2-FP-NEXT: vpshufb %xmm7, %xmm0, %xmm0
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,u,u,u,u,u,6,7,2,3,14,15]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[6,7,2,3,4,5,6,7,6,7,6,7,2,3,14,15]
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7]
; AVX2-FP-NEXT: vmovdqa %xmm1, (%rsi)
; AVX2-FP-NEXT: vmovdqa %xmm5, (%rdx)
@@ -1129,20 +1122,20 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm2
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7]
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15]
-; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm3
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm4
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,1,0,3]
-; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm6, %xmm1
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3],xmm1[4,5],xmm3[6,7]
+; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15]
+; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm3
+; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [6,5,4,7]
+; AVX2-FCP-NEXT: vpermd %ymm5, %ymm4, %ymm6
+; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm6, %xmm2
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3],xmm2[4,5],xmm3[6,7]
; AVX2-FCP-NEXT: vmovdqa 80(%rdi), %xmm3
; AVX2-FCP-NEXT: vpslld $16, %xmm3, %xmm7
; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm4
; AVX2-FCP-NEXT: vpsrldq {{.*#+}} xmm8 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm7[3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],xmm7[3]
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [2,3,14,15,2,3,6,7,10,11,14,15,12,13,14,15]
; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm5, %xmm5
; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm6
@@ -1150,36 +1143,36 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FCP-NEXT: vpbroadcastw 74(%rdi), %xmm6
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[2,1,2,3]
-; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[2,1,2,0,4,5,6,7]
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,1]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2],xmm8[3],xmm9[4,5,6,7]
+; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,7,6,5]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
+; AVX2-FCP-NEXT: vpermd %ymm7, %ymm6, %ymm6
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm6[u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13]
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3]
+; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm9 = xmm7[2,1,2,0,4,5,6,7]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1,2],xmm9[3],xmm8[4,5,6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm4[0,1],xmm3[2],xmm4[3]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,u,0,1,12,13,8,9]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[0,1,2,3,4,5,6,7,8,9,0,1,12,13,8,9]
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm10[5,6,7]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u]
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3],xmm6[4,5,6,7]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u,u,2,3,14,15,10,11]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[0,1,2,3,4,5,6,7,14,15,2,3,14,15,10,11]
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7]
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[2,2,2,2,4,5,6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm1[2,2,2,2,4,5,6,7]
; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3],xmm7[4],xmm9[5,6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[0,1,4,5,4,5,6,7,0,1,4,5,0,1,12,13]
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2,3,4],xmm4[5,6,7]
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
-; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm2
+; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm1
; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm0
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,u,u,u,u,u,6,7,2,3,14,15]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7]
-; AVX2-FCP-NEXT: vmovdqa %xmm1, (%rsi)
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[6,7,2,3,4,5,6,7,6,7,6,7,2,3,14,15]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
+; AVX2-FCP-NEXT: vmovdqa %xmm2, (%rsi)
; AVX2-FCP-NEXT: vmovdqa %xmm5, (%rdx)
; AVX2-FCP-NEXT: vmovdqa %xmm8, (%rcx)
; AVX2-FCP-NEXT: vmovdqa %xmm6, (%r8)
@@ -1197,25 +1190,22 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; AVX512-NEXT: vmovdqa (%rdi), %ymm3
-; AVX512-NEXT: vmovdqa 32(%rdi), %ymm4
-; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[0,1,12,13,u,u,4,5,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm7
+; AVX512-NEXT: vmovdqa 32(%rdi), %ymm5
+; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6],ymm5[7]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm4[0,1,12,13,u,u,4,5,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm7
; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[0,2,0,3]
; AVX512-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7]
; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2],xmm6[3],xmm8[4,5],xmm6[6,7]
; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,14,15,10,11,6,7,2,3,14,15,12,13,14,15]
+; AVX512-NEXT: vpshufb %xmm6, %xmm7, %xmm7
+; AVX512-NEXT: vpshufb %xmm6, %xmm4, %xmm4
+; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3],xmm7[4,5],xmm4[6,7]
; AVX512-NEXT: vpbroadcastw 74(%rdi), %xmm6
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,0,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,1,1,1,4,5,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,3]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2],xmm5[3,4],xmm7[5],xmm5[6],xmm7[7]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,0,4,5,6,7]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm6[3]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7]
; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[2,1,2,3]
; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[2,1,2,0,4,5,6,7]
; AVX512-NEXT: vextracti128 $1, %ymm6, %xmm6
@@ -1232,22 +1222,22 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3],xmm6[4,5,6,7]
; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u,u,2,3,14,15,10,11]
; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
-; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm4[2,2,2,2,4,5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7]
+; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm5
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[2,2,2,2,4,5,6,7]
; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = xmm3[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3],xmm7[4],xmm9[5,6,7]
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13]
; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2,3,4],xmm1[5,6,7]
; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10,11,u,u,2,3,14,15,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,3]
+; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1],xmm3[2,3],xmm5[4],xmm3[5,6,7]
; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,6,7,2,3,14,15]
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5,6,7]
; AVX512-NEXT: vmovdqa %xmm2, (%rsi)
-; AVX512-NEXT: vmovdqa %xmm5, (%rdx)
+; AVX512-NEXT: vmovdqa %xmm4, (%rdx)
; AVX512-NEXT: vmovdqa %xmm8, (%rcx)
; AVX512-NEXT: vmovdqa %xmm6, (%r8)
; AVX512-NEXT: vmovdqa %xmm1, (%r9)
@@ -1258,63 +1248,64 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FCP-LABEL: load_i16_stride6_vf8:
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,12,13,0,1,4,5,8,9,12,13,u,u,u,u]
-; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm2
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7]
-; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm3
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm4
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,1,0,3]
-; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm6, %xmm0
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3],xmm0[4,5],xmm3[6,7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15]
+; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
+; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm3
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [6,5,4,7]
+; AVX512-FCP-NEXT: vpermd %ymm5, %ymm4, %ymm6
+; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm6, %xmm2
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3],xmm2[4,5],xmm3[6,7]
; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm3
; AVX512-FCP-NEXT: vpslld $16, %xmm3, %xmm7
; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm4
; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm8 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm7[3]
-; AVX512-FCP-NEXT: vpbroadcastw 74(%rdi), %xmm7
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,2,3,u,u,14,15,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,1,1,1,4,5,6,7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2],xmm5[3,4],xmm6[5],xmm5[6],xmm6[7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,6,7,4,5,0,1,10,11,14,15,u,u,u,u]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[2,1,2,3]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[2,1,2,0,4,5,6,7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,1]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2],xmm8[3],xmm9[4,5,6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],xmm7[3]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [2,3,14,15,2,3,6,7,10,11,14,15,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm5, %xmm5
+; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm6
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2],xmm5[3],xmm6[4,5],xmm5[6,7]
+; AVX512-FCP-NEXT: vpbroadcastw 74(%rdi), %xmm6
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [10,1,10,3]
+; AVX512-FCP-NEXT: vpermd (%rdi), %zmm6, %zmm6
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[2,1,2,0,4,5,6,7]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [4,7,6,5]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
+; AVX512-FCP-NEXT: vpermd %ymm9, %ymm8, %ymm8
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm9[1,2],xmm7[3],xmm9[4,5,6,7]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm4[0,1],xmm3[2],xmm4[3]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,u,0,1,12,13,8,9]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm10[5,6,7]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3],xmm6[4,5,6,7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u,u,2,3,14,15,10,11]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[2,2,2,2,4,5,6,7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3],xmm7[4],xmm9[5,6,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm10[5,6,7]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,1,4,5,6,7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1,2],xmm6[3],xmm8[4,5,6,7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm9[u,u,u,u,u,u,u,u,u,u,2,3,14,15,10,11]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm8[5,6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm1[2,2,2,2,4,5,6,7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3],xmm8[4],xmm9[5,6,7]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2,3,4],xmm4[5,6,7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm2
-; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm1
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,u,u,u,u,u,6,7,2,3,14,15]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
-; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2,3,4],xmm4[5,6,7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm1
+; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm0
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[u,u,u,u,u,u,u,u,u,u,6,7,2,3,14,15]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
+; AVX512-FCP-NEXT: vmovdqa %xmm2, (%rsi)
; AVX512-FCP-NEXT: vmovdqa %xmm5, (%rdx)
-; AVX512-FCP-NEXT: vmovdqa %xmm8, (%rcx)
+; AVX512-FCP-NEXT: vmovdqa %xmm7, (%rcx)
; AVX512-FCP-NEXT: vmovdqa %xmm6, (%r8)
; AVX512-FCP-NEXT: vmovdqa %xmm4, (%r9)
-; AVX512-FCP-NEXT: vmovdqa %xmm1, (%rax)
+; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rax)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
@@ -1327,25 +1318,22 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm3
-; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm4
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[0,1,12,13,u,u,4,5,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm7
+; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm5
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6],ymm5[7]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm4[0,1,12,13,u,u,4,5,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm7
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[0,2,0,3]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7]
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2],xmm6[3],xmm8[4,5],xmm6[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,14,15,10,11,6,7,2,3,14,15,12,13,14,15]
+; AVX512DQ-NEXT: vpshufb %xmm6, %xmm7, %xmm7
+; AVX512DQ-NEXT: vpshufb %xmm6, %xmm4, %xmm4
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3],xmm7[4,5],xmm4[6,7]
; AVX512DQ-NEXT: vpbroadcastw 74(%rdi), %xmm6
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,0,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,1,1,1,4,5,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,3]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2],xmm5[3,4],xmm7[5],xmm5[6],xmm7[7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,0,4,5,6,7]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm6[3]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7]
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[2,1,2,3]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[2,1,2,0,4,5,6,7]
; AVX512DQ-NEXT: vextracti128 $1, %ymm6, %xmm6
@@ -1362,22 +1350,22 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3],xmm6[4,5,6,7]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u,u,2,3,14,15,10,11]
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm4[2,2,2,2,4,5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm5
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[2,2,2,2,4,5,6,7]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = xmm3[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3],xmm7[4],xmm9[5,6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13]
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2,3,4],xmm1[5,6,7]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10,11,u,u,2,3,14,15,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,3]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1],xmm3[2,3],xmm5[4],xmm3[5,6,7]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,6,7,2,3,14,15]
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5,6,7]
; AVX512DQ-NEXT: vmovdqa %xmm2, (%rsi)
-; AVX512DQ-NEXT: vmovdqa %xmm5, (%rdx)
+; AVX512DQ-NEXT: vmovdqa %xmm4, (%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm8, (%rcx)
; AVX512DQ-NEXT: vmovdqa %xmm6, (%r8)
; AVX512DQ-NEXT: vmovdqa %xmm1, (%r9)
@@ -1388,63 +1376,64 @@ define void @load_i16_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-FCP-LABEL: load_i16_stride6_vf8:
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,12,13,0,1,4,5,8,9,12,13,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm2
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm3
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm4
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,1,0,3]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm6, %xmm0
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3],xmm0[4,5],xmm3[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm3
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [6,5,4,7]
+; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm4, %ymm6
+; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm6, %xmm2
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3],xmm2[4,5],xmm3[6,7]
; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm3
; AVX512DQ-FCP-NEXT: vpslld $16, %xmm3, %xmm7
; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm4
; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm8 = xmm4[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm7[3]
-; AVX512DQ-FCP-NEXT: vpbroadcastw 74(%rdi), %xmm7
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,2,3,u,u,14,15,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,1,1,1,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2],xmm5[3,4],xmm6[5],xmm5[6],xmm6[7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,6,7,4,5,0,1,10,11,14,15,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[2,1,2,3]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[2,1,2,0,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,1]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2],xmm8[3],xmm9[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],xmm7[3]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [2,3,14,15,2,3,6,7,10,11,14,15,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm5, %xmm5
+; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm6
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2],xmm5[3],xmm6[4,5],xmm5[6,7]
+; AVX512DQ-FCP-NEXT: vpbroadcastw 74(%rdi), %xmm6
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [10,1,10,3]
+; AVX512DQ-FCP-NEXT: vpermd (%rdi), %zmm6, %zmm6
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[2,1,2,0,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [4,7,6,5]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
+; AVX512DQ-FCP-NEXT: vpermd %ymm9, %ymm8, %ymm8
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm9[1,2],xmm7[3],xmm9[4,5,6,7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm4[0,1],xmm3[2],xmm4[3]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,u,0,1,12,13,8,9]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm10[5,6,7]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3],xmm6[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u,u,2,3,14,15,10,11]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[2,2,2,2,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3],xmm7[4],xmm9[5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm10[5,6,7]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,1,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1,2],xmm6[3],xmm8[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm9[u,u,u,u,u,u,u,u,u,u,2,3,14,15,10,11]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm8[5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm1[2,2,2,2,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[8,9,u,u,0,1,12,13,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3],xmm8[4],xmm9[5,6,7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,u,u,u,4,5,0,1,12,13]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2,3,4],xmm4[5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm1
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,u,u,u,u,u,6,7,2,3,14,15]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2,3,4],xmm4[5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[u,u,u,u,u,u,u,u,u,u,6,7,2,3,14,15]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, (%rsi)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm8, (%rcx)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm7, (%rcx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, (%r8)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, (%r9)
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, (%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rax)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
@@ -2055,31 +2044,32 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
;
; AVX2-LABEL: load_i16_stride6_vf16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa (%rdi), %ymm4
-; AVX2-NEXT: vmovdqa 32(%rdi), %ymm5
-; AVX2-NEXT: vmovdqa 64(%rdi), %ymm0
-; AVX2-NEXT: vmovdqa 96(%rdi), %ymm3
-; AVX2-NEXT: vmovdqa 160(%rdi), %ymm1
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: vmovdqa (%rdi), %ymm3
+; AVX2-NEXT: vmovdqa 32(%rdi), %ymm4
+; AVX2-NEXT: vmovdqa 64(%rdi), %ymm1
+; AVX2-NEXT: vmovdqa 96(%rdi), %ymm5
+; AVX2-NEXT: vmovdqa 160(%rdi), %ymm0
; AVX2-NEXT: vmovdqa 128(%rdi), %ymm2
-; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm8[2,2,2,2,4,5,6,7]
; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm9
; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3,4],xmm9[5,6,7]
; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,6,7,0,1,12,13,8,9,4,5]
; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm10
-; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7]
; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm11[0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15]
; AVX2-NEXT: vextracti128 $1, %ymm11, %xmm12
; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm12[0,2,0,3]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,6,7]
; AVX2-NEXT: vpblendw {{.*#+}} xmm13 = xmm6[0,1],xmm7[2],xmm6[3],xmm7[4,5],xmm6[6,7]
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm3[2,3]
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm0[0,1],ymm3[0,1]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3,4,5,6,7]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7]
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm1[2,3],ymm5[2,3]
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm1[0,1],ymm5[0,1]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2],ymm1[3,4,5,6,7]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0,1,2],ymm10[3,4,5,6,7],ymm1[8,9,10],ymm10[11,12,13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7]
; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm12[2,1,0,3]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5]
; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
@@ -2091,11 +2081,11 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2],xmm10[3,4],xmm9[5],xmm10[6],xmm9[7]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,0,4,5,6,7]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,7,6,7]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1,2],ymm8[3,4,5,6,7],ymm3[8,9,10],ymm8[11,12,13,14,15]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2],ymm5[3,4,5,6,7]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0,1,2],ymm8[3,4,5,6,7],ymm5[8,9,10],ymm8[11,12,13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7]
; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm9
; AVX2-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,6,5,6,4]
@@ -2105,303 +2095,296 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm10[4],xmm8[5,6],xmm10[7]
; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm12 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,16,17,28,29,u,u,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm12[0,1,2],ymm8[3,4,5,6,7],ymm12[8,9,10],ymm8[11,12,13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
; AVX2-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[2,1,2,3]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[2,1,2,0,4,5,6,7]
; AVX2-NEXT: vextracti128 $1, %ymm12, %xmm12
; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,3,2,1]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm15 = xmm12[0,0,2,3,4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,3,3]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,6,6,6,6]
; AVX2-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2],xmm14[3],xmm15[4,5,6,7]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm15 = ymm10[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm15[0,1,2],ymm8[3,4,5,6,7],ymm15[8,9,10],ymm8[11,12,13,14,15]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,6,5,4]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm15 = xmm10[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9]
; AVX2-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4],xmm15[5,6,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,5]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,1,1,1,4,5,6,7]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,7,7]
; AVX2-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4],xmm11[5,6],xmm9[7]
+; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX2-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,18,19,30,31,u,u,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0,1,2],ymm9[3,4,5,6,7],ymm11[8,9,10],ymm9[11,12,13,14,15]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm11 = xmm13[3,1,2,1,4,5,6,7]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,3,3,4,5,6,7]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,7,7,7]
; AVX2-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1,2],xmm11[3],xmm12[4,5,6,7]
-; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX2-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7],ymm10[8,9,10],ymm9[11,12,13,14,15]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,3,2]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11]
; AVX2-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3,4],xmm10[5,6,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
-; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm10 = xmm5[2,2,2,2,4,5,6,7]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm10 = xmm4[2,2,2,2,4,5,6,7]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
; AVX2-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7]
; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vpmovsxbw {{.*#+}} xmm11 = [65535,65535,65535,65535,65535,0,0,0]
; AVX2-NEXT: vpblendvb %ymm11, %ymm10, %ymm7, %ymm7
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[1,1,2,3]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm12 = xmm0[0,1,0,2,4,5,6,7]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,6,6,6,6]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2,3],xmm10[4],xmm12[5],xmm10[6,7]
+; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm10[5,6,7]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,3]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3],xmm4[4],xmm2[5,6,7]
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4
-; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = ymm6[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpblendvb %ymm11, %ymm2, %ymm5, %ymm2
-; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u,u,u,u,8,9,u,u,0,1,12,13]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm1[0,1,0,2,4,5,6,7]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,6,6,6]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4],xmm6[5],xmm5[6,7]
-; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6,7]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5],xmm4[6,7]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-NEXT: vmovdqa %ymm0, (%rsi)
-; AVX2-NEXT: vmovdqa %ymm3, (%rdx)
+; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpblendvb %ymm11, %ymm3, %ymm4, %ymm3
+; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,10,11,u,u,2,3,14,15]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5],xmm2[6,7]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7]
+; AVX2-NEXT: vmovdqa %ymm1, (%rsi)
+; AVX2-NEXT: vmovdqa %ymm5, (%rdx)
; AVX2-NEXT: vmovdqa %ymm8, (%rcx)
; AVX2-NEXT: vmovdqa %ymm9, (%r8)
-; AVX2-NEXT: vmovdqa %ymm5, (%r9)
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: vmovdqa %ymm1, (%rax)
+; AVX2-NEXT: vmovdqa %ymm7, (%r9)
+; AVX2-NEXT: vmovdqa %ymm0, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX2-FP-LABEL: load_i16_stride6_vf16:
; AVX2-FP: # %bb.0:
+; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm3
-; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm5
-; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm0
-; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm4
-; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm1
-; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm2
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
+; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm4
+; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm2
+; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm5
+; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm0
+; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm1
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5]
; AVX2-FP-NEXT: vpshufb %xmm6, %xmm8, %xmm7
; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm9
; AVX2-FP-NEXT: vpshufb %xmm6, %xmm9, %xmm6
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3],xmm6[4,5],xmm7[6],xmm6[7]
; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm10
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6],ymm5[7]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15]
; AVX2-FP-NEXT: vpshufb %xmm6, %xmm11, %xmm7
; AVX2-FP-NEXT: vextracti128 $1, %ymm11, %xmm12
; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,0,3]
; AVX2-FP-NEXT: vpshufb %xmm6, %xmm12, %xmm6
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm13 = xmm7[0,1],xmm6[2],xmm7[3],xmm6[4,5],xmm7[6,7]
-; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm4[2,3]
-; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm0[0,1],ymm4[0,1]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3,4,5,6,7]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7]
-; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm2[2,3],ymm5[2,3]
+; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm2[0,1],ymm5[0,1]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm10 = ymm2[0,1,2],ymm10[3,4,5,6,7],ymm2[8,9,10],ymm10[11,12,13,14,15]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7]
; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5]
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4,5],xmm8[6],xmm9[7]
; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19]
-; AVX2-FP-NEXT: vpshufb %xmm9, %xmm11, %xmm10
-; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm12[1,1,1,1,4,5,6,7]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm11[2],xmm10[3,4],xmm11[5],xmm10[6],xmm11[7]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[2,3,6,7,4,5,0,1,10,11,14,15,u,u,u,u]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2],ymm4[3,4,5,6,7]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm4[0,1,2],ymm8[3,4,5,6,7],ymm4[8,9,10],ymm8[11,12,13,14,15]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
-; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm10
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,2,1]
-; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm11 = xmm10[0,1,2,3,6,5,6,4]
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm12 = xmm8[2,1,0,3]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm12[0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm11[4],xmm8[5,6],xmm11[7]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = xmm11[6,7,2,3,u,u,14,15,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm12[1,1,1,1,4,5,6,7]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm10[2],xmm9[3,4],xmm10[5],xmm9[6],xmm10[7]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[2,3,6,7,4,5,0,1,10,11,14,15,u,u,u,u]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2],ymm5[3,4,5,6,7]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0,1,2],ymm8[3,4,5,6,7],ymm5[8,9,10],ymm8[11,12,13,14,15]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm9
+; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1]
+; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,6,5,6,4]
+; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm11 = xmm8[2,1,0,3]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm11[0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm10[4],xmm8[5,6],xmm10[7]
; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7]
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm13 = xmm11[2,1,2,3]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm12 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,16,17,28,29,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm12[0,1,2],ymm8[3,4,5,6,7],ymm12[8,9,10],ymm8[11,12,13,14,15]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
+; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[2,1,2,3]
; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[2,1,2,0,4,5,6,7]
-; AVX2-FP-NEXT: vextracti128 $1, %ymm11, %xmm11
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,3,2,1]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm12, %xmm12
+; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,3,2,1]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13]
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2],xmm14[3],xmm15[4,5,6,7]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm15 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm15[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0,1,2],ymm8[3,4,5,6,7],ymm0[8,9,10],ymm8[11,12,13,14,15]
-; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1,2,3,4],xmm0[5,6,7]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm8[4,5,6,7]
-; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,7,5,6,5]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0,1,2,3],xmm0[4],xmm10[5,6],xmm0[7]
-; AVX2-FP-NEXT: vpshufb %ymm9, %ymm15, %ymm9
-; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm13[3,1,2,1,4,5,6,7]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1,2],xmm10[3],xmm11[4,5,6,7]
-; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7],ymm9[8,9,10],ymm0[11,12,13,14,15]
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,3,2]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3,4],xmm9[5,6,7]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7]
-; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[2,2,2,2,4,5,6,7]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0],xmm5[1],xmm10[2,3],xmm5[4],xmm10[5,6,7]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm15 = xmm10[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4],xmm15[5,6,7]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,5]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4],xmm11[5,6],xmm9[7]
+; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,18,19,30,31,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0,1,2],ymm9[3,4,5,6,7],ymm11[8,9,10],ymm9[11,12,13,14,15]
+; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm13[3,1,2,1,4,5,6,7]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1,2],xmm11[3],xmm12[4,5,6,7]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3,4],xmm10[5,6,7]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm4[2,2,2,2,4,5,6,7]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7]
; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,0,0,0]
-; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm5, %ymm7, %ymm5
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
-; AVX2-FP-NEXT: vpshufb %xmm2, %xmm3, %xmm3
-; AVX2-FP-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm3 = ymm6[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm0, %ymm3, %ymm0
-; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm3
-; AVX2-FP-NEXT: vpshufb %xmm2, %xmm3, %xmm6
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
-; AVX2-FP-NEXT: vpshufb %xmm2, %xmm1, %xmm2
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4],xmm2[5],xmm6[6,7]
-; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15]
-; AVX2-FP-NEXT: vpshufb %xmm5, %xmm3, %xmm3
-; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm1
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6,7]
+; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} xmm11 = [65535,65535,65535,65535,65535,0,0,0]
+; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm10, %ymm7, %ymm7
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm10
+; AVX2-FP-NEXT: vpshufb %xmm1, %xmm10, %xmm12
+; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
+; AVX2-FP-NEXT: vpshufb %xmm1, %xmm0, %xmm1
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm12[4],xmm1[5],xmm12[6,7]
; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT: vmovaps %ymm1, (%rsi)
-; AVX2-FP-NEXT: vmovdqa %ymm4, (%rdx)
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4],ymm1[5,6,7]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
+; AVX2-FP-NEXT: vpshufb %xmm7, %xmm4, %xmm4
+; AVX2-FP-NEXT: vpshufb %xmm7, %xmm3, %xmm3
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm3, %ymm4, %ymm3
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15]
+; AVX2-FP-NEXT: vpshufb %xmm4, %xmm10, %xmm6
+; AVX2-FP-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4],xmm0[5],xmm6[6,7]
+; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7]
+; AVX2-FP-NEXT: vmovdqa %ymm2, (%rsi)
+; AVX2-FP-NEXT: vmovdqa %ymm5, (%rdx)
; AVX2-FP-NEXT: vmovdqa %ymm8, (%rcx)
; AVX2-FP-NEXT: vmovdqa %ymm9, (%r8)
-; AVX2-FP-NEXT: vmovdqa %ymm2, (%r9)
-; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX2-FP-NEXT: vmovdqa %ymm1, (%r9)
; AVX2-FP-NEXT: vmovdqa %ymm0, (%rax)
; AVX2-FP-NEXT: vzeroupper
; AVX2-FP-NEXT: retq
;
; AVX2-FCP-LABEL: load_i16_stride6_vf16:
; AVX2-FCP: # %bb.0:
+; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm3
-; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm5
-; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm0
-; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm4
-; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm1
-; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm2
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
+; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm4
+; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm2
+; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm5
+; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm0
+; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm1
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5]
; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm8, %xmm7
; AVX2-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9
; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm9, %xmm6
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3],xmm6[4,5],xmm7[6],xmm6[7]
; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm10
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6],ymm5[7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7]
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15]
; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm11, %xmm7
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,0,3]
+; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm12 = [6,5,4,7]
+; AVX2-FCP-NEXT: vpermd %ymm11, %ymm12, %ymm12
; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm12, %xmm6
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm7[0,1],xmm6[2],xmm7[3],xmm6[4,5],xmm7[6,7]
-; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm4[2,3]
-; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm0[0,1],ymm4[0,1]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3,4,5,6,7]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm2[2,3],ymm5[2,3]
+; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm2[0,1],ymm5[0,1]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0],ymm6[1],ymm7[2,3,4,5],ymm6[6],ymm7[7]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm2[0,1,2],ymm10[3,4,5,6,7],ymm2[8,9,10],ymm10[11,12,13,14,15]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7]
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5]
; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4,5],xmm8[6],xmm9[7]
; AVX2-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19]
-; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm11, %xmm10
-; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm12[1,1,1,1,4,5,6,7]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm11[2],xmm10[3,4],xmm11[5],xmm10[6],xmm11[7]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[2,3,6,7,4,5,0,1,10,11,14,15,u,u,u,u]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2],ymm4[3,4,5,6,7]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm4[0,1,2],ymm8[3,4,5,6,7],ymm4[8,9,10],ymm8[11,12,13,14,15]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm8, %xmm10
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,2,1]
-; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm11 = xmm10[0,1,2,3,6,5,6,4]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm8[2,1,0,3]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm12[0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm11[4],xmm8[5,6],xmm11[7]
+; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm9 = xmm12[1,1,1,1,4,5,6,7]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm11[6,7,2,3,u,u,14,15,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2],xmm10[3,4],xmm9[5],xmm10[6],xmm9[7]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[2,3,6,7,4,5,0,1,10,11,14,15,u,u,u,u]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2],ymm5[3,4,5,6,7]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0,1,2],ymm8[3,4,5,6,7],ymm5[8,9,10],ymm8[11,12,13,14,15]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
+; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,5,6,5]
+; AVX2-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm9
+; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,6,5,6,4]
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm8[2,1,0,3]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm11[0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm10[4],xmm8[5,6],xmm10[7]
; AVX2-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm11[2,1,2,3]
-; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[2,1,2,0,4,5,6,7]
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm11, %xmm11
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,3,2,1]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2],xmm14[3],xmm15[4,5,6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm15[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0,1,2],ymm8[3,4,5,6,7],ymm0[8,9,10],ymm8[11,12,13,14,15]
-; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1,2,3,4],xmm0[5,6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm8[4,5,6,7]
-; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,7,5,6,5]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0,1,2,3],xmm0[4],xmm10[5,6],xmm0[7]
-; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm15, %ymm9
-; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm13[3,1,2,1,4,5,6,7]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1,2],xmm10[3],xmm11[4,5,6,7]
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7],ymm9[8,9,10],ymm0[11,12,13,14,15]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,3,2]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3,4],xmm9[5,6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7]
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[2,2,2,2,4,5,6,7]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0],xmm5[1],xmm10[2,3],xmm5[4],xmm10[5,6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,16,17,28,29,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm12[0,1,2],ymm8[3,4,5,6,7],ymm12[8,9,10],ymm8[11,12,13,14,15]
+; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm12 = [4,7,6,5]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7]
+; AVX2-FCP-NEXT: vpermd %ymm13, %ymm12, %ymm12
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13]
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3]
+; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm15 = xmm13[2,1,2,0,4,5,6,7]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1,2],xmm15[3],xmm14[4,5,6,7]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm10[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4],xmm15[5,6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,5]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4],xmm11[5,6],xmm9[7]
+; AVX2-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,18,19,30,31,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0,1,2],ymm9[3,4,5,6,7],ymm11[8,9,10],ymm9[11,12,13,14,15]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15]
+; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm13[3,1,2,1,4,5,6,7]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1,2],xmm12[3],xmm11[4,5,6,7]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3,4],xmm10[5,6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm4[2,2,2,2,4,5,6,7]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7]
; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,0,0,0]
-; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm5, %ymm7, %ymm5
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7]
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
-; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm3
-; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7]
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm6[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm0, %ymm3, %ymm0
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3
-; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm6
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
-; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm2
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4],xmm2[5],xmm6[6,7]
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15]
-; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm3
-; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm1
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6,7]
+; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} xmm11 = [65535,65535,65535,65535,65535,0,0,0]
+; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm10, %ymm7, %ymm7
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13]
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm10
+; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm12
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
+; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm1
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm12[4],xmm1[5],xmm12[6,7]
; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT: vmovaps %ymm1, (%rsi)
-; AVX2-FCP-NEXT: vmovdqa %ymm4, (%rdx)
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4],ymm1[5,6,7]
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
+; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm4, %xmm4
+; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm3
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm3, %ymm4, %ymm3
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15]
+; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm10, %xmm6
+; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4],xmm0[5],xmm6[6,7]
+; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7]
+; AVX2-FCP-NEXT: vmovdqa %ymm2, (%rsi)
+; AVX2-FCP-NEXT: vmovdqa %ymm5, (%rdx)
; AVX2-FCP-NEXT: vmovdqa %ymm8, (%rcx)
; AVX2-FCP-NEXT: vmovdqa %ymm9, (%r8)
-; AVX2-FCP-NEXT: vmovdqa %ymm2, (%r9)
-; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX2-FCP-NEXT: vmovdqa %ymm1, (%r9)
; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rax)
; AVX2-FCP-NEXT: vzeroupper
; AVX2-FCP-NEXT: retq
@@ -2528,8 +2511,8 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm0
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7]
; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm8, %xmm5
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm6
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[2,1,0,3]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [6,5,4,7]
+; AVX512-FCP-NEXT: vpermd %ymm8, %ymm6, %ymm9
; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm9, %xmm1
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1],xmm1[2],xmm5[3],xmm1[4,5],xmm5[6,7]
; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm4[2,3],mem[2,3]
@@ -2539,6 +2522,7 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3,4,5,6,7]
; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm5
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1],ymm5[2],ymm0[3,4],ymm5[5],ymm0[6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm18
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5]
; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm10, %xmm12
; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm13
@@ -2546,7 +2530,8 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm12[3],xmm11[4,5],xmm12[6],xmm11[7]
; AVX512-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm1[0,1,2],ymm11[3,4,5,6,7],ymm1[8,9,10],ymm11[11,12,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm11[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm17
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [2,3,14,15,2,3,6,7,10,11,14,15,12,13,14,15]
; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm8
; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm9, %xmm9
@@ -2559,68 +2544,74 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm4[0,1,2],ymm8[3,4,5,6,7],ymm4[8,9,10],ymm8[11,12,13,14,15]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1],ymm0[2],ymm5[3,4],ymm0[5],ymm5[6,7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,6,5,6,4]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm8[2,1,0,3]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm11[0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm10[4],xmm8[5,6],xmm10[7]
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[2,1,2,3]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[2,1,2,0,4,5,6,7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm12
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,3,2,1]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2],xmm14[3],xmm15[4,5,6,7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm10[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm15[0,1,2],ymm8[3,4,5,6,7],ymm15[8,9,10],ymm8[11,12,13,14,15]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4],xmm15[5,6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,5]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4],xmm11[5,6],xmm9[7]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm13[3,1,2,1,4,5,6,7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1,2],xmm11[3],xmm12[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7],ymm10[8,9,10],ymm9[11,12,13,14,15]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3,4],xmm10[5,6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [10,1,10,3]
+; AVX512-FCP-NEXT: vpermd (%rdi), %zmm8, %zmm9
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm9[2,1,2,0,4,5,6,7]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [4,7,6,5]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
+; AVX512-FCP-NEXT: vpermd %ymm11, %ymm10, %ymm10
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm11[1,2],xmm8[3],xmm11[4,5,6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm11[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm8[0,1,2,3,4],xmm12[5,6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm12[4,5,6,7]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm12 = [12,5,14,5]
+; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm14
+; AVX512-FCP-NEXT: vpermd %zmm14, %zmm12, %zmm12
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm15 = xmm12[0,1,2,3,6,5,6,4]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm16 = [2,9,8,11]
+; AVX512-FCP-NEXT: vpermd %zmm14, %zmm16, %zmm0
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm15[4],xmm1[5,6],xmm15[7]
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0,1,2],ymm1[3,4,5,6,7],ymm8[8,9,10],ymm1[11,12,13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[3,1,2,1,4,5,6,7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm9[1,2],xmm1[3],xmm9[4,5,6,7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm11[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm1[0,1,2,3,4],xmm9[5,6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm9 = xmm12[0,1,2,3,7,5,6,5]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm9[4],xmm0[5,6],xmm9[7]
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[2,2,2,2,4,5,6,7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm2[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm11) | ymm7
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3],ymm5[4],ymm0[5,6],ymm5[7]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm3[2,2,2,2,4,5,6,7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3],xmm6[4],xmm7[5,6,7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = (ymm6 & ymm7) | ymm1
+; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm1
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3],ymm5[4],ymm1[5,6],ymm5[7]
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm7
-; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm7, %xmm12
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
-; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm5
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm12[4],xmm5[5],xmm12[6,7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm10
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm11 = [0,3,2,9]
+; AVX512-FCP-NEXT: vpermd %zmm14, %zmm11, %zmm11
+; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm11, %xmm5
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm10[4],xmm5[5],xmm10[6,7]
; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3,4],ymm5[5,6,7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm3
-; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm2
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3
+; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm2
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm2 & ymm11)
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm2 & ymm7)
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15]
-; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm7, %xmm6
-; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4],xmm0[5],xmm6[6,7]
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7]
-; AVX512-FCP-NEXT: vmovdqa %ymm1, (%rsi)
+; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm11, %xmm2
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4],xmm2[5],xmm1[6,7]
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm17, (%rsi)
; AVX512-FCP-NEXT: vmovdqa %ymm4, (%rdx)
; AVX512-FCP-NEXT: vmovdqa %ymm8, (%rcx)
; AVX512-FCP-NEXT: vmovdqa %ymm9, (%r8)
@@ -2752,8 +2743,8 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm0
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7]
; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm8, %xmm5
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm6
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[2,1,0,3]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [6,5,4,7]
+; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm6, %ymm9
; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm9, %xmm1
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1],xmm1[2],xmm5[3],xmm1[4,5],xmm5[6,7]
; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm4[2,3],mem[2,3]
@@ -2763,6 +2754,7 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm5[3,4,5,6,7]
; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm5
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1],ymm5[2],ymm0[3,4],ymm5[5],ymm0[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm18
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5]
; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm10, %xmm12
; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm13
@@ -2770,7 +2762,8 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm12[3],xmm11[4,5],xmm12[6],xmm11[7]
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm1[0,1,2],ymm11[3,4,5,6,7],ymm1[8,9,10],ymm11[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm17
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [2,3,14,15,2,3,6,7,10,11,14,15,12,13,14,15]
; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm8
; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm9, %xmm9
@@ -2783,68 +2776,74 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm4[0,1,2],ymm8[3,4,5,6,7],ymm4[8,9,10],ymm8[11,12,13,14,15]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1],ymm0[2],ymm5[3,4],ymm0[5],ymm5[6,7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm10 = xmm9[0,1,2,3,6,5,6,4]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm8[2,1,0,3]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm11[0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm10[4],xmm8[5,6],xmm10[7]
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[2,1,2,3]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[2,1,2,0,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm12
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,3,2,1]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2],xmm14[3],xmm15[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm10[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm15[0,1,2],ymm8[3,4,5,6,7],ymm15[8,9,10],ymm8[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4],xmm15[5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,5]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4],xmm11[5,6],xmm9[7]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm13[3,1,2,1,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1,2],xmm11[3],xmm12[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7],ymm10[8,9,10],ymm9[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3,4],xmm10[5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [10,1,10,3]
+; AVX512DQ-FCP-NEXT: vpermd (%rdi), %zmm8, %zmm9
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm9[2,1,2,0,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [4,7,6,5]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
+; AVX512DQ-FCP-NEXT: vpermd %ymm11, %ymm10, %ymm10
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm11[1,2],xmm8[3],xmm11[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0,1],ymm6[2],ymm7[3],ymm6[4],ymm7[5,6],ymm6[7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm11[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm8[0,1,2,3,4],xmm12[5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm12 = [12,5,14,5]
+; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm14
+; AVX512DQ-FCP-NEXT: vpermd %zmm14, %zmm12, %zmm12
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm15 = xmm12[0,1,2,3,6,5,6,4]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm16 = [2,9,8,11]
+; AVX512DQ-FCP-NEXT: vpermd %zmm14, %zmm16, %zmm0
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm15[4],xmm1[5,6],xmm15[7]
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0,1,2],ymm1[3,4,5,6,7],ymm8[8,9,10],ymm1[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[3,1,2,1,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm9[1,2],xmm1[3],xmm9[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm11[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm1[0,1,2,3,4],xmm9[5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm9 = xmm12[0,1,2,3,7,5,6,5]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm9[4],xmm0[5,6],xmm9[7]
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[2,2,2,2,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm2[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm11) | ymm7
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3],ymm5[4],ymm0[5,6],ymm5[7]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm3[2,2,2,2,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3],xmm6[4],xmm7[5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = (ymm6 & ymm7) | ymm1
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm1
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3],ymm5[4],ymm1[5,6],ymm5[7]
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm7
-; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm7, %xmm12
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm5
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm12[4],xmm5[5],xmm12[6,7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm10
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm11 = [0,3,2,9]
+; AVX512DQ-FCP-NEXT: vpermd %zmm14, %zmm11, %zmm11
+; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm11, %xmm5
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm10[4],xmm5[5],xmm10[6,7]
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3,4],ymm5[5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3
+; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm2
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm2 & ymm11)
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm2 & ymm7)
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm7, %xmm6
-; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4],xmm0[5],xmm6[6,7]
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, (%rsi)
+; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm11, %xmm2
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4],xmm2[5],xmm1[6,7]
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, (%rsi)
; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, (%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, (%rcx)
; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, (%r8)
@@ -2857,36 +2856,28 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512BW-LABEL: load_i16_stride6_vf16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26]
-; AVX512BW-NEXT: # ymm0 = mem[0,1,0,1]
+; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [32,33,34,2,8,14,20,26,40,41,42,2,8,14,20,26]
+; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0]
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm3
; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm4
; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm5
-; AVX512BW-NEXT: vpermw %zmm5, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0]
; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm1
-; AVX512BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
+; AVX512BW-NEXT: vpermi2w %zmm1, %zmm5, %zmm0
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27]
-; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512BW-NEXT: vpermw %zmm5, %zmm1, %zmm1
+; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [32,33,34,3,9,15,21,27,40,41,42,3,9,15,21,27]
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0]
; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm2
-; AVX512BW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
+; AVX512BW-NEXT: vpermi2w %zmm2, %zmm5, %zmm1
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,16,16,4,10,16,22,28,16,16,16,4,10,16,22,28]
-; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1]
-; AVX512BW-NEXT: vpermw %zmm5, %zmm2, %zmm2
+; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [32,33,34,4,10,16,22,28,40,41,42,4,10,16,22,28]
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0]
; AVX512BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm6
-; AVX512BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3,4,5,6,7],ymm6[8,9,10],ymm2[11,12,13,14,15]
+; AVX512BW-NEXT: vpermi2w %zmm6, %zmm5, %zmm2
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
-; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [16,16,16,5,11,17,23,29,16,16,16,5,11,17,23,29]
-; AVX512BW-NEXT: # ymm6 = mem[0,1,0,1]
-; AVX512BW-NEXT: vpermw %zmm5, %zmm6, %zmm6
+; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [32,33,34,5,11,17,23,29,40,41,42,5,11,17,23,29]
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0]
; AVX512BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm7
-; AVX512BW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15]
+; AVX512BW-NEXT: vpermi2w %zmm7, %zmm5, %zmm6
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30]
; AVX512BW-NEXT: # ymm7 = mem[0,1,0,1]
@@ -2912,36 +2903,28 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512BW-FCP-LABEL: load_i16_stride6_vf16:
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26]
-; AVX512BW-FCP-NEXT: # ymm0 = mem[0,1,0,1]
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [32,33,34,2,8,14,20,26,40,41,42,2,8,14,20,26]
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0]
; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm3
; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm4
; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5
-; AVX512BW-FCP-NEXT: vpermw %zmm5, %zmm0, %zmm0
-; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0]
; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm1
-; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
+; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm5, %zmm0
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27]
-; AVX512BW-FCP-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512BW-FCP-NEXT: vpermw %zmm5, %zmm1, %zmm1
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [32,33,34,3,9,15,21,27,40,41,42,3,9,15,21,27]
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0]
; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm2
-; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
+; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm5, %zmm1
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,16,16,4,10,16,22,28,16,16,16,4,10,16,22,28]
-; AVX512BW-FCP-NEXT: # ymm2 = mem[0,1,0,1]
-; AVX512BW-FCP-NEXT: vpermw %zmm5, %zmm2, %zmm2
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [32,33,34,4,10,16,22,28,40,41,42,4,10,16,22,28]
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0]
; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm6
-; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3,4,5,6,7],ymm6[8,9,10],ymm2[11,12,13,14,15]
+; AVX512BW-FCP-NEXT: vpermi2w %zmm6, %zmm5, %zmm2
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
-; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [16,16,16,5,11,17,23,29,16,16,16,5,11,17,23,29]
-; AVX512BW-FCP-NEXT: # ymm6 = mem[0,1,0,1]
-; AVX512BW-FCP-NEXT: vpermw %zmm5, %zmm6, %zmm6
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [32,33,34,5,11,17,23,29,40,41,42,5,11,17,23,29]
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0]
; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm7
-; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15]
+; AVX512BW-FCP-NEXT: vpermi2w %zmm7, %zmm5, %zmm6
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30]
; AVX512BW-FCP-NEXT: # ymm7 = mem[0,1,0,1]
@@ -2967,36 +2950,28 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-BW-LABEL: load_i16_stride6_vf16:
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26]
-; AVX512DQ-BW-NEXT: # ymm0 = mem[0,1,0,1]
+; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [32,33,34,2,8,14,20,26,40,41,42,2,8,14,20,26]
+; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0]
; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm3
; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm4
; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm5
-; AVX512DQ-BW-NEXT: vpermw %zmm5, %zmm0, %zmm0
-; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0]
; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm1
-; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
+; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm5, %zmm0
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27]
-; AVX512DQ-BW-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512DQ-BW-NEXT: vpermw %zmm5, %zmm1, %zmm1
+; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [32,33,34,3,9,15,21,27,40,41,42,3,9,15,21,27]
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0]
; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm2
-; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
+; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm5, %zmm1
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,16,16,4,10,16,22,28,16,16,16,4,10,16,22,28]
-; AVX512DQ-BW-NEXT: # ymm2 = mem[0,1,0,1]
-; AVX512DQ-BW-NEXT: vpermw %zmm5, %zmm2, %zmm2
+; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [32,33,34,4,10,16,22,28,40,41,42,4,10,16,22,28]
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0]
; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm6
-; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3,4,5,6,7],ymm6[8,9,10],ymm2[11,12,13,14,15]
+; AVX512DQ-BW-NEXT: vpermi2w %zmm6, %zmm5, %zmm2
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [16,16,16,5,11,17,23,29,16,16,16,5,11,17,23,29]
-; AVX512DQ-BW-NEXT: # ymm6 = mem[0,1,0,1]
-; AVX512DQ-BW-NEXT: vpermw %zmm5, %zmm6, %zmm6
+; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [32,33,34,5,11,17,23,29,40,41,42,5,11,17,23,29]
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0]
; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm7
-; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15]
+; AVX512DQ-BW-NEXT: vpermi2w %zmm7, %zmm5, %zmm6
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30]
; AVX512DQ-BW-NEXT: # ymm7 = mem[0,1,0,1]
@@ -3022,36 +2997,28 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-BW-FCP-LABEL: load_i16_stride6_vf16:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,0,0,2,8,14,20,26,0,0,0,2,8,14,20,26]
-; AVX512DQ-BW-FCP-NEXT: # ymm0 = mem[0,1,0,1]
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [32,33,34,2,8,14,20,26,40,41,42,2,8,14,20,26]
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm3
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm4
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm5, %zmm0, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
+; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm5, %zmm0
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27]
-; AVX512DQ-BW-FCP-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm5, %zmm1, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [32,33,34,3,9,15,21,27,40,41,42,3,9,15,21,27]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm2
-; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
+; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm5, %zmm1
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,16,16,4,10,16,22,28,16,16,16,4,10,16,22,28]
-; AVX512DQ-BW-FCP-NEXT: # ymm2 = mem[0,1,0,1]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm5, %zmm2, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [32,33,34,4,10,16,22,28,40,41,42,4,10,16,22,28]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm6
-; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3,4,5,6,7],ymm6[8,9,10],ymm2[11,12,13,14,15]
+; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm6, %zmm5, %zmm2
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [16,16,16,5,11,17,23,29,16,16,16,5,11,17,23,29]
-; AVX512DQ-BW-FCP-NEXT: # ymm6 = mem[0,1,0,1]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm5, %zmm6, %zmm6
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [32,33,34,5,11,17,23,29,40,41,42,5,11,17,23,29]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm7
-; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15]
+; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm7, %zmm5, %zmm6
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30]
; AVX512DQ-BW-FCP-NEXT: # ymm7 = mem[0,1,0,1]
@@ -4135,51 +4102,50 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
;
; AVX2-LABEL: load_i16_stride6_vf32:
; AVX2: # %bb.0:
-; AVX2-NEXT: subq $488, %rsp # imm = 0x1E8
-; AVX2-NEXT: vmovdqa (%rdi), %ymm5
+; AVX2-NEXT: subq $520, %rsp # imm = 0x208
+; AVX2-NEXT: vmovdqa (%rdi), %ymm12
+; AVX2-NEXT: vmovdqa 32(%rdi), %ymm5
; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovdqa 32(%rdi), %ymm7
-; AVX2-NEXT: vmovdqu %ymm7, (%rsp) # 32-byte Spill
; AVX2-NEXT: vmovdqa 64(%rdi), %ymm0
; AVX2-NEXT: vmovdqa 96(%rdi), %ymm1
-; AVX2-NEXT: vmovdqa 224(%rdi), %ymm10
-; AVX2-NEXT: vmovdqa 192(%rdi), %ymm11
+; AVX2-NEXT: vmovdqa 224(%rdi), %ymm7
+; AVX2-NEXT: vmovdqa 192(%rdi), %ymm10
; AVX2-NEXT: vmovdqa 288(%rdi), %ymm2
; AVX2-NEXT: vmovdqa 256(%rdi), %ymm3
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm3[2,3],ymm2[2,3]
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm3[0,1],ymm2[0,1]
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm3[2,3],ymm2[2,3]
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm15 = ymm3[0,1],ymm2[0,1]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29]
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm0[1],ymm2[2,3,4,5],ymm0[6],ymm2[7]
-; AVX2-NEXT: vpshufb %ymm6, %ymm1, %ymm4
-; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
-; AVX2-NEXT: vpshufb %xmm8, %xmm2, %xmm0
+; AVX2-NEXT: vpshufb %ymm8, %ymm1, %ymm4
+; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm12[2],ymm5[3,4],ymm12[5],ymm5[6,7]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = [8,9,2,3,0,1,12,13,8,9,12,13,12,13,14,15]
+; AVX2-NEXT: vpshufb %xmm9, %xmm2, %xmm0
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm9 = xmm3[2,2,2,2,4,5,6,7]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0],xmm9[1],xmm0[2,3],xmm9[4],xmm0[5,6,7]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm11 = xmm3[2,2,2,2,4,5,6,7]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0],xmm11[1],xmm0[2,3],xmm11[4],xmm0[5,6,7]
; AVX2-NEXT: vpmovsxbw {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0]
-; AVX2-NEXT: vpblendvb %ymm0, %ymm9, %ymm4, %ymm4
+; AVX2-NEXT: vpblendvb %ymm0, %ymm11, %ymm4, %ymm4
; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovdqa %ymm11, %ymm5
-; AVX2-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7]
-; AVX2-NEXT: vpshufb %xmm8, %xmm4, %xmm8
+; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1],ymm10[2],ymm7[3,4],ymm10[5],ymm7[6,7]
+; AVX2-NEXT: vpshufb %xmm9, %xmm4, %xmm11
; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm9
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm11 = xmm9[2,2,2,2,4,5,6,7]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm11[1],xmm8[2,3],xmm11[4],xmm8[5,6,7]
-; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm13 = xmm9[2,2,2,2,4,5,6,7]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm13[1],xmm11[2,3],xmm13[4],xmm11[5,6,7]
; AVX2-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7]
-; AVX2-NEXT: vpshufb %ymm6, %ymm11, %ymm6
-; AVX2-NEXT: vpblendvb %ymm0, %ymm8, %ymm6, %ymm6
+; AVX2-NEXT: vmovdqu %ymm15, (%rsp) # 32-byte Spill
+; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0],ymm15[1],ymm14[2,3,4,5],ymm15[6],ymm14[7]
+; AVX2-NEXT: vpshufb %ymm8, %ymm13, %ymm8
+; AVX2-NEXT: vpblendvb %ymm0, %ymm11, %ymm8, %ymm6
; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15]
-; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15]
+; AVX2-NEXT: vpshufb %xmm8, %xmm2, %xmm2
; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5]
; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7]
@@ -4187,23 +4153,23 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vpshufb %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpshufb %ymm3, %ymm11, %ymm1
-; AVX2-NEXT: vpshufb %xmm6, %xmm4, %xmm3
-; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm10[1],ymm5[2,3],ymm10[4],ymm5[5,6],ymm10[7]
+; AVX2-NEXT: vpshufb %ymm3, %ymm13, %ymm1
+; AVX2-NEXT: vpshufb %xmm8, %xmm4, %xmm3
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm7[1],ymm10[2,3],ymm7[4],ymm10[5,6],ymm7[7]
; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[1,1,2,3]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,12,13,12,13,4,5,8,9,10,11,12,13,14,15]
; AVX2-NEXT: vpblendvb %ymm0, %ymm3, %ymm1, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm9
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[0,2,0,3]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm1
+; AVX2-NEXT: vpshufb %xmm8, %xmm2, %xmm1
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3,4,5],ymm12[6],ymm14[7]
-; AVX2-NEXT: vpshufb %ymm15, %ymm12, %ymm1
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm15 = ymm15[0],ymm14[1],ymm15[2,3,4,5],ymm14[6],ymm15[7]
+; AVX2-NEXT: vpshufb %ymm10, %ymm15, %ymm1
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-NEXT: vmovdqa 352(%rdi), %ymm11
; AVX2-NEXT: vmovdqa 320(%rdi), %ymm13
@@ -4211,8 +4177,8 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[2,2,2,2,4,5,6,7]
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm8
-; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1,2],xmm3[3,4],xmm8[5,6,7]
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm7
+; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3,4],xmm7[5,6,7]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,4,5,4,5,6,7,0,1,12,13,8,9,4,5]
; AVX2-NEXT: vpshufb %xmm6, %xmm3, %xmm3
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
@@ -4221,28 +4187,28 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqa 160(%rdi), %ymm0
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovdqa 128(%rdi), %ymm14
-; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1],ymm0[2],ymm14[3,4],ymm0[5],ymm14[6,7]
+; AVX2-NEXT: vmovdqa 128(%rdi), %ymm3
+; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[2,2,2,2,4,5,6,7]
; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm5
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3,4],xmm5[5,6,7]
; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm6
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vpblendd $146, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7]
-; AVX2-NEXT: vpshufb %xmm7, %xmm0, %xmm7
+; AVX2-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm0 = ymm12[0],mem[1],ymm12[2,3],mem[4],ymm12[5,6],mem[7]
+; AVX2-NEXT: vpshufb %xmm8, %xmm0, %xmm8
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
-; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[0,2,0,3]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,6,6,7]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm10[2],xmm7[3],xmm10[4,5],xmm7[6,7]
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm10 = mem[0],ymm10[1],mem[2,3,4,5],ymm10[6],mem[7]
-; AVX2-NEXT: vpshufb %ymm15, %ymm10, %ymm15
-; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm15[3,4,5,6,7]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[0,2,0,3]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,6,6,7]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm12[2],xmm8[3],xmm12[4,5],xmm8[6,7]
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
+; AVX2-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm12 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm12 = mem[0],ymm14[1],mem[2,3,4,5],ymm14[6],mem[7]
+; AVX2-NEXT: vpshufb %ymm10, %ymm12, %ymm10
+; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm10[3,4,5,6,7]
; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3,4,5,6,7],ymm8[8,9,10],ymm6[11,12,13,14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm9[2,1,0,3]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,1,1,1,4,5,6,7]
@@ -4250,22 +4216,22 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm6[2],xmm2[3,4],xmm6[5],xmm2[6],xmm6[7]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
-; AVX2-NEXT: vpshufb %ymm6, %ymm12, %ymm7
+; AVX2-NEXT: vpshufb %ymm6, %ymm15, %ymm8
; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,0,4,5,6,7]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,7,6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm7[3,4,5,6,7]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
-; AVX2-NEXT: vpshufb %xmm7, %xmm8, %xmm8
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm8[3,4,5,6,7]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = [6,7,2,3,4,5,6,7,2,3,14,15,12,13,6,7]
+; AVX2-NEXT: vpshufb %xmm8, %xmm7, %xmm7
; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3],xmm8[4,5],xmm1[6],xmm8[7]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2],xmm1[3],xmm7[4,5],xmm1[6],xmm7[7]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpshufb %xmm7, %xmm5, %xmm1
+; AVX2-NEXT: vpshufb %xmm8, %xmm5, %xmm1
; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,5,5,5,5]
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7]
-; AVX2-NEXT: vpshufb %ymm6, %ymm10, %ymm2
+; AVX2-NEXT: vpshufb %ymm6, %ymm12, %ymm2
; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,1,0,3]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,1,1,1,4,5,6,7]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
@@ -4279,115 +4245,117 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm8 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
+; AVX2-NEXT: vpblendd $107, (%rsp), %ymm0, %ymm7 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm7 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm13[2],ymm11[3,4],ymm13[5],ymm11[6,7]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[0,1,2,1]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,1,0,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[0,0,0,0,4,5,6,7]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[0,1,2,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,1,0,3]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[0,0,0,0,4,5,6,7]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,6,5,6,4]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,6,5,6,4]
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7]
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[2,1,2,3]
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,3,2,1]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[0,0,2,3,4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[2,1,2,0,4,5,6,7]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
-; AVX2-NEXT: vpshufb %ymm11, %ymm8, %ymm2
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [20,21,16,17,28,29,0,0,20,21,16,17,28,29,0,0,20,21,16,17,28,29,0,0,20,21,16,17,28,29,0,0]
+; AVX2-NEXT: vpshufb %ymm1, %ymm7, %ymm2
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm2[2,1,2,3]
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,3,2,1]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,0,2,3,4,5,6,7]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,6,6,6]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm10[2,1,2,0,4,5,6,7]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm2[1,2],xmm5[3],xmm2[4,5,6,7]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9]
+; AVX2-NEXT: vpshufb %xmm8, %xmm7, %xmm5
+; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm5[5,6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,1,2,1]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,1,0,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[0,0,0,0,4,5,6,7]
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm13[2],ymm11[3,4],ymm13[5],ymm11[6,7]
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[0,1,2,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,1,0,3]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[0,0,0,0,4,5,6,7]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,6,5,6,4]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7]
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm12[0,1,2,3,6,5,6,4]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4],xmm0[5,6],xmm5[7]
+; AVX2-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm5 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm5 = mem[0,1],ymm14[2],mem[3],ymm14[4],mem[5,6],ymm14[7]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb %ymm1, %ymm5, %ymm1
+; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vpblendd $36, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,2,3]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[0,0,2,3,4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,3,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm12 = xmm1[2,1,2,0,4,5,6,7]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm15[1,2],xmm12[3],xmm15[4,5,6,7]
-; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX2-NEXT: vpshufb %ymm11, %ymm2, %ymm11
-; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0,1,2],ymm9[3,4,5,6,7],ymm11[8,9,10],ymm9[11,12,13,14,15]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,6,5,4]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3,4],xmm11[5,6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,5]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,1,1,1,4,5,6,7]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,7,7]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm7[4],xmm6[5,6],xmm7[7]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm10[3,1,2,1,4,5,6,7]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1,2],xmm7[3],xmm5[4,5,6,7]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19]
-; AVX2-NEXT: vpshufb %ymm7, %ymm8, %ymm8
-; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3,4,5,6,7],ymm8[8,9,10],ymm6[11,12,13,14,15]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,3,2]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm8[5,6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,5]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,6,6,6,6]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm14 = xmm1[2,1,2,0,4,5,6,7]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2],xmm14[3],xmm15[4,5,6,7]
+; AVX2-NEXT: vpshufb %xmm8, %xmm5, %xmm8
+; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm14[0,1,2,3,4],xmm8[5,6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm8[0,1,2,3],ymm9[4,5,6,7]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,5]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,1,1,1,4,5,6,7]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4],xmm4[5,6],xmm3[7]
-; AVX2-NEXT: vpshufb %ymm7, %ymm2, %ymm2
+; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4],xmm4[5,6],xmm6[7]
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [22,23,18,19,30,31,0,0,22,23,18,19,30,31,0,0,22,23,18,19,30,31,0,0,22,23,18,19,30,31,0,0]
+; AVX2-NEXT: vpshufb %ymm6, %ymm7, %ymm8
+; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm8[0,1,2],ymm4[3,4,5,6,7],ymm8[8,9,10],ymm4[11,12,13,14,15]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm10[3,1,2,1,4,5,6,7]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,3,4,5,6,7]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0],xmm3[1,2],xmm8[3],xmm3[4,5,6,7]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11]
+; AVX2-NEXT: vpshufb %xmm8, %xmm7, %xmm7
+; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm7[5,6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm12[0,1,2,3,7,5,6,5]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,1,1,1,4,5,6,7]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4],xmm2[5,6],xmm4[7]
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX2-NEXT: vpshufb %ymm6, %ymm5, %ymm4
+; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7],ymm4[8,9,10],ymm2[11,12,13,14,15]
+; AVX2-NEXT: vpshufb %xmm8, %xmm5, %xmm4
; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,1,4,5,6,7]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7]
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
-; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm1
-; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5,6],ymm13[7]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm4[5,6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0],ymm11[1],ymm13[2,3],ymm11[4],ymm13[5,6],ymm11[7]
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,1,0,2,4,5,6,7]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6]
-; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm4 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13]
-; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm6
-; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4],xmm3[5],xmm6[6,7]
-; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm3 = mem[0,1,2,3,4],ymm3[5,6,7]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[0,1,0,2,4,5,6,7]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,6,6,6]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,6,7,8,9,12,13,0,1,12,13]
+; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm6
+; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4],xmm4[5],xmm6[6,7]
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm4 = mem[0,1,2,3,4],ymm4[5,6,7]
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX2-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
; AVX2-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3],mem[4],ymm6[5,6],mem[7]
; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7
-; AVX2-NEXT: vpshufb %xmm4, %xmm7, %xmm4
+; AVX2-NEXT: vpshufb %xmm5, %xmm7, %xmm5
; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,1]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm6[0,1,0,2,4,5,6,7]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,6,6,6]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2,3],xmm4[4],xmm8[5],xmm4[6,7]
-; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm4 = mem[0,1,2,3,4],ymm4[5,6,7]
-; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm8 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2,3],xmm5[4],xmm8[5],xmm5[6,7]
+; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm5 = mem[0,1,2,3,4],ymm5[5,6,7]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,10,11,10,11,2,3,14,15]
; AVX2-NEXT: vpshufb %xmm8, %xmm2, %xmm2
; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3]
@@ -4413,94 +4381,96 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm6, 32(%rcx)
; AVX2-NEXT: vmovdqa %ymm9, (%rcx)
-; AVX2-NEXT: vmovdqa %ymm5, 32(%r8)
+; AVX2-NEXT: vmovdqa %ymm3, 32(%r8)
; AVX2-NEXT: vmovdqa %ymm0, (%r8)
-; AVX2-NEXT: vmovdqa %ymm4, 32(%r9)
-; AVX2-NEXT: vmovdqa %ymm3, (%r9)
+; AVX2-NEXT: vmovdqa %ymm5, 32(%r9)
+; AVX2-NEXT: vmovdqa %ymm4, (%r9)
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT: vmovdqa %ymm2, 32(%rax)
; AVX2-NEXT: vmovdqa %ymm1, (%rax)
-; AVX2-NEXT: addq $488, %rsp # imm = 0x1E8
+; AVX2-NEXT: addq $520, %rsp # imm = 0x208
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX2-FP-LABEL: load_i16_stride6_vf32:
; AVX2-FP: # %bb.0:
-; AVX2-FP-NEXT: subq $456, %rsp # imm = 0x1C8
-; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm5
-; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm7
-; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: subq $520, %rsp # imm = 0x208
+; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm4
+; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm12
; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm0
; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm1
; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm9
; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm10
; AVX2-FP-NEXT: vmovdqa 288(%rdi), %ymm2
; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm3
-; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm3[2,3],ymm2[2,3]
-; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm3[0,1],ymm2[0,1]
+; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm3[2,3],ymm2[2,3]
+; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm15 = ymm3[0,1],ymm2[0,1]
; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
-; AVX2-FP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29]
+; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm0[1],ymm2[2,3,4,5],ymm0[6],ymm2[7]
-; AVX2-FP-NEXT: vpshufb %ymm4, %ymm1, %ymm6
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
-; AVX2-FP-NEXT: vpshufb %xmm7, %xmm2, %xmm0
+; AVX2-FP-NEXT: vpshufb %ymm7, %ymm1, %ymm6
+; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm4[2],ymm12[3,4],ymm4[5],ymm12[6,7]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm8 = [8,9,2,3,0,1,12,13,8,9,12,13,12,13,14,15]
+; AVX2-FP-NEXT: vpshufb %xmm8, %xmm2, %xmm0
; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm3[2,2,2,2,4,5,6,7]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0],xmm8[1],xmm0[2,3],xmm8[4],xmm0[5,6,7]
+; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm3[2,2,2,2,4,5,6,7]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0],xmm11[1],xmm0[2,3],xmm11[4],xmm0[5,6,7]
; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0]
-; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm8, %ymm6, %ymm5
-; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm11, %ymm6, %ymm4
+; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7]
-; AVX2-FP-NEXT: vpshufb %xmm7, %xmm6, %xmm7
-; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm8
-; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm8[2,2,2,2,4,5,6,7]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm11[1],xmm7[2,3],xmm11[4],xmm7[5,6,7]
-; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7]
-; AVX2-FP-NEXT: vpshufb %ymm4, %ymm11, %ymm4
-; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm7, %ymm4, %ymm4
-; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm4 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
-; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-FP-NEXT: vpshufb %xmm8, %xmm6, %xmm8
+; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm11
+; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm13 = xmm11[2,2,2,2,4,5,6,7]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm13[1],xmm8[2,3],xmm13[4],xmm8[5,6,7]
+; AVX2-FP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vmovdqa %ymm15, %ymm4
+; AVX2-FP-NEXT: vmovdqu %ymm15, (%rsp) # 32-byte Spill
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0],ymm15[1],ymm14[2,3,4,5],ymm15[6],ymm14[7]
+; AVX2-FP-NEXT: vpshufb %ymm7, %ymm13, %ymm7
+; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm8, %ymm7, %ymm5
+; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
+; AVX2-FP-NEXT: vpshufb %xmm7, %xmm3, %xmm3
+; AVX2-FP-NEXT: vpshufb %xmm7, %xmm2, %xmm2
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31]
; AVX2-FP-NEXT: vpshufb %ymm3, %ymm1, %ymm1
; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1
; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vpshufb %ymm3, %ymm11, %ymm1
-; AVX2-FP-NEXT: vpshufb %xmm4, %xmm8, %xmm3
-; AVX2-FP-NEXT: vpshufb %xmm4, %xmm6, %xmm4
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6],ymm9[7]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6,7]
+; AVX2-FP-NEXT: vpshufb %ymm3, %ymm13, %ymm1
+; AVX2-FP-NEXT: vpshufb %xmm7, %xmm11, %xmm3
+; AVX2-FP-NEXT: vpshufb %xmm7, %xmm6, %xmm6
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6],ymm9[7]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2,3],xmm3[4],xmm6[5,6,7]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15]
; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm3, %ymm1, %ymm0
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vpshufb %xmm15, %xmm5, %xmm0
-; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm1
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[2,1,0,3]
-; AVX2-FP-NEXT: vpshufb %xmm15, %xmm11, %xmm1
+; AVX2-FP-NEXT: vpshufb %xmm15, %xmm7, %xmm0
+; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm1
+; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[2,1,0,3]
+; AVX2-FP-NEXT: vpshufb %xmm15, %xmm10, %xmm1
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0],ymm12[1],ymm14[2,3,4,5],ymm12[6],ymm14[7]
-; AVX2-FP-NEXT: vpshufb %ymm13, %ymm10, %ymm1
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0],ymm14[1],ymm4[2,3,4,5],ymm14[6],ymm4[7]
+; AVX2-FP-NEXT: vpshufb %ymm13, %ymm9, %ymm1
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-FP-NEXT: vmovdqa 352(%rdi), %ymm1
; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vmovdqa 320(%rdi), %ymm2
-; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
+; AVX2-FP-NEXT: vmovdqa 320(%rdi), %ymm14
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1],ymm1[2],ymm14[3,4],ymm1[5],ymm14[6,7]
+; AVX2-FP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5]
-; AVX2-FP-NEXT: vpshufb %xmm2, %xmm7, %xmm6
-; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm9
-; AVX2-FP-NEXT: vpshufb %xmm2, %xmm9, %xmm8
+; AVX2-FP-NEXT: vpshufb %xmm2, %xmm3, %xmm6
+; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm11
+; AVX2-FP-NEXT: vpshufb %xmm2, %xmm11, %xmm8
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3],xmm8[4,5],xmm6[6],xmm8[7]
; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0,1,2],ymm6[3,4,5,6,7],ymm0[8,9,10],ymm6[11,12,13,14,15]
@@ -4508,187 +4478,188 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm0
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm8
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7]
+; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm1
+; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
; AVX2-FP-NEXT: vpshufb %xmm2, %xmm1, %xmm0
-; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm4
-; AVX2-FP-NEXT: vpshufb %xmm2, %xmm4, %xmm2
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1,2],xmm0[3],xmm2[4,5],xmm0[6],xmm2[7]
-; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
-; AVX2-FP-NEXT: # ymm2 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm6
+; AVX2-FP-NEXT: vpshufb %xmm2, %xmm6, %xmm2
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm2[0,1,2],xmm0[3],xmm2[4,5],xmm0[6],xmm2[7]
+; AVX2-FP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload
+; AVX2-FP-NEXT: # ymm2 = mem[0],ymm12[1],mem[2,3],ymm12[4],mem[5,6],ymm12[7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm12
; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,0,3]
; AVX2-FP-NEXT: vpshufb %xmm15, %xmm2, %xmm0
; AVX2-FP-NEXT: vpshufb %xmm15, %xmm12, %xmm15
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm15[2],xmm0[3],xmm15[4,5],xmm0[6,7]
-; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FP-NEXT: vpblendd $189, (%rsp), %ymm6, %ymm15 # 32-byte Folded Reload
-; AVX2-FP-NEXT: # ymm15 = mem[0],ymm6[1],mem[2,3,4,5],ymm6[6],mem[7]
-; AVX2-FP-NEXT: vpshufb %ymm13, %ymm15, %ymm13
+; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0],ymm15[1],ymm8[2,3,4,5],ymm15[6],ymm8[7]
+; AVX2-FP-NEXT: vpshufb %ymm13, %ymm4, %ymm13
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm13[3,4,5,6,7]
-; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3,4,5,6,7],ymm0[8,9,10],ymm3[11,12,13,14,15]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm0[0,1,2],ymm5[3,4,5,6,7],ymm0[8,9,10],ymm5[11,12,13,14,15]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm6 = [6,7,2,3,12,13,14,15,6,7,2,3,12,13,14,15]
-; AVX2-FP-NEXT: vpshufb %xmm6, %xmm5, %xmm3
-; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[1,1,1,1,4,5,6,7]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2],xmm3[3,4],xmm5[5],xmm3[6],xmm5[7]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
-; AVX2-FP-NEXT: vpshufb %ymm5, %ymm10, %ymm10
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [2,3,6,7,4,5,0,1,10,11,14,15,12,13,14,15]
-; AVX2-FP-NEXT: vpshufb %xmm0, %xmm3, %xmm3
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm10[3,4,5,6,7]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm10 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
-; AVX2-FP-NEXT: vpshufb %xmm10, %xmm9, %xmm9
-; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1,2],xmm7[3],xmm9[4,5],xmm7[6],xmm9[7]
-; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0,1,2],ymm7[3,4,5,6,7],ymm3[8,9,10],ymm7[11,12,13,14,15]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,2,3,12,13,14,15,8,9,10,11,12,13,14,15]
+; AVX2-FP-NEXT: vpshufb %xmm0, %xmm7, %xmm5
+; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm10[1,1,1,1,4,5,6,7]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2],xmm5[3,4],xmm7[5],xmm5[6],xmm7[7]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
+; AVX2-FP-NEXT: vpshufb %ymm7, %ymm9, %ymm9
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm10 = [2,3,6,7,4,5,0,1,10,11,14,15,12,13,14,15]
+; AVX2-FP-NEXT: vpshufb %xmm10, %xmm5, %xmm5
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm9[3,4,5,6,7]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm9 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
+; AVX2-FP-NEXT: vpshufb %xmm9, %xmm11, %xmm11
+; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0,1,2],xmm3[3],xmm11[4,5],xmm3[6],xmm11[7]
+; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3,4,5,6,7],ymm5[8,9,10],ymm3[11,12,13,14,15]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vpshufb %xmm10, %xmm4, %xmm3
+; AVX2-FP-NEXT: vpshufb %xmm9, %xmm6, %xmm3
; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7]
-; AVX2-FP-NEXT: vpshufb %ymm5, %ymm15, %ymm3
-; AVX2-FP-NEXT: vpshufb %xmm6, %xmm2, %xmm2
-; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[1,1,1,1,4,5,6,7]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3,4],xmm4[5],xmm2[6],xmm4[7]
+; AVX2-FP-NEXT: vpshufb %ymm7, %ymm4, %ymm3
; AVX2-FP-NEXT: vpshufb %xmm0, %xmm2, %xmm0
+; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[1,1,1,1,4,5,6,7]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3,4],xmm2[5],xmm0[6],xmm2[7]
+; AVX2-FP-NEXT: vpshufb %xmm10, %xmm0, %xmm0
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7]
; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
-; AVX2-FP-NEXT: # ymm14 = ymm14[0,1],mem[2],ymm14[3],mem[4],ymm14[5,6],mem[7]
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-FP-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,1,0,3]
+; AVX2-FP-NEXT: vpblendd $107, (%rsp), %ymm0, %ymm4 # 32-byte Folded Reload
+; AVX2-FP-NEXT: # ymm4 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
+; AVX2-FP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload
+; AVX2-FP-NEXT: # ymm0 = mem[0,1],ymm14[2],mem[3,4],ymm14[5],mem[6,7]
+; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,1,0,3]
; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,2,1]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u]
-; AVX2-FP-NEXT: vpshufb %xmm0, %xmm5, %xmm1
-; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,6,5,6,4]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm3[4],xmm1[5,6],xmm3[7]
-; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-FP-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
-; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm6
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[0,3,2,1]
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,2,3]
-; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm6 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0]
-; AVX2-FP-NEXT: vpshufb %xmm6, %xmm10, %xmm9
-; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm2[2,1,2,0,4,5,6,7]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm9 = xmm12[0],xmm9[1,2],xmm12[3],xmm9[4,5,6,7]
-; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm15 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
-; AVX2-FP-NEXT: vpshufb %ymm15, %ymm14, %ymm12
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm12[0,1,2],ymm3[3,4,5,6,7],ymm12[8,9,10],ymm3[11,12,13,14,15]
-; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,6,5,4]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm12[5,6,7]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm3[4,5,6,7]
-; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7]
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm12 = xmm9[2,1,0,3]
-; AVX2-FP-NEXT: vpshufb %xmm0, %xmm12, %xmm0
-; AVX2-FP-NEXT: vextracti128 $1, %ymm9, %xmm9
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1]
-; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm13 = xmm9[0,1,2,3,6,5,6,4]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm13[4],xmm0[5,6],xmm13[7]
-; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT: vpblendd $107, (%rsp), %ymm0, %ymm13 # 32-byte Folded Reload
-; AVX2-FP-NEXT: # ymm13 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
-; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-FP-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
-; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm11
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,3,2,1]
-; AVX2-FP-NEXT: vpshufb %xmm6, %xmm11, %xmm6
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
-; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[2,1,2,0,4,5,6,7]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1,2],xmm3[3],xmm6[4,5,6,7]
-; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FP-NEXT: vpshufb %ymm15, %ymm13, %ymm6
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7],ymm6[8,9,10],ymm1[11,12,13,14,15]
-; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm6[5,6,7]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u]
-; AVX2-FP-NEXT: vpshufb %xmm1, %xmm5, %xmm3
-; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,5]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7]
-; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm5 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0]
-; AVX2-FP-NEXT: vpshufb %xmm5, %xmm10, %xmm4
-; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,1,4,5,6,7]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2],xmm2[3],xmm4[4,5,6,7]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19]
-; AVX2-FP-NEXT: vpshufb %ymm10, %ymm14, %ymm4
-; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15]
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,2]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm4[5,6,7]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX2-FP-NEXT: vpshufb %xmm1, %xmm12, %xmm1
-; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,7,5,6,5]
+; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,1,2,1]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,0,1,0,1,0,1,8,9,8,9,12,13,14,15]
+; AVX2-FP-NEXT: vpshufb %xmm0, %xmm6, %xmm1
+; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,6,5,6,4]
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7]
-; AVX2-FP-NEXT: vpshufb %ymm10, %ymm13, %ymm2
-; AVX2-FP-NEXT: vpshufb %xmm5, %xmm11, %xmm3
-; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2],xmm0[3],xmm3[4,5,6,7]
; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm11 = [20,21,16,17,28,29,0,0,20,21,16,17,28,29,0,0,20,21,16,17,28,29,0,0,20,21,16,17,28,29,0,0]
+; AVX2-FP-NEXT: vpshufb %ymm11, %ymm4, %ymm2
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6],ymm7[7]
+; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-FP-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm7
+; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm10 = xmm7[0,3,2,1]
+; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,1,2,3]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13]
+; AVX2-FP-NEXT: vpshufb %xmm9, %xmm10, %xmm7
+; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm3[2,1,2,0,4,5,6,7]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm7 = xmm12[0],xmm7[1,2],xmm12[3],xmm7[4,5,6,7]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm14 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9]
+; AVX2-FP-NEXT: vpshufb %xmm14, %xmm4, %xmm12
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4],xmm12[5,6,7]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm7[0,1],ymm13[2],ymm7[3,4],ymm13[5],ymm7[6,7]
+; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[2,1,0,3]
+; AVX2-FP-NEXT: vpshufb %xmm0, %xmm2, %xmm0
+; AVX2-FP-NEXT: vextracti128 $1, %ymm12, %xmm1
+; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
+; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm12 = xmm1[0,1,2,3,6,5,6,4]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm12[4],xmm0[5,6],xmm12[7]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm8[0,1],ymm15[2],ymm8[3],ymm15[4],ymm8[5,6],ymm15[7]
+; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FP-NEXT: vpshufb %ymm11, %ymm12, %ymm8
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6,7],ymm8[8,9,10],ymm0[11,12,13,14,15]
+; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
+; AVX2-FP-NEXT: # ymm8 = mem[0,1],ymm8[2],mem[3,4],ymm8[5],mem[6,7]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm15
+; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,3,2,1]
+; AVX2-FP-NEXT: vpshufb %xmm9, %xmm15, %xmm9
+; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,1,2,3]
+; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm8[2,1,2,0,4,5,6,7]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0],xmm9[1,2],xmm11[3],xmm9[4,5,6,7]
+; AVX2-FP-NEXT: vpshufb %xmm14, %xmm12, %xmm11
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm11[5,6,7]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [2,3,2,3,2,3,2,3,8,9,10,11,14,15,14,15]
+; AVX2-FP-NEXT: vpshufb %xmm0, %xmm6, %xmm6
+; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,5,6,5]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4],xmm6[5,6],xmm5[7]
+; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm6 = [22,23,18,19,30,31,0,0,22,23,18,19,30,31,0,0,22,23,18,19,30,31,0,0,22,23,18,19,30,31,0,0]
+; AVX2-FP-NEXT: vpshufb %ymm6, %ymm4, %ymm11
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm11[0,1,2],ymm5[3,4,5,6,7],ymm11[8,9,10],ymm5[11,12,13,14,15]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm11 = [u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15]
+; AVX2-FP-NEXT: vpshufb %xmm11, %xmm10, %xmm10
+; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,1,4,5,6,7]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm10[1,2],xmm3[3],xmm10[4,5,6,7]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm10 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11]
+; AVX2-FP-NEXT: vpshufb %xmm10, %xmm4, %xmm4
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm4[5,6,7]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm5[4,5,6,7]
+; AVX2-FP-NEXT: vpshufb %xmm0, %xmm2, %xmm0
+; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,5]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7]
+; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FP-NEXT: vpshufb %ymm6, %ymm12, %ymm1
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
+; AVX2-FP-NEXT: vpshufb %xmm10, %xmm12, %xmm1
+; AVX2-FP-NEXT: vpshufb %xmm11, %xmm15, %xmm2
+; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm8[3,1,2,1,4,5,6,7]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3],xmm2[4,5,6,7]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6,7]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0],ymm7[1],ymm13[2,3],ymm7[4],ymm13[5,6],ymm7[7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13]
; AVX2-FP-NEXT: vpshufb %xmm3, %xmm2, %xmm5
-; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm7
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2,3],xmm5[4],xmm7[5],xmm5[6,7]
+; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm6
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4],xmm6[5],xmm5[6,7]
; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm5 = mem[0,1,2,3,4],ymm5[5,6,7]
-; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FP-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
-; AVX2-FP-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3],mem[4],ymm7[5,6],mem[7]
-; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm8
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,3,2,1]
-; AVX2-FP-NEXT: vpshufb %xmm3, %xmm8, %xmm9
-; AVX2-FP-NEXT: vpshufb %xmm3, %xmm7, %xmm3
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm9[4],xmm3[5],xmm9[6,7]
+; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FP-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
+; AVX2-FP-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3],mem[4],ymm6[5,6],mem[7]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm8
+; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,1]
+; AVX2-FP-NEXT: vpshufb %xmm3, %xmm8, %xmm10
+; AVX2-FP-NEXT: vpshufb %xmm3, %xmm6, %xmm3
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm10[4],xmm3[5],xmm10[6,7]
; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm3 = mem[0,1,2,3,4],ymm3[5,6,7]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15]
-; AVX2-FP-NEXT: vpshufb %xmm9, %xmm2, %xmm2
-; AVX2-FP-NEXT: vpshufb %xmm9, %xmm1, %xmm1
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15]
+; AVX2-FP-NEXT: vpshufb %xmm10, %xmm2, %xmm2
+; AVX2-FP-NEXT: vpshufb %xmm10, %xmm1, %xmm1
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6,7]
; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-FP-NEXT: vpshufb %xmm9, %xmm8, %xmm2
-; AVX2-FP-NEXT: vpshufb %xmm9, %xmm7, %xmm7
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1,2,3],xmm2[4],xmm7[5],xmm2[6,7]
+; AVX2-FP-NEXT: vpshufb %xmm10, %xmm8, %xmm2
+; AVX2-FP-NEXT: vpshufb %xmm10, %xmm6, %xmm6
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1,2,3],xmm2[4],xmm6[5],xmm2[6,7]
; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FP-NEXT: vmovaps %ymm7, 32(%rsi)
-; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FP-NEXT: vmovaps %ymm7, (%rsi)
-; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FP-NEXT: vmovaps %ymm7, 32(%rdx)
-; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FP-NEXT: vmovaps %ymm7, (%rdx)
-; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FP-NEXT: vmovaps %ymm7, 32(%rcx)
-; AVX2-FP-NEXT: vmovdqa %ymm6, (%rcx)
+; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FP-NEXT: vmovaps %ymm6, 32(%rsi)
+; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FP-NEXT: vmovaps %ymm6, (%rsi)
+; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FP-NEXT: vmovaps %ymm6, 32(%rdx)
+; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FP-NEXT: vmovaps %ymm6, (%rdx)
+; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FP-NEXT: vmovaps %ymm6, 32(%rcx)
+; AVX2-FP-NEXT: vmovdqa %ymm9, (%rcx)
; AVX2-FP-NEXT: vmovdqa %ymm4, 32(%r8)
; AVX2-FP-NEXT: vmovdqa %ymm0, (%r8)
; AVX2-FP-NEXT: vmovdqa %ymm3, 32(%r9)
@@ -4696,247 +4667,246 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FP-NEXT: vmovdqa %ymm2, 32(%rax)
; AVX2-FP-NEXT: vmovdqa %ymm1, (%rax)
-; AVX2-FP-NEXT: addq $456, %rsp # imm = 0x1C8
+; AVX2-FP-NEXT: addq $520, %rsp # imm = 0x208
; AVX2-FP-NEXT: vzeroupper
; AVX2-FP-NEXT: retq
;
; AVX2-FCP-LABEL: load_i16_stride6_vf32:
; AVX2-FCP: # %bb.0:
-; AVX2-FCP-NEXT: subq $456, %rsp # imm = 0x1C8
-; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm5
+; AVX2-FCP-NEXT: subq $520, %rsp # imm = 0x208
+; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm4
+; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm5
; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm7
-; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm0
; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm1
-; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm9
-; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm10
+; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm10
+; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm9
; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm2
; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm3
-; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm3[2,3],ymm2[2,3]
-; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm3[0,1],ymm2[0,1]
+; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm3[2,3],ymm2[2,3]
+; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm15 = ymm3[0,1],ymm2[0,1]
; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm1[0,1]
-; AVX2-FCP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29]
+; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm0[1],ymm2[2,3,4,5],ymm0[6],ymm2[7]
-; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm6
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7]
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
-; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm0
+; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm6
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [8,9,2,3,0,1,12,13,8,9,12,13,12,13,14,15]
+; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm2, %xmm0
; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm3[2,2,2,2,4,5,6,7]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0],xmm8[1],xmm0[2,3],xmm8[4],xmm0[5,6,7]
+; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm3[2,2,2,2,4,5,6,7]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0],xmm11[1],xmm0[2,3],xmm11[4],xmm0[5,6,7]
; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0]
-; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm8, %ymm6, %ymm5
-; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm11, %ymm6, %ymm4
+; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7]
-; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm7
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm8
-; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm8[2,2,2,2,4,5,6,7]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm11[1],xmm7[2,3],xmm11[4],xmm7[5,6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7]
-; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm11, %ymm4
-; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm7, %ymm4, %ymm4
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7]
+; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm6, %xmm8
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm11
+; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm13 = xmm11[2,2,2,2,4,5,6,7]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm13[1],xmm8[2,3],xmm13[4],xmm8[5,6,7]
+; AVX2-FCP-NEXT: vmovdqu %ymm14, (%rsp) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0],ymm15[1],ymm14[2,3,4,5],ymm15[6],ymm14[7]
+; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm13, %ymm7
+; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm8, %ymm7, %ymm4
; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
-; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7]
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
+; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm3
+; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm2
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7]
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31]
; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1
; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1
; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm11, %ymm1
-; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm8, %xmm3
-; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm4
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6],ymm9[7]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6,7]
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15]
+; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm13, %ymm1
+; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm11, %xmm2
+; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm3
+; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [6,5,4,7]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5,6],ymm10[7]
; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm3, %ymm1, %ymm0
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm5, %xmm0
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm1
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[2,1,0,3]
-; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm11, %xmm1
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7]
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0],ymm12[1],ymm14[2,3,4,5],ymm12[6],ymm14[7]
-; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm10, %ymm1
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm1
-; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm2
-; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
+; AVX2-FCP-NEXT: vpermd %ymm4, %ymm5, %ymm13
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15]
+; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm13, %xmm0
+; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm1
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7]
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0],ymm14[1],ymm15[2,3,4,5],ymm14[6],ymm15[7]
+; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm8, %ymm6
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3,4,5,6,7]
+; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm14
+; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm15
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7]
+; AVX2-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5]
; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm7, %xmm6
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm9
-; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm9, %xmm8
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3],xmm8[4,5],xmm6[6],xmm8[7]
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm11
+; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm11, %xmm9
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3],xmm9[4,5],xmm6[6],xmm9[7]
; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0,1,2],ymm6[3,4,5,6,7],ymm0[8,9,10],ymm6[11,12,13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm0
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm8
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7]
+; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm1
+; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm0
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4
-; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm2
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm6
+; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm6, %xmm2
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1,2],xmm0[3],xmm2[4,5],xmm0[6],xmm2[7]
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm2 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7]
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm12
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,0,3]
-; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm2, %xmm0
-; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm12, %xmm15
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm15[2],xmm0[3],xmm15[4,5],xmm0[6,7]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FCP-NEXT: vpblendd $189, (%rsp), %ymm6, %ymm15 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm15 = mem[0],ymm6[1],mem[2,3,4,5],ymm6[6],mem[7]
-; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm15, %ymm13
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm13[3,4,5,6,7]
+; AVX2-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm2 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
+; AVX2-FCP-NEXT: vpermd %ymm2, %ymm5, %ymm5
+; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm0
+; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm10
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm10[2],xmm0[3],xmm10[4,5],xmm0[6,7]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FCP-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm10 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm10 = ymm9[0],mem[1],ymm9[2,3,4,5],mem[6],ymm9[7]
+; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm10, %ymm12
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm12[3,4,5,6,7]
; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3,4,5,6,7],ymm0[8,9,10],ymm3[11,12,13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm6 = [6,7,2,3,12,13,14,15,6,7,2,3,12,13,14,15]
-; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm3
-; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[1,1,1,1,4,5,6,7]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2],xmm3[3,4],xmm5[5],xmm3[6],xmm5[7]
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
-; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm10, %ymm10
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [2,3,6,7,4,5,0,1,10,11,14,15,12,13,14,15]
-; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm3
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm10[3,4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
-; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm9, %xmm9
+; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm13[1,1,1,1,4,5,6,7]
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,2,3,12,13,14,15,8,9,10,11,12,13,14,15]
+; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm4, %xmm4
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3,4],xmm0[5],xmm4[6],xmm0[7]
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
+; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm8, %ymm8
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [2,3,6,7,4,5,0,1,10,11,14,15,12,13,14,15]
+; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm0, %xmm0
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3,4,5,6,7]
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
+; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm11, %xmm11
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1,2],xmm7[3],xmm9[4,5],xmm7[6],xmm9[7]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm11[0,1,2],xmm7[3],xmm11[4,5],xmm7[6],xmm11[7]
; AVX2-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0,1,2],ymm7[3,4,5,6,7],ymm3[8,9,10],ymm7[11,12,13,14,15]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm3
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0,1,2],ymm7[3,4,5,6,7],ymm0[8,9,10],ymm7[11,12,13,14,15]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
+; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm6, %xmm0
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7]
-; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm15, %ymm3
-; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm2
-; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[1,1,1,1,4,5,6,7]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3,4],xmm4[5],xmm2[6],xmm4[7]
-; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm0
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7]
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
+; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm1
+; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[1,1,1,1,4,5,6,7]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6],xmm2[7]
+; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm1
+; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm10, %ymm2
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
+; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm14 = ymm14[0,1],mem[2],ymm14[3],mem[4],ymm14[5,6],mem[7]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
+; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm4 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7]
+; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,5,6,5]
+; AVX2-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm3
; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,1,0,3]
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,2,1]
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u]
-; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm1
-; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,6,5,6,4]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm3[4],xmm1[5,6],xmm3[7]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm6
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[0,3,2,1]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,2,3]
-; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm6 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0]
-; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm10, %xmm9
-; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm2[2,1,2,0,4,5,6,7]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm12[0],xmm9[1,2],xmm12[3],xmm9[4,5,6,7]
+; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,6,5,6,4]
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,0,1,0,1,0,1,8,9,8,9,12,13,14,15]
+; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm5, %xmm2
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4],xmm2[5,6],xmm0[7]
+; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm8 = [20,21,16,17,28,29,0,0,20,21,16,17,28,29,0,0,20,21,16,17,28,29,0,0,20,21,16,17,28,29,0,0]
+; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm2
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm2 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7]
+; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [4,7,6,5]
+; AVX2-FCP-NEXT: vpermd %ymm2, %ymm7, %ymm10
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13]
+; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm10, %xmm11
+; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm15 = xmm2[2,1,2,0,4,5,6,7]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm15[0],xmm11[1,2],xmm15[3],xmm11[4,5,6,7]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3,4],xmm14[5,6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm15[2],ymm13[3,4],ymm15[5],ymm13[6,7]
+; AVX2-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm1
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm0, %xmm12
+; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm14 = xmm1[0,1,2,3,6,5,6,4]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm12[0,1,2,3],xmm14[4],xmm12[5,6],xmm14[7]
+; AVX2-FCP-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm12 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm12 = ymm9[0,1],mem[2],ymm9[3],mem[4],ymm9[5,6],mem[7]
+; AVX2-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm12, %ymm8
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2],ymm14[3,4,5,6,7],ymm8[8,9,10],ymm14[11,12,13,14,15]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm14 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm14 = mem[0,1],ymm9[2],mem[3,4],ymm9[5],mem[6,7]
+; AVX2-FCP-NEXT: vpermd %ymm14, %ymm7, %ymm7
+; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm6
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[2,1,2,3]
+; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm14[2,1,2,0,4,5,6,7]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm11[0],xmm6[1,2],xmm11[3],xmm6[4,5,6,7]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm11[5,6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,5]
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,2,3,2,3,2,3,8,9,10,11,14,15,14,15]
+; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm5
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4],xmm5[5,6],xmm3[7]
; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
-; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm14, %ymm12
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm12[0,1,2],ymm3[3,4,5,6,7],ymm12[8,9,10],ymm3[11,12,13,14,15]
-; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,6,5,4]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm12[5,6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm3[4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm9[2,1,0,3]
-; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm0
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm9, %xmm9
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1]
-; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm13 = xmm9[0,1,2,3,6,5,6,4]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm13[4],xmm0[5,6],xmm13[7]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT: vpblendd $107, (%rsp), %ymm0, %ymm13 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm13 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm11
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,3,2,1]
-; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm11, %xmm6
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
-; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[2,1,2,0,4,5,6,7]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1,2],xmm3[3],xmm6[4,5,6,7]
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm13, %ymm6
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7],ymm6[8,9,10],ymm1[11,12,13,14,15]
-; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm6[5,6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u]
-; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm3
-; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,5]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7]
-; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm5 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0]
-; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm10, %xmm4
+; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm5 = [22,23,18,19,30,31,0,0,22,23,18,19,30,31,0,0,22,23,18,19,30,31,0,0,22,23,18,19,30,31,0,0]
+; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm11
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm11[0,1,2],ymm3[3,4,5,6,7],ymm11[8,9,10],ymm3[11,12,13,14,15]
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15]
+; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm10, %xmm10
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,1,4,5,6,7]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2],xmm2[3],xmm4[4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19]
-; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm14, %ymm4
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,2]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm10[1,2],xmm2[3],xmm10[4,5,6,7]
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11]
+; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm4
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm4[5,6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm12, %xmm1
-; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,7,5,6,5]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7]
-; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm13, %ymm2
-; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm11, %xmm3
-; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2],xmm0[3],xmm3[4,5,6,7]
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6],ymm7[7]
+; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm0
+; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,5]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7]
+; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm12, %ymm1
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
+; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm12, %xmm1
+; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm7, %xmm2
+; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm14[3,1,2,1,4,5,6,7]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3],xmm2[4,5,6,7]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0],ymm13[1],ymm15[2,3],ymm13[4],ymm15[5,6],ymm13[7]
; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13]
; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm5
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm7
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2,3],xmm5[4],xmm7[5],xmm5[6,7]
+; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm6
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4],xmm6[5],xmm5[6,7]
; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm5 = mem[0,1,2,3,4],ymm5[5,6,7]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FCP-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3],mem[4],ymm7[5,6],mem[7]
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,3,2,1]
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm9
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm3
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FCP-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3],mem[4],ymm6[5,6],mem[7]
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,1]
+; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm9
+; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm3
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm9[4],xmm3[5],xmm9[6,7]
; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
@@ -4948,23 +4918,23 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm8, %xmm2
-; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm7, %xmm7
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1,2,3],xmm2[4],xmm7[5],xmm2[6,7]
+; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm7, %xmm2
+; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm6, %xmm6
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1,2,3],xmm2[4],xmm6[5],xmm2[6,7]
; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FCP-NEXT: vmovaps %ymm7, 32(%rsi)
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FCP-NEXT: vmovaps %ymm7, (%rsi)
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FCP-NEXT: vmovaps %ymm7, 32(%rdx)
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FCP-NEXT: vmovaps %ymm7, (%rdx)
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FCP-NEXT: vmovaps %ymm7, 32(%rcx)
-; AVX2-FCP-NEXT: vmovdqa %ymm6, (%rcx)
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FCP-NEXT: vmovaps %ymm6, 32(%rsi)
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FCP-NEXT: vmovaps %ymm6, (%rsi)
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FCP-NEXT: vmovaps %ymm6, 32(%rdx)
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FCP-NEXT: vmovaps %ymm6, (%rdx)
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-FCP-NEXT: vmovaps %ymm6, 32(%rcx)
+; AVX2-FCP-NEXT: vmovdqa %ymm8, (%rcx)
; AVX2-FCP-NEXT: vmovdqa %ymm4, 32(%r8)
; AVX2-FCP-NEXT: vmovdqa %ymm0, (%r8)
; AVX2-FCP-NEXT: vmovdqa %ymm3, 32(%r9)
@@ -4972,231 +4942,229 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FCP-NEXT: vmovdqa %ymm2, 32(%rax)
; AVX2-FCP-NEXT: vmovdqa %ymm1, (%rax)
-; AVX2-FCP-NEXT: addq $456, %rsp # imm = 0x1C8
+; AVX2-FCP-NEXT: addq $520, %rsp # imm = 0x208
; AVX2-FCP-NEXT: vzeroupper
; AVX2-FCP-NEXT: retq
;
; AVX512-LABEL: load_i16_stride6_vf32:
; AVX512: # %bb.0:
-; AVX512-NEXT: subq $72, %rsp
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15]
-; AVX512-NEXT: vmovdqa 224(%rdi), %ymm14
-; AVX512-NEXT: vmovdqa 192(%rdi), %ymm11
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm14[1],ymm11[2,3],ymm14[4],ymm11[5,6],ymm14[7]
+; AVX512-NEXT: subq $136, %rsp
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,12,13,12,13,4,5,8,9,10,11,12,13,14,15]
+; AVX512-NEXT: vmovdqa 224(%rdi), %ymm13
+; AVX512-NEXT: vmovdqa 192(%rdi), %ymm14
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5,6],ymm13[7]
; AVX512-NEXT: vpshufb %xmm5, %xmm2, %xmm1
; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,2,0,3]
; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6,7]
-; AVX512-NEXT: vmovdqa 160(%rdi), %ymm4
-; AVX512-NEXT: vmovdqa (%rdi), %ymm13
-; AVX512-NEXT: vmovdqa 32(%rdi), %ymm10
+; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1],xmm3[2],xmm1[3],xmm3[4,5],xmm1[6,7]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5]
+; AVX512-NEXT: vmovdqa 160(%rdi), %ymm3
+; AVX512-NEXT: vmovdqa (%rdi), %ymm15
+; AVX512-NEXT: vmovdqa 32(%rdi), %ymm11
; AVX512-NEXT: vmovdqa 64(%rdi), %ymm6
; AVX512-NEXT: vmovdqa 128(%rdi), %ymm7
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm4[2],ymm7[3,4],ymm4[5],ymm7[6,7]
-; AVX512-NEXT: vmovdqa64 %ymm7, %ymm24
-; AVX512-NEXT: vmovdqa64 %ymm4, %ymm26
-; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm4
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm1[2,2,2,2,4,5,6,7]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0,1,2],xmm7[3,4],xmm4[5,6,7]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,4,5,4,5,6,7,0,1,12,13,8,9,4,5]
-; AVX512-NEXT: vpshufb %xmm9, %xmm7, %xmm7
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm3[2],ymm7[3,4],ymm3[5],ymm7[6,7]
+; AVX512-NEXT: vmovdqa64 %ymm7, %ymm26
+; AVX512-NEXT: vmovdqa64 %ymm3, %ymm27
+; AVX512-NEXT: vpshufb %xmm8, %xmm1, %xmm7
+; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512-NEXT: vpshufb %xmm8, %xmm3, %xmm9
+; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1,2],xmm7[3],xmm9[4,5],xmm7[6],xmm9[7]
; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512-NEXT: vinserti32x4 $2, %xmm3, %zmm7, %zmm3
-; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vinserti32x4 $2, %xmm4, %zmm7, %zmm4
+; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm6[2,3],mem[2,3]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0],ymm10[1],ymm13[2,3],ymm10[4],ymm13[5,6],ymm10[7]
-; AVX512-NEXT: vmovdqa64 %ymm10, %ymm16
-; AVX512-NEXT: vpshufb %xmm5, %xmm3, %xmm7
-; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm5
-; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[0,2,0,3]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3],xmm8[4,5],xmm7[6,7]
-; AVX512-NEXT: vinserti128 $1, 96(%rdi), %ymm6, %ymm8
-; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0],ymm12[1],ymm8[2,3,4,5],ymm12[6],ymm8[7]
-; AVX512-NEXT: vmovdqa64 %ymm8, %ymm29
+; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0],ymm11[1],ymm15[2,3],ymm11[4],ymm15[5,6],ymm11[7]
+; AVX512-NEXT: vmovdqa64 %ymm11, %ymm16
+; AVX512-NEXT: vpshufb %xmm5, %xmm4, %xmm7
+; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm5
+; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[0,2,0,3]
+; AVX512-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,6,6,7]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm9[2],xmm7[3],xmm9[4,5],xmm7[6,7]
+; AVX512-NEXT: vinserti128 $1, 96(%rdi), %ymm6, %ymm9
+; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0],ymm12[1],ymm9[2,3,4,5],ymm12[6],ymm9[7]
+; AVX512-NEXT: vmovdqa64 %ymm9, %ymm22
; AVX512-NEXT: vmovdqa64 %ymm12, %ymm28
-; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3,4,5,6,7]
-; AVX512-NEXT: vmovdqu64 %zmm7, (%rsp) # 64-byte Spill
-; AVX512-NEXT: vmovdqa 352(%rdi), %ymm8
+; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3,4,5,6,7]
+; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa 352(%rdi), %ymm9
; AVX512-NEXT: vmovdqa 320(%rdi), %ymm10
-; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1],ymm8[2],ymm10[3,4],ymm8[5],ymm10[6,7]
-; AVX512-NEXT: vmovdqa64 %ymm10, %ymm18
-; AVX512-NEXT: vmovdqa64 %ymm8, %ymm20
-; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm8
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm7[2,2,2,2,4,5,6,7]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm8[0,1,2],xmm10[3,4],xmm8[5,6,7]
-; AVX512-NEXT: vpshufb %xmm9, %xmm10, %xmm9
-; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm10
-; AVX512-NEXT: vmovdqa 256(%rdi), %ymm9
-; AVX512-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm9[2,3],mem[2,3]
-; AVX512-NEXT: vinserti128 $1, 288(%rdi), %ymm9, %ymm15
-; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm15[0],ymm12[1],ymm15[2,3,4,5],ymm12[6],ymm15[7]
-; AVX512-NEXT: vmovdqa64 %ymm15, %ymm25
-; AVX512-NEXT: vmovdqa64 %ymm12, %ymm27
-; AVX512-NEXT: vpshufb {{.*#+}} ymm12 = ymm9[0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7]
+; AVX512-NEXT: vmovdqa64 %ymm10, %ymm19
+; AVX512-NEXT: vmovdqa64 %ymm9, %ymm20
+; AVX512-NEXT: vpshufb %xmm8, %xmm7, %xmm10
+; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm9
+; AVX512-NEXT: vpshufb %xmm8, %xmm9, %xmm8
+; AVX512-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm10[3],xmm8[4,5],xmm10[6],xmm8[7]
+; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm10
+; AVX512-NEXT: vmovdqa 256(%rdi), %ymm8
+; AVX512-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm8[2,3],mem[2,3]
+; AVX512-NEXT: vinserti128 $1, 288(%rdi), %ymm8, %ymm12
+; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0],ymm11[1],ymm12[2,3,4,5],ymm11[6],ymm12[7]
+; AVX512-NEXT: vmovdqa64 %ymm12, %ymm24
+; AVX512-NEXT: vmovdqa64 %ymm11, %ymm25
+; AVX512-NEXT: vpshufb {{.*#+}} ymm12 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,28,29,24,25,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0,1,2],ymm10[3,4,5,6,7],ymm12[8,9,10],ymm10[11,12,13,14,15]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,4,6]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5]
; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7]
-; AVX512-NEXT: vmovdqa64 %ymm10, %ymm30
+; AVX512-NEXT: vmovdqa64 %ymm10, %ymm29
; AVX512-NEXT: vmovdqa {{.*#+}} xmm10 = [2,3,14,15,10,11,6,7,2,3,14,15,12,13,14,15]
; AVX512-NEXT: vpshufb %xmm10, %xmm0, %xmm0
; AVX512-NEXT: vpshufb %xmm10, %xmm2, %xmm2
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
-; AVX512-NEXT: vpshufb %xmm2, %xmm4, %xmm4
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,2,3,4,5,6,7,2,3,14,15,12,13,6,7]
+; AVX512-NEXT: vpshufb %xmm2, %xmm3, %xmm3
; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3],xmm4[4,5],xmm1[6],xmm4[7]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7]
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpshufb %xmm10, %xmm5, %xmm0
-; AVX512-NEXT: vpshufb %xmm10, %xmm3, %xmm1
+; AVX512-NEXT: vpshufb %xmm10, %xmm4, %xmm1
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7]
; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm15 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX512-NEXT: vpshufb %xmm2, %xmm8, %xmm0
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
+; AVX512-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512-NEXT: vpshufb %xmm2, %xmm9, %xmm0
; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,5,5,5,5]
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,30,31,26,27,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT: vmovdqa64 %ymm0, %ymm22
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm14[2],ymm11[3,4],ymm14[5],ymm11[6,7]
+; AVX512-NEXT: vmovdqa64 %ymm0, %ymm18
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm13[2],ymm14[3,4],ymm13[5],ymm14[6,7]
+; AVX512-NEXT: vmovdqa64 %ymm13, %ymm30
; AVX512-NEXT: vmovdqa64 %ymm14, %ymm31
-; AVX512-NEXT: vmovdqa64 %ymm11, %ymm21
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[2,1,2,3]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[0,3,2,1]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[0,0,2,3,4,5,6,7]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[0,3,2,1]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm9[0,0,2,3,4,5,6,7]
; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[2,1,2,0,4,5,6,7]
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
-; AVX512-NEXT: vmovdqa64 %ymm24, %ymm1
-; AVX512-NEXT: vmovdqa64 %ymm26, %ymm2
+; AVX512-NEXT: vmovdqa64 %ymm26, %ymm1
+; AVX512-NEXT: vmovdqa64 %ymm27, %ymm2
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[2,1,0,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,0,0,0,4,5,6,7]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[2,1,0,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[0,0,0,0,4,5,6,7]
; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[0,1,2,1]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,6,5,6,4]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,1,2,1]
+; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,6,5,6,4]
; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7]
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm17
; AVX512-NEXT: vmovdqa64 %ymm16, %ymm23
; AVX512-NEXT: vmovdqa64 %ymm16, %ymm0
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2],ymm13[3,4],ymm0[5],ymm13[6,7]
-; AVX512-NEXT: vmovdqa64 %ymm13, %ymm19
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2],ymm15[3,4],ymm0[5],ymm15[6,7]
+; AVX512-NEXT: vmovdqa64 %ymm15, %ymm21
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,1,2,3]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[0,3,2,1]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[0,0,2,3,4,5,6,7]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,3,2,1]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[0,0,2,3,4,5,6,7]
; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3]
; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[2,1,2,0,4,5,6,7]
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
-; AVX512-NEXT: vmovdqa64 %ymm29, %ymm13
-; AVX512-NEXT: vmovdqa64 %ymm28, %ymm12
-; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1],ymm12[2],ymm13[3],ymm12[4],ymm13[5,6],ymm12[7]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25]
+; AVX512-NEXT: vmovdqa64 %ymm22, %ymm13
+; AVX512-NEXT: vmovdqa64 %ymm28, %ymm11
+; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1],ymm11[2],ymm13[3],ymm11[4],ymm13[5,6],ymm11[7]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25]
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-NEXT: vmovdqa64 %ymm18, %ymm28
-; AVX512-NEXT: vmovdqa64 %ymm20, %ymm29
-; AVX512-NEXT: vmovdqa64 %ymm18, %ymm0
-; AVX512-NEXT: vmovdqa64 %ymm20, %ymm1
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
-; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,0,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[0,0,0,0,4,5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-NEXT: vmovdqa64 %ymm20, %ymm15
+; AVX512-NEXT: vmovdqa64 %ymm19, %ymm1
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm1[2],ymm15[3,4],ymm1[5],ymm15[6,7]
+; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,1,0,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[0,0,0,0,4,5,6,7]
; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,2,1]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm14 = xmm3[0,1,2,3,6,5,6,4]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
+; AVX512-NEXT: vpshufhw {{.*#+}} xmm14 = xmm2[0,1,2,3,6,5,6,4]
; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm14[4],xmm1[5,6],xmm14[7]
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm14
-; AVX512-NEXT: vmovdqa64 %ymm25, %ymm0
-; AVX512-NEXT: vmovdqa64 %ymm27, %ymm1
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3],ymm1[4],ymm0[5,6],ymm1[7]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0,1,2],ymm14[3,4,5,6,7],ymm0[8,9,10],ymm14[11,12,13,14,15]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7]
-; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm16
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm17 ^ (zmm0 & (zmm2 ^ zmm17))
+; AVX512-NEXT: vmovdqa64 %ymm24, %ymm1
+; AVX512-NEXT: vmovdqa64 %ymm25, %ymm12
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm12[2],ymm1[3],ymm12[4],ymm1[5,6],ymm12[7]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm12 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,16,17,28,29,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1,2],ymm14[3,4,5,6,7],ymm12[8,9,10],ymm14[11,12,13,14,15]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm1[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7]
+; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm16
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm17 ^ (zmm12 & (zmm0 ^ zmm17))
; AVX512-NEXT: vpmovsxdq {{.*#+}} zmm17 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0]
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm17 & (zmm16 ^ zmm2))
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[3,1,2,1,4,5,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm11[0,1,3,3,4,5,6,7]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,7,7,7,7]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm10[1,2],xmm2[3],xmm10[4,5,6,7]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,5,6,5]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,1,1,1,4,5,6,7]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,7,7]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2,3],xmm8[4],xmm9[5,6],xmm8[7]
-; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512-NEXT: vinserti32x4 $2, %xmm2, %zmm8, %zmm2
-; AVX512-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm30
-; AVX512-NEXT: vinserti64x4 $1, %ymm22, %zmm0, %zmm18
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm17 & (zmm16 ^ zmm0))
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm10[3,1,2,1,4,5,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,3,3,4,5,6,7]
+; AVX512-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,7,7,7]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm9[1,2],xmm0[3],xmm9[4,5,6,7]
+; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,5]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,1,1,1,4,5,6,7]
+; AVX512-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,7,7]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3],xmm7[4],xmm8[5,6],xmm7[7]
+; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm7, %zmm0
+; AVX512-NEXT: vinserti64x4 $1, %ymm29, %zmm0, %zmm22
+; AVX512-NEXT: vinserti64x4 $1, %ymm18, %zmm0, %zmm18
; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,1,4,5,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,3,3,4,5,6,7]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,7,7,7]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1,2],xmm6[3],xmm7[4,5,6,7]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm5[5,6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,5]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,1,1,1,4,5,6,7]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4],xmm4[5,6],xmm3[7]
-; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7],ymm1[8,9,10],ymm3[11,12,13,14,15]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm2 ^ (zmm0 & (zmm5 ^ zmm2))
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7]
+; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1,2],xmm6[3],xmm5[4,5,6,7]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm4[5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,1,1,1,4,5,6,7]
+; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7]
+; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,18,19,30,31,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm0 ^ (zmm12 & (zmm4 ^ zmm0))
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm17 & (zmm20 ^ zmm5))
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
-; AVX512-NEXT: vmovdqa64 %ymm31, %ymm0
-; AVX512-NEXT: vmovdqa64 %ymm21, %ymm1
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm17 & (zmm20 ^ zmm4))
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,2,3,0,1,12,13,8,9,12,13,12,13,14,15]
+; AVX512-NEXT: vmovdqa64 %ymm30, %ymm0
+; AVX512-NEXT: vmovdqa64 %ymm31, %ymm1
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX512-NEXT: vpshufb %xmm7, %xmm0, %xmm2
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[2,2,2,2,4,5,6,7]
; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7]
-; AVX512-NEXT: vmovdqa64 %ymm24, %ymm3
-; AVX512-NEXT: vmovdqa64 %ymm26, %ymm4
+; AVX512-NEXT: vmovdqa64 %ymm26, %ymm3
+; AVX512-NEXT: vmovdqa64 %ymm27, %ymm4
; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7]
; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm4
; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,3,2,1]
; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[0,1,0,2,4,5,6,7]
; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6]
-; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm9 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,6,7,8,9,12,13,0,1,12,13]
; AVX512-NEXT: vpshufb %xmm9, %xmm4, %xmm5
; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm5[4],xmm3[5],xmm5[6,7]
; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512-NEXT: vinserti32x4 $2, %xmm2, %zmm3, %zmm2
-; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0],ymm13[1],ymm12[2,3,4,5],ymm13[6],ymm12[7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0],ymm13[1],ymm11[2,3,4,5],ymm13[6],ymm11[7]
; AVX512-NEXT: vmovdqa64 %ymm23, %ymm3
-; AVX512-NEXT: vmovdqa64 %ymm19, %ymm8
+; AVX512-NEXT: vmovdqa64 %ymm21, %ymm8
; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm8[2],ymm3[3,4],ymm8[5],ymm3[6,7]
; AVX512-NEXT: vpshufb %xmm7, %xmm3, %xmm8
; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm7
; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm7[2,2,2,2,4,5,6,7]
; AVX512-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm10[1],xmm8[2,3],xmm10[4],xmm8[5,6,7]
; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = (ymm8 & ymm11) | ymm10
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm14 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = (ymm8 & ymm14) | ymm10
; AVX512-NEXT: movw $31, %ax
; AVX512-NEXT: kmovw %eax, %k1
; AVX512-NEXT: vmovdqa32 %zmm8, %zmm2 {%k1}
-; AVX512-NEXT: vmovdqa64 %ymm28, %ymm8
-; AVX512-NEXT: vmovdqa64 %ymm29, %ymm10
-; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm8[0],ymm10[1],ymm8[2,3],ymm10[4],ymm8[5,6],ymm10[7]
+; AVX512-NEXT: vmovdqa64 %ymm19, %ymm8
+; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm8[0],ymm15[1],ymm8[2,3],ymm15[4],ymm8[5,6],ymm15[7]
; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm8
; AVX512-NEXT: vpshufb %xmm9, %xmm8, %xmm12
; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm10[0,3,2,1]
@@ -5204,30 +5172,30 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,6,6,6]
; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm12[4],xmm10[5],xmm12[6,7]
; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512-NEXT: vmovdqa64 %ymm25, %ymm12
-; AVX512-NEXT: vmovdqa64 %ymm27, %ymm13
+; AVX512-NEXT: vmovdqa64 %ymm24, %ymm12
+; AVX512-NEXT: vmovdqa64 %ymm25, %ymm13
; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2,3,4,5],ymm12[6],ymm13[7]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm14 = ymm12[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3,4],ymm10[5,6,7]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm15 = ymm12[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3,4],ymm10[5,6,7]
; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm10
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm14 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15]
-; AVX512-NEXT: vpshufb %xmm14, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm15 = [10,11,2,3,2,3,14,15,14,15,10,11,12,13,14,15]
+; AVX512-NEXT: vpshufb %xmm15, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7]
-; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm1 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,10,11,10,11,2,3,14,15]
; AVX512-NEXT: vpshufb %xmm1, %xmm4, %xmm4
; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7]
; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,3]
; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4],xmm6[5],xmm4[6,7]
; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm0
-; AVX512-NEXT: vpshufb %xmm14, %xmm3, %xmm3
+; AVX512-NEXT: vpshufb %xmm15, %xmm3, %xmm3
; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,1,2,3]
; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7]
; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm11) | ymm4
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm14) | ymm4
; AVX512-NEXT: vmovdqa32 %zmm3, %zmm0 {%k1}
; AVX512-NEXT: vpshufb %xmm1, %xmm8, %xmm1
; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[0,1,1,3,4,5,6,7]
@@ -5238,17 +5206,18 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6,7]
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX512-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload
+; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload
; AVX512-NEXT: # zmm4 = mem ^ (zmm3 & (zmm4 ^ mem))
; AVX512-NEXT: movw $-2048, %ax # imm = 0xF800
; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vmovdqa32 %zmm30, %zmm4 {%k1}
+; AVX512-NEXT: vmovdqa32 %zmm22, %zmm4 {%k1}
; AVX512-NEXT: vmovdqa64 %zmm4, (%rsi)
-; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm15 # 64-byte Folded Reload
-; AVX512-NEXT: # zmm15 = mem ^ (zmm3 & (zmm15 ^ mem))
-; AVX512-NEXT: vmovdqa32 %zmm18, %zmm15 {%k1}
-; AVX512-NEXT: vmovdqa64 %zmm15, (%rdx)
+; AVX512-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload
+; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload
+; AVX512-NEXT: # zmm4 = mem ^ (zmm3 & (zmm4 ^ mem))
+; AVX512-NEXT: vmovdqa32 %zmm18, %zmm4 {%k1}
+; AVX512-NEXT: vmovdqa64 %zmm4, (%rdx)
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm17 & (zmm10 ^ zmm2))
; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm17 & (zmm1 ^ zmm0))
@@ -5256,541 +5225,536 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vmovdqa64 %zmm20, (%r8)
; AVX512-NEXT: vmovdqa64 %zmm10, (%r9)
; AVX512-NEXT: vmovdqa64 %zmm1, (%rax)
-; AVX512-NEXT: addq $72, %rsp
+; AVX512-NEXT: addq $136, %rsp
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: load_i16_stride6_vf32:
; AVX512-FCP: # %bb.0:
-; AVX512-FCP-NEXT: subq $136, %rsp
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15]
-; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %ymm13
-; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm15
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0],ymm13[1],ymm15[2,3],ymm13[4],ymm15[5,6],ymm13[7]
-; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm7, %xmm2
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm3
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm3[2,1,0,3]
-; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm11, %xmm3
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7]
-; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm3
-; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm10
-; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm6
-; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm12
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm3[2],ymm12[3,4],ymm3[5],ymm12[6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm26
+; AVX512-FCP-NEXT: subq $200, %rsp
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [6,5,4,7]
+; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %ymm15
+; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm2
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm2[0],ymm15[1],ymm2[2,3],ymm15[4],ymm2[5,6],ymm15[7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm26
+; AVX512-FCP-NEXT: vpermd %ymm14, %ymm6, %ymm1
+; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm2
+; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm14, %xmm3
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0,1],xmm2[2],xmm3[3],xmm2[4,5],xmm3[6,7]
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5]
-; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm5
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm8
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3],xmm8[4,5],xmm5[6],xmm8[7]
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm4, %zmm5, %zmm4
-; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0],ymm10[1],ymm0[2,3],ymm10[4],ymm0[5,6],ymm10[7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm10, %ymm16
-; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm22
-; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm4, %xmm8
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,0,3]
-; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm1
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0,1],xmm1[2],xmm8[3],xmm1[4,5],xmm8[6,7]
-; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm6[2,3],mem[2,3]
-; AVX512-FCP-NEXT: vinserti128 $1, 96(%rdi), %ymm6, %ymm8
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0],ymm0[1],ymm8[2,3,4,5],ymm0[6],ymm8[7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm8, %ymm27
+; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm3
+; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm11
+; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm12
+; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm7
+; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm0
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7]
; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm28
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm8[3,4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa 352(%rdi), %ymm0
-; AVX512-FCP-NEXT: vmovdqa 320(%rdi), %ymm8
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm8, %ymm19
-; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm20
-; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm10
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm8
-; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm8, %xmm9
+; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm29
+; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm8
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm10
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2],xmm8[3],xmm10[4,5],xmm8[6],xmm10[7]
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm5, %zmm8, %zmm0
+; AVX512-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6],ymm12[7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm12, %ymm23
+; AVX512-FCP-NEXT: vmovdqa %ymm11, %ymm12
+; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm5, %xmm10
+; AVX512-FCP-NEXT: vpermd %ymm5, %ymm6, %ymm8
+; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm8, %xmm4
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1],xmm4[2],xmm10[3],xmm4[4,5],xmm10[6,7]
+; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm7[2,3],mem[2,3]
+; AVX512-FCP-NEXT: vinserti128 $1, 96(%rdi), %ymm7, %ymm7
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm11[1],ymm7[2,3,4,5],ymm11[6],ymm7[7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm7, %ymm25
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3,4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa 352(%rdi), %ymm6
+; AVX512-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vmovdqa 320(%rdi), %ymm0
+; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1],ymm6[2],ymm0[3,4],ymm6[5],ymm0[6,7]
+; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm6, %xmm10
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7
+; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm7, %xmm9
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm10[3],xmm9[4,5],xmm10[6],xmm9[7]
; AVX512-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm10
; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %ymm9
-; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm9[2,3],mem[2,3]
-; AVX512-FCP-NEXT: vinserti128 $1, 288(%rdi), %ymm9, %ymm14
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0],ymm0[1],ymm14[2,3,4,5],ymm0[6],ymm14[7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm14, %ymm24
-; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm25
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm9[0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm29
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [2,3,14,15,2,3,6,7,10,11,14,15,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm7, %xmm7
-; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm11, %xmm10
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm10[2],xmm7[3],xmm10[4,5],xmm7[6,7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
-; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm3
+; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm9[2,3],mem[2,3]
+; AVX512-FCP-NEXT: vinserti128 $1, 288(%rdi), %ymm9, %ymm0
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0],ymm13[1],ymm0[2,3,4,5],ymm13[6],ymm0[7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm22
+; AVX512-FCP-NEXT: vmovdqa64 %ymm13, %ymm27
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,28,29,24,25,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm13[0,1,2],ymm10[3,4,5,6,7],ymm13[8,9,10],ymm10[11,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm9[0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm10[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm30
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [2,3,14,15,2,3,6,7,10,11,14,15,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm1
+; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm14, %xmm0
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [6,7,2,3,4,5,6,7,2,3,14,15,12,13,6,7]
+; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm3, %xmm3
; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5],xmm2[6],xmm3[7]
; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm2, %zmm2
-; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm2
-; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm0
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0
+; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm0
+; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm8, %xmm2
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm0
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
+; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm7, %xmm0
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,5,5,5,5]
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,30,31,26,27,u,u,u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm18
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm13[2],ymm15[3,4],ymm13[5],ymm15[6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm13, %ymm30
-; AVX512-FCP-NEXT: vmovdqa64 %ymm15, %ymm31
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[2,1,2,3]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm10[2,1,2,0,4,5,6,7]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[0,3,2,1]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5,6,7]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,7,6,5]
; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm0
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm12[2],ymm0[3,4],ymm12[5],ymm0[6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm12, %ymm21
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,1,0,3]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u]
-; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm3
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,1,2,1]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,6,5,6,4]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7]
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm17
-; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm23
-; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm1
-; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm2
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,1,2,3]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[2,1,2,0,4,5,6,7]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,3,2,1]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4],ymm15[5],ymm0[6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm15, %ymm31
+; AVX512-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm9
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [10,1,10,3]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm9[u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13]
+; AVX512-FCP-NEXT: vpermd 192(%rdi), %zmm2, %zmm8
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm8[2,1,2,0,4,5,6,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1,2],xmm3[3],xmm0[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm19
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm20 = [2,9,8,11]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm16 = [12,5,14,5]
+; AVX512-FCP-NEXT: vpermd %zmm19, %zmm20, %zmm7
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,0,1,0,1,0,1,8,9,8,9,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm7, %xmm4
+; AVX512-FCP-NEXT: vpermd %zmm19, %zmm16, %zmm6
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm5 = xmm6[0,1,2,3,6,5,6,4]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5,6],xmm5[7]
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm3, %zmm4, %zmm17
+; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm3
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1],ymm3[2],ymm12[3,4],ymm3[5],ymm12[6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm12, %ymm24
+; AVX512-FCP-NEXT: vpermd %ymm3, %ymm1, %ymm5
+; AVX512-FCP-NEXT: vpermd (%rdi), %zmm2, %zmm4
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[2,1,2,0,4,5,6,7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3],xmm2[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm14
-; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm12
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1],ymm12[2],ymm14[3],ymm12[4],ymm14[5,6],ymm12[7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm10
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm11[2],ymm10[3],ymm11[4],ymm10[5,6],ymm11[7]
+; AVX512-FCP-NEXT: vmovdqa %ymm11, %ymm15
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm3[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25]
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm27
-; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm15
-; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm1
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm1[2],ymm15[3,4],ymm1[5],ymm15[6,7]
-; AVX512-FCP-NEXT: vextracti32x4 $1, %ymm1, %xmm16
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,1,0,3]
-; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm0
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm16[0,1,2,1]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm13 = xmm2[0,1,2,3,6,5,6,4]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm13[4],xmm0[5,6],xmm13[7]
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm13
-; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm0
-; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm1
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3],ymm1[4],ymm0[5,6],ymm1[7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0,1,2],ymm13[3,4,5,6,7],ymm0[8,9,10],ymm13[11,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm16
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm17 ^ (zmm0 & (zmm11 ^ zmm17))
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm21
+; AVX512-FCP-NEXT: vpermd %zmm21, %zmm20, %zmm2
+; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm0
+; AVX512-FCP-NEXT: vpermd %zmm21, %zmm16, %zmm1
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm12 = xmm1[0,1,2,3,6,5,6,4]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm12[4],xmm0[5,6],xmm12[7]
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm12
+; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm11
+; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm0
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm0[2],ymm11[3],ymm0[4],ymm11[5,6],ymm0[7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,16,17,28,29,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0,1,2],ymm12[3,4,5,6,7],ymm14[8,9,10],ymm12[11,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm0[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm16
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm13 = zmm17 ^ (zmm12 & (zmm13 ^ zmm17))
; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} zmm17 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm17 & (zmm16 ^ zmm11))
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,1,4,5,6,7]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm17 & (zmm16 ^ zmm13))
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1,2],xmm10[3],xmm9[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u]
-; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm8, %xmm8
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,5]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3],xmm7[4],xmm8[5,6],xmm7[7]
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm9, %zmm7, %zmm7
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,1,4,5,6,7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1,2],xmm5[3],xmm4[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm29, %zmm0, %zmm19
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm18, %zmm0, %zmm28
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm6[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
-; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm3
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7]
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm7 ^ (zmm0 & (zmm4 ^ zmm7))
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm17 & (zmm20 ^ zmm4))
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm0
-; AVX512-FCP-NEXT: vmovdqa64 %ymm31, %ymm1
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[3,1,2,1,4,5,6,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2],xmm8[3],xmm9[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [2,3,2,3,2,3,2,3,8,9,10,11,14,15,14,15]
+; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm7, %xmm7
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,5]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4],xmm7[5,6],xmm6[7]
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm6, %zmm6
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,1,4,5,6,7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm4[0],xmm5[1,2],xmm4[3],xmm5[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm20
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm18, %zmm0, %zmm18
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm3[5,6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7]
+; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm2
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,5]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4],xmm2[5,6],xmm1[7]
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,18,19,30,31,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm6 ^ (zmm12 & (zmm3 ^ zmm6))
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm22
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm17 & (zmm22 ^ zmm3))
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [8,9,2,3,0,1,12,13,8,9,12,13,12,13,14,15]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm31, %ymm0
+; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm1
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
-; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm0
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[2,2,2,2,4,5,6,7]
+; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm0
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm6
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[2,2,2,2,4,5,6,7]
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm2
-; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm3
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm6
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,3,2,1]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13]
-; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm6, %xmm2
-; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm7
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0,1,2,3],xmm2[4],xmm7[5],xmm2[6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm2
+; AVX512-FCP-NEXT: vmovdqa64 %ymm29, %ymm3
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm7
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm12 = [0,3,2,9]
+; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm7, %xmm2
+; AVX512-FCP-NEXT: vpermd %zmm19, %zmm12, %zmm3
+; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm9
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm9[0,1,2,3],xmm2[4],xmm9[5],xmm2[6,7]
; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm2
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm7
-; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm9
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm9[2],ymm7[3,4],ymm9[5],ymm7[6,7]
-; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm7, %xmm5
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm9
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm9[2,2,2,2,4,5,6,7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm10[1],xmm5[2,3],xmm10[4],xmm5[5,6,7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = (ymm5 & ymm11) | ymm10
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm10[1],ymm15[2,3,4,5],ymm10[6],ymm15[7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm4
+; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm9
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1],ymm4[2],ymm9[3,4],ymm4[5],ymm9[6,7]
+; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm9, %xmm8
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm13
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm14 = xmm13[2,2,2,2,4,5,6,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm14[1],xmm8[2,3],xmm14[4],xmm8[5,6,7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = (ymm8 & ymm15) | ymm14
; AVX512-FCP-NEXT: movw $31, %ax
; AVX512-FCP-NEXT: kmovw %eax, %k1
-; AVX512-FCP-NEXT: vmovdqa32 %zmm5, %zmm2 {%k1}
+; AVX512-FCP-NEXT: vmovdqa32 %zmm8, %zmm2 {%k1}
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX512-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm8 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: # ymm8 = mem[0],ymm4[1],mem[2,3],ymm4[4],mem[5,6],ymm4[7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8
+; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm8, %xmm4
+; AVX512-FCP-NEXT: vpermd %zmm21, %zmm12, %zmm14
+; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm14, %xmm5
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4],xmm5[5],xmm4[6,7]
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm5
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0],ymm15[1],ymm5[2,3],ymm15[4],ymm5[5,6],ymm15[7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm5
-; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm13
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,3,2,1]
-; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm10, %xmm8
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm13[4],xmm8[5],xmm13[6,7]
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm12
-; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm13
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0],ymm12[1],ymm13[2,3,4,5],ymm12[6],ymm13[7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm13[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1,2,3,4],ymm8[5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm8
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm9, %xmm9
-; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm7, %xmm7
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm9[1],xmm7[2,3],xmm9[4],xmm7[5,6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm11[1],ymm5[2,3,4,5],ymm11[6],ymm5[7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm5[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3,4],ymm4[5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm13, %xmm13
+; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm9, %xmm9
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm13[1],xmm9[2,3],xmm13[4],xmm9[5,6,7]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm7 & ymm11)
-; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm4, %xmm4
-; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm1, %xmm1
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3],xmm4[4],xmm1[5,6,7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15]
-; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm6
-; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4],xmm3[5],xmm6[6,7]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm9 & ymm15)
+; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm6, %xmm6
+; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm1
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3],xmm6[4],xmm1[5,6,7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15]
+; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm7
+; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm7[4],xmm3[5],xmm7[6,7]
; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm1
; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
-; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm5, %xmm0
-; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm10, %xmm3
+; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm8, %xmm0
+; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm14, %xmm3
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4],xmm3[5],xmm0[6,7]
; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm13[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7]
; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload
-; AVX512-FCP-NEXT: # zmm4 = mem ^ (zmm3 & (zmm4 ^ mem))
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512-FCP-NEXT: vpternlogq $226, (%rsp), %zmm3, %zmm5 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: # zmm5 = mem ^ (zmm3 & (zmm5 ^ mem))
; AVX512-FCP-NEXT: movw $-2048, %ax # imm = 0xF800
; AVX512-FCP-NEXT: kmovw %eax, %k1
-; AVX512-FCP-NEXT: vmovdqa32 %zmm19, %zmm4 {%k1}
-; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rsi)
-; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload
-; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload
-; AVX512-FCP-NEXT: # zmm4 = mem ^ (zmm3 & (zmm4 ^ mem))
-; AVX512-FCP-NEXT: vmovdqa32 %zmm28, %zmm4 {%k1}
-; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rdx)
+; AVX512-FCP-NEXT: vmovdqa32 %zmm20, %zmm5 {%k1}
+; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rsi)
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm5 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: # zmm5 = mem ^ (zmm3 & (zmm5 ^ mem))
+; AVX512-FCP-NEXT: vmovdqa32 %zmm18, %zmm5 {%k1}
+; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rdx)
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm17 & (zmm8 ^ zmm2))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm17 & (zmm4 ^ zmm2))
; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm17 & (zmm0 ^ zmm1))
; AVX512-FCP-NEXT: vmovdqa64 %zmm16, (%rcx)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm20, (%r8)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm8, (%r9)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm22, (%r8)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%r9)
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
-; AVX512-FCP-NEXT: addq $136, %rsp
+; AVX512-FCP-NEXT: addq $200, %rsp
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: load_i16_stride6_vf32:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: pushq %rax
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15]
-; AVX512DQ-NEXT: vmovdqa 224(%rdi), %ymm13
-; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm2
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0],ymm13[1],ymm2[2,3],ymm13[4],ymm2[5,6],ymm13[7]
-; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm25
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,12,13,12,13,4,5,8,9,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vmovdqa 224(%rdi), %ymm12
+; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm13
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0],ymm12[1],ymm13[2,3],ymm12[4],ymm13[5,6],ymm12[7]
; AVX512DQ-NEXT: vpshufb %xmm0, %xmm3, %xmm1
-; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm9
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[0,2,0,3]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm7
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[0,2,0,3]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7]
-; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm2
-; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm5
-; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm6
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5]
+; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm1
+; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm9
+; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm10
; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm4
-; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm7
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm14 = ymm7[0,1],ymm2[2],ymm7[3,4],ymm2[5],ymm7[6,7]
-; AVX512DQ-NEXT: vmovdqa64 %ymm7, %ymm20
-; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm22
-; AVX512DQ-NEXT: vextracti128 $1, %ymm14, %xmm15
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[2,2,2,2,4,5,6,7]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm15[0,1,2],xmm2[3,4],xmm15[5,6,7]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,4,5,4,5,6,7,0,1,12,13,8,9,4,5]
-; AVX512DQ-NEXT: vpshufb %xmm7, %xmm2, %xmm2
-; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm16
-; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm4[2,3],mem[2,3]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5,6],ymm6[7]
-; AVX512DQ-NEXT: vmovdqa64 %ymm6, %ymm18
-; AVX512DQ-NEXT: vmovdqa64 %ymm5, %ymm19
-; AVX512DQ-NEXT: vpshufb %xmm0, %xmm2, %xmm0
-; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm6
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[0,2,0,3]
+; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm5
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm14 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7]
+; AVX512DQ-NEXT: vmovdqa64 %ymm5, %ymm22
+; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm24
+; AVX512DQ-NEXT: vpshufb %xmm6, %xmm14, %xmm5
+; AVX512DQ-NEXT: vextracti128 $1, %ymm14, %xmm11
+; AVX512DQ-NEXT: vpshufb %xmm6, %xmm11, %xmm8
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3],xmm8[4,5],xmm5[6],xmm8[7]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512DQ-NEXT: vinserti32x4 $2, %xmm2, %zmm5, %zmm16
+; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm4[2,3],mem[2,3]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm15 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5,6],ymm10[7]
+; AVX512DQ-NEXT: vmovdqa64 %ymm10, %ymm17
+; AVX512DQ-NEXT: vmovdqa64 %ymm9, %ymm19
+; AVX512DQ-NEXT: vpshufb %xmm0, %xmm15, %xmm0
+; AVX512DQ-NEXT: vextracti128 $1, %ymm15, %xmm2
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,2,0,3]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7]
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2],xmm0[3],xmm5[4,5],xmm0[6,7]
-; AVX512DQ-NEXT: vinserti128 $1, 96(%rdi), %ymm4, %ymm12
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0],ymm1[1],ymm12[2,3,4,5],ymm1[6],ymm12[7]
-; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm28
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1,2],ymm5[3,4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa 352(%rdi), %ymm0
-; AVX512DQ-NEXT: vmovdqa 320(%rdi), %ymm5
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm0[2],ymm5[3,4],ymm0[5],ymm5[6,7]
+; AVX512DQ-NEXT: vinserti128 $1, 96(%rdi), %ymm4, %ymm1
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0],ymm8[1],ymm1[2,3,4,5],ymm8[6],ymm1[7]
+; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm26
+; AVX512DQ-NEXT: vmovdqa64 %ymm8, %ymm27
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2],ymm5[3,4,5,6,7]
+; AVX512DQ-NEXT: vmovdqa 352(%rdi), %ymm5
+; AVX512DQ-NEXT: vmovdqa 320(%rdi), %ymm8
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1],ymm5[2],ymm8[3,4],ymm5[5],ymm8[6,7]
+; AVX512DQ-NEXT: vmovdqa64 %ymm8, %ymm20
; AVX512DQ-NEXT: vmovdqa64 %ymm5, %ymm21
+; AVX512DQ-NEXT: vpshufb %xmm6, %xmm4, %xmm10
+; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm5
+; AVX512DQ-NEXT: vpshufb %xmm6, %xmm5, %xmm6
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm10[3],xmm6[4,5],xmm10[6],xmm6[7]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm8
+; AVX512DQ-NEXT: vmovdqa 256(%rdi), %ymm6
+; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm6[2,3],mem[2,3]
+; AVX512DQ-NEXT: vinserti128 $1, 288(%rdi), %ymm6, %ymm0
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0],ymm10[1],ymm0[2,3,4,5],ymm10[6],ymm0[7]
; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm23
-; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm5
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm11 = xmm1[2,2,2,2,4,5,6,7]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm5[0,1,2],xmm11[3,4],xmm5[5,6,7]
-; AVX512DQ-NEXT: vpshufb %xmm7, %xmm11, %xmm7
-; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm8
-; AVX512DQ-NEXT: vmovdqa 256(%rdi), %ymm7
-; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm7[2,3],mem[2,3]
-; AVX512DQ-NEXT: vinserti128 $1, 288(%rdi), %ymm7, %ymm11
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0],ymm0[1],ymm11[2,3,4,5],ymm0[6],ymm11[7]
-; AVX512DQ-NEXT: vmovdqa64 %ymm11, %ymm24
-; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm26
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm7[0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0,1,2],ymm8[3,4,5,6,7],ymm0[8,9,10],ymm8[11,12,13,14,15]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = zmm16 ^ (zmm17 & (zmm10 ^ zmm16))
+; AVX512DQ-NEXT: vmovdqa64 %ymm10, %ymm25
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,28,29,24,25,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3,4,5,6,7],ymm10[8,9,10],ymm8[11,12,13,14,15]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm6[0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm16 ^ (zmm18 & (zmm1 ^ zmm16))
; AVX512DQ-NEXT: movw $-2048, %ax # imm = 0xF800
; AVX512DQ-NEXT: kmovw %eax, %k1
-; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm10 {%k1}
-; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [2,3,14,15,10,11,6,7,2,3,14,15,12,13,14,15]
-; AVX512DQ-NEXT: vpshufb %xmm0, %xmm9, %xmm8
-; AVX512DQ-NEXT: vpshufb %xmm0, %xmm3, %xmm3
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm8[2],xmm3[3],xmm8[4,5],xmm3[6,7]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm9 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
-; AVX512DQ-NEXT: vpshufb %xmm9, %xmm15, %xmm8
+; AVX512DQ-NEXT: vinserti32x8 $1, %ymm8, %zmm0, %zmm1 {%k1}
+; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm8 = [2,3,14,15,10,11,6,7,2,3,14,15,12,13,14,15]
+; AVX512DQ-NEXT: vpshufb %xmm8, %xmm7, %xmm7
+; AVX512DQ-NEXT: vpshufb %xmm8, %xmm3, %xmm3
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm7[2],xmm3[3],xmm7[4,5],xmm3[6,7]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm7 = [6,7,2,3,4,5,6,7,2,3,14,15,12,13,6,7]
+; AVX512DQ-NEXT: vpshufb %xmm7, %xmm11, %xmm1
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,5,5,5]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm14[3],xmm8[4,5],xmm14[6],xmm8[7]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512DQ-NEXT: vinserti32x4 $2, %xmm3, %zmm8, %zmm3
-; AVX512DQ-NEXT: vpshufb %xmm0, %xmm6, %xmm6
-; AVX512DQ-NEXT: vpshufb %xmm0, %xmm2, %xmm0
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2],xmm0[3],xmm6[4,5],xmm0[6,7]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7]
-; AVX512DQ-NEXT: vpshufb %xmm9, %xmm5, %xmm0
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm3 ^ (zmm17 & (zmm2 ^ zmm3))
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm14[3],xmm1[4,5],xmm14[6],xmm1[7]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-NEXT: vinserti32x4 $2, %xmm3, %zmm1, %zmm1
+; AVX512DQ-NEXT: vpshufb %xmm8, %xmm2, %xmm2
+; AVX512DQ-NEXT: vpshufb %xmm8, %xmm15, %xmm3
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3],xmm2[4,5],xmm3[6,7]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7]
+; AVX512DQ-NEXT: vpshufb %xmm7, %xmm5, %xmm2
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,5,5,5,5]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3],xmm2[4,5],xmm0[6],xmm2[7]
; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,30,31,26,27,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm1 ^ (zmm18 & (zmm3 ^ zmm1))
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm2 {%k1}
-; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm0
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2],ymm0[3,4],ymm13[5],ymm0[6,7]
+; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm3 {%k1}
+; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7]
+; AVX512DQ-NEXT: vmovdqa64 %ymm12, %ymm28
; AVX512DQ-NEXT: vmovdqa64 %ymm13, %ymm29
-; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm30
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[2,1,2,3]
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,3,2,1]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[0,0,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[0,3,2,1]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[0,0,2,3,4,5,6,7]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[2,1,2,0,4,5,6,7]
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm1
-; AVX512DQ-NEXT: vmovdqa64 %ymm22, %ymm2
+; AVX512DQ-NEXT: vmovdqa64 %ymm22, %ymm1
+; AVX512DQ-NEXT: vmovdqa64 %ymm24, %ymm2
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[2,1,0,3]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[0,0,0,0,4,5,6,7]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,1,2,1]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,6,5,6,4]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,1,2,1]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,6,5,6,4]
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7]
; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm16
-; AVX512DQ-NEXT: vmovdqa64 %ymm18, %ymm25
-; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm13
-; AVX512DQ-NEXT: vmovdqa64 %ymm18, %ymm0
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2],ymm13[3,4],ymm0[5],ymm13[6,7]
+; AVX512DQ-NEXT: vmovdqa64 %ymm17, %ymm0
+; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm1
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
+; AVX512DQ-NEXT: vmovdqa64 %ymm17, %ymm30
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,1,2,3]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,1,2,3]
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,3,2,1]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[0,0,2,3,4,5,6,7]
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[2,1,2,0,4,5,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[2,1,2,0,4,5,6,7]
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa64 %ymm28, %ymm10
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1],ymm10[2],ymm12[3],ymm10[4],ymm12[5,6],ymm10[7]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25]
+; AVX512DQ-NEXT: vmovdqa64 %ymm26, %ymm9
+; AVX512DQ-NEXT: vmovdqa64 %ymm27, %ymm8
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1],ymm8[2],ymm9[3],ymm8[4],ymm9[5,6],ymm8[7]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25]
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm0
-; AVX512DQ-NEXT: vmovdqa64 %ymm23, %ymm1
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm13
+; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm0
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2],ymm13[3,4],ymm0[5],ymm13[6,7]
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,0,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[0,0,0,0,4,5,6,7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,1,0,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[0,0,0,0,4,5,6,7]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm11 = xmm1[0,1,2,3,6,5,6,4]
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm11[4],xmm0[5,6],xmm11[7]
; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm11
-; AVX512DQ-NEXT: vmovdqa64 %ymm26, %ymm0
-; AVX512DQ-NEXT: vmovdqa64 %ymm24, %ymm2
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm0[2],ymm2[3],ymm0[4],ymm2[5,6],ymm0[7]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm2[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm0[0,1,2],ymm11[3,4,5,6,7],ymm0[8,9,10],ymm11[11,12,13,14,15]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm18
-; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = zmm16 ^ (zmm0 & (zmm9 ^ zmm16))
+; AVX512DQ-NEXT: vmovdqa64 %ymm23, %ymm0
+; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm12
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm12[2],ymm0[3],ymm12[4],ymm0[5,6],ymm12[7]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm12 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,16,17,28,29,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7],ymm12[8,9,10],ymm11[11,12,13,14,15]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm18
+; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = zmm16 ^ (zmm11 & (zmm10 ^ zmm16))
; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} zmm17 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm18 = zmm18 ^ (zmm17 & (zmm18 ^ zmm9))
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm15[3,1,2,1,4,5,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,3,3,4,5,6,7]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,7,7,7]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1,2],xmm9[3],xmm8[4,5,6,7]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,5]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm14[1,1,1,1,4,5,6,7]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,7,7]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1,2,3],xmm7[4],xmm9[5,6],xmm7[7]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512DQ-NEXT: vinserti32x4 $2, %xmm8, %zmm7, %zmm7
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,1,4,5,6,7]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm18 = zmm18 ^ (zmm17 & (zmm18 ^ zmm10))
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm10 = xmm15[3,1,2,1,4,5,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,3,3,4,5,6,7]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,7,7,7]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm10[0],xmm7[1,2],xmm10[3],xmm7[4,5,6,7]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,5]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm10 = xmm14[1,1,1,1,4,5,6,7]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,7,7]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1,2,3],xmm6[4],xmm10[5,6],xmm6[7]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512DQ-NEXT: vinserti32x4 $2, %xmm7, %zmm6, %zmm6
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,1,4,5,6,7]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1,2],xmm6[3],xmm5[4,5,6,7]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm4[5,6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2],xmm4[3],xmm5[4,5,6,7]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,5]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,1,1,1,4,5,6,7]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4],xmm3[5,6],xmm1[7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,1,1,1,4,5,6,7]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4],xmm2[5,6],xmm1[7]
; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,18,19,30,31,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,4,5]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm7 ^ (zmm0 & (zmm4 ^ zmm7))
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm16
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm17 & (zmm16 ^ zmm4))
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
-; AVX512DQ-NEXT: vmovdqa64 %ymm29, %ymm0
-; AVX512DQ-NEXT: vmovdqa64 %ymm30, %ymm1
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm6 ^ (zmm11 & (zmm3 ^ zmm6))
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm16
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm17 & (zmm16 ^ zmm3))
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,2,3,0,1,12,13,8,9,12,13,12,13,14,15]
+; AVX512DQ-NEXT: vmovdqa64 %ymm28, %ymm0
+; AVX512DQ-NEXT: vmovdqa64 %ymm29, %ymm1
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
; AVX512DQ-NEXT: vpshufb %xmm6, %xmm0, %xmm2
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[2,2,2,2,4,5,6,7]
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7]
-; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm2
-; AVX512DQ-NEXT: vmovdqa64 %ymm22, %ymm4
+; AVX512DQ-NEXT: vmovdqa64 %ymm22, %ymm2
+; AVX512DQ-NEXT: vmovdqa64 %ymm24, %ymm4
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7]
; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm2
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm4[0,3,2,1]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[0,1,0,2,4,5,6,7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm11 = xmm4[0,3,2,1]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[0,1,0,2,4,5,6,7]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,6,6,6]
-; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} xmm7 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13]
-; AVX512DQ-NEXT: vpshufb %xmm7, %xmm2, %xmm5
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,4,5,6,7,8,9,12,13,0,1,12,13]
+; AVX512DQ-NEXT: vpshufb %xmm10, %xmm2, %xmm5
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6,7]
; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
; AVX512DQ-NEXT: vinserti32x4 $2, %xmm3, %zmm4, %zmm4
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0],ymm12[1],ymm10[2,3,4,5],ymm12[6],ymm10[7]
-; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm5
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm13[2],ymm5[3,4],ymm13[5],ymm5[6,7]
-; AVX512DQ-NEXT: vpshufb %xmm6, %xmm5, %xmm6
-; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm13
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm13[2,2,2,2,4,5,6,7]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2,3],xmm8[4],xmm6[5,6,7]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0],ymm9[1],ymm8[2,3,4,5],ymm9[6],ymm8[7]
+; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm5
+; AVX512DQ-NEXT: vmovdqa64 %ymm30, %ymm7
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7]
+; AVX512DQ-NEXT: vpshufb %xmm6, %xmm5, %xmm7
+; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm12 = xmm6[2,2,2,2,4,5,6,7]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm12[1],xmm7[2,3],xmm12[4],xmm7[5,6,7]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm14 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = (ymm6 & ymm14) | ymm8
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm7 = (ymm7 & ymm14) | ymm12
; AVX512DQ-NEXT: movw $31, %ax
; AVX512DQ-NEXT: kmovw %eax, %k1
-; AVX512DQ-NEXT: vinserti32x8 $0, %ymm6, %zmm0, %zmm4 {%k1}
-; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm6
-; AVX512DQ-NEXT: vmovdqa64 %ymm23, %ymm8
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0],ymm8[1],ymm6[2,3],ymm8[4],ymm6[5,6],ymm8[7]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm6
-; AVX512DQ-NEXT: vpshufb %xmm7, %xmm6, %xmm11
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm8[0,3,2,1]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[0,1,0,2,4,5,6,7]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,6,6,6]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm11[4],xmm8[5],xmm11[6,7]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512DQ-NEXT: vmovdqa64 %ymm26, %ymm11
-; AVX512DQ-NEXT: vmovdqa64 %ymm24, %ymm10
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0],ymm10[1],ymm11[2,3,4,5],ymm10[6],ymm11[7]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm15 = ymm11[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1,2,3,4],ymm8[5,6,7]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm8
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm15 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vinserti32x8 $0, %ymm7, %zmm0, %zmm4 {%k1}
+; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm7
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm7[0],ymm13[1],ymm7[2,3],ymm13[4],ymm7[5,6],ymm13[7]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm12, %xmm7
+; AVX512DQ-NEXT: vpshufb %xmm10, %xmm7, %xmm10
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[0,3,2,1]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm12 = xmm13[0,1,0,2,4,5,6,7]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,6,6,6,6]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2,3],xmm10[4],xmm12[5],xmm10[6,7]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm12
+; AVX512DQ-NEXT: vmovdqa64 %ymm23, %ymm9
+; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm10
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm9[1],ymm10[2,3,4,5],ymm9[6],ymm10[7]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm15 = ymm10[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3,4],ymm12[5,6,7]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm12
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm15 = [10,11,2,3,2,3,14,15,14,15,10,11,12,13,14,15]
; AVX512DQ-NEXT: vpshufb %xmm15, %xmm0, %xmm0
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7]
-; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,10,11,10,11,2,3,14,15]
; AVX512DQ-NEXT: vpshufb %xmm1, %xmm2, %xmm2
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,3,3]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm9[0,1,2,3],xmm2[4],xmm9[5],xmm2[6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,1,3,4,5,6,7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,3]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm11[0,1,2,3],xmm2[4],xmm11[5],xmm2[6,7]
; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0
; AVX512DQ-NEXT: vpshufb %xmm15, %xmm5, %xmm2
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm13[1,1,2,3]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[1,1,2,3]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5]
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3],xmm5[4],xmm2[5,6,7]
; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm14) | ymm3
; AVX512DQ-NEXT: vinserti32x8 $0, %ymm2, %zmm0, %zmm0 {%k1}
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm6, %xmm1
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[0,1,1,3,4,5,6,7]
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm7, %xmm1
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[0,1,1,3,4,5,6,7]
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,3]
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4],xmm2[5],xmm1[6,7]
; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm11[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm10[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1
; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
@@ -5798,11 +5762,11 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512DQ-NEXT: vmovaps %zmm2, (%rdx)
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm17 & (zmm8 ^ zmm4))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (zmm17 & (zmm12 ^ zmm4))
; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm17 & (zmm1 ^ zmm0))
; AVX512DQ-NEXT: vmovdqa64 %zmm18, (%rcx)
; AVX512DQ-NEXT: vmovdqa64 %zmm16, (%r8)
-; AVX512DQ-NEXT: vmovdqa64 %zmm8, (%r9)
+; AVX512DQ-NEXT: vmovdqa64 %zmm12, (%r9)
; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rax)
; AVX512DQ-NEXT: popq %rax
; AVX512DQ-NEXT: vzeroupper
@@ -5810,251 +5774,248 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
;
; AVX512DQ-FCP-LABEL: load_i16_stride6_vf32:
; AVX512DQ-FCP: # %bb.0:
-; AVX512DQ-FCP-NEXT: pushq %rax
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %ymm13
-; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm2
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0],ymm13[1],ymm2[2,3],ymm13[4],ymm2[5,6],ymm13[7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm24
-; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm1
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm2
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[2,1,0,3]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm9, %xmm2
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm4
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm2
+; AVX512DQ-FCP-NEXT: subq $40, %rsp
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [6,5,4,7]
+; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %ymm12
+; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm13
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0],ymm12[1],ymm13[2,3],ymm12[4],ymm13[5,6],ymm12[7]
+; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm3, %ymm7
+; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm7, %xmm0
+; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm2
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5]
+; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm6
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm9
; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm10
-; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm3
-; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm6
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm22
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm25
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm14, %xmm4
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm14, %xmm7
-; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm8
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2],xmm4[3],xmm8[4,5],xmm4[6],xmm8[7]
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm4, %zmm16
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm2[0],ymm10[1],ymm2[2,3],ymm10[4],ymm2[5,6],ymm10[7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm10, %ymm18
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm19
-; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm15, %xmm4
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm15, %xmm1
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm0
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3],xmm0[4,5],xmm4[6,7]
-; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm3[2,3],mem[2,3]
-; AVX512DQ-FCP-NEXT: vinserti128 $1, 96(%rdi), %ymm3, %ymm12
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0],ymm1[1],ymm12[2,3,4,5],ymm1[6],ymm12[7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm28
+; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm4
+; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm8
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm8[0,1],ymm6[2],ymm8[3,4],ymm6[5],ymm8[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm8, %ymm25
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm26
+; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm14, %xmm6
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15
+; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm15, %xmm8
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3],xmm8[4,5],xmm6[6],xmm8[7]
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm6, %zmm16
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5,6],ymm10[7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm10, %ymm21
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm9, %ymm17
+; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm9
+; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm6
+; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm6, %xmm1
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0,1],xmm1[2],xmm9[3],xmm1[4,5],xmm9[6,7]
+; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm4[2,3],mem[2,3]
+; AVX512DQ-FCP-NEXT: vinserti128 $1, 96(%rdi), %ymm4, %ymm4
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm8[1],ymm4[2,3,4,5],ymm8[6],ymm4[7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm27
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm8, %ymm28
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1,2],ymm4[3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa 352(%rdi), %ymm0
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2],ymm4[3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa 352(%rdi), %ymm4
; AVX512DQ-FCP-NEXT: vmovdqa 320(%rdi), %ymm1
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm20
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm21
-; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm4, %xmm11
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm1
-; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm6
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm11[3],xmm6[4,5],xmm11[6],xmm6[7]
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm8
-; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %ymm6
-; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm6[2,3],mem[2,3]
-; AVX512DQ-FCP-NEXT: vinserti128 $1, 288(%rdi), %ymm6, %ymm11
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0],ymm0[1],ymm11[2,3,4,5],ymm0[6],ymm11[7]
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm22
+; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm10
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4
+; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm0
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm10[3],xmm0[4,5],xmm10[6],xmm0[7]
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm8
+; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %ymm0
+; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm0[2,3],mem[2,3]
+; AVX512DQ-FCP-NEXT: vinserti128 $1, 288(%rdi), %ymm0, %ymm11
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm10[1],ymm11[2,3,4,5],ymm10[6],ymm11[7]
; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm11, %ymm23
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm26
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0,1,2],ymm8[3,4,5,6,7],ymm0[8,9,10],ymm8[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm16 ^ (zmm17 & (zmm10 ^ zmm16))
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm10, %ymm24
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,28,29,24,25,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0,1,2],ymm8[3,4,5,6,7],ymm11[8,9,10],ymm8[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm16 ^ (zmm11 & (zmm9 ^ zmm16))
; AVX512DQ-FCP-NEXT: movw $-2048, %ax # imm = 0xF800
; AVX512DQ-FCP-NEXT: kmovw %eax, %k1
-; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm10 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [2,3,14,15,2,3,6,7,10,11,14,15,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm5
-; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm9, %xmm8
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm8[2],xmm5[3],xmm8[4,5],xmm5[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm7, %xmm7
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm14[0,1,2,3,5,5,5,5]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3],xmm7[4,5],xmm8[6],xmm7[7]
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm5, %zmm7, %zmm5
-; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm15, %xmm7
-; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm0
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm0[2],xmm7[3],xmm0[4,5],xmm7[6,7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm0
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,5,5,5,5]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm5 ^ (zmm17 & (zmm2 ^ zmm5))
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm2 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm0
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2],ymm0[3,4],ymm13[5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm13, %ymm29
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm30
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[2,1,2,3]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[2,1,2,0,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,3,2,1]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm8[u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm2
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[2,1,0,3]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm14, %xmm3
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,1,2,1]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,6,5,6,4]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7]
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm16
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm24
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm13
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm1
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm1[2],ymm13[3,4],ymm1[5],ymm13[6,7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,1,2,3]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[2,1,2,0,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,3,2,1]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3],xmm2[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm10
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1],ymm10[2],ymm12[3],ymm10[4],ymm12[5,6],ymm10[7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm3[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm6[5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm1
+; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm8, %zmm0, %zmm9 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [2,3,14,15,2,3,6,7,10,11,14,15,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm7
+; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm5
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2],xmm5[3],xmm7[4,5],xmm5[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [6,7,2,3,4,5,6,7,2,3,14,15,12,13,6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm15, %xmm15
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,5,5,5]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3],xmm15[4,5],xmm14[6],xmm15[7]
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm5, %zmm14, %zmm5
+; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm6, %xmm6
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm6[2],xmm2[3],xmm6[4,5],xmm2[6,7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm4, %xmm2
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7]
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,30,31,26,27,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm5 ^ (zmm11 & (zmm3 ^ zmm5))
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm3 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,7,6,5]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm12, %ymm30
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm13, %ymm31
+; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm15
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm15[u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [10,1,10,3]
+; AVX512DQ-FCP-NEXT: vpermd 192(%rdi), %zmm3, %zmm6
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[2,1,2,0,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm16
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm18 = [2,9,8,11]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm19 = [12,5,14,5]
+; AVX512DQ-FCP-NEXT: vpermd %zmm16, %zmm18, %zmm14
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,0,1,0,1,0,1,8,9,8,9,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm14, %xmm4
+; AVX512DQ-FCP-NEXT: vpermd %zmm16, %zmm19, %zmm5
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm10 = xmm5[0,1,2,3,6,5,6,4]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm10[4],xmm4[5,6],xmm10[7]
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm4, %zmm10
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm29
; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm2
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
-; AVX512DQ-FCP-NEXT: vextracti32x4 $1, %ymm1, %xmm17
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[2,1,0,3]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm6, %xmm0
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm17[0,1,2,1]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm11 = xmm2[0,1,2,3,6,5,6,4]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm11[4],xmm0[5,6],xmm11[7]
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm11
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm1
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm0[0,1,2],ymm11[3,4,5,6,7],ymm0[8,9,10],ymm11[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm4
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7]
+; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm2
+; AVX512DQ-FCP-NEXT: vpermd (%rdi), %zmm3, %zmm3
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[2,1,2,0,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,0,1,4,5,u,u,12,13,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2],xmm1[3],xmm4[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm9
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm8
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1],ymm8[2],ymm9[3],ymm8[4],ymm9[5,6],ymm8[7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm7[5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm17
+; AVX512DQ-FCP-NEXT: vpermd %zmm17, %zmm18, %zmm1
+; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX512DQ-FCP-NEXT: vpermd %zmm17, %zmm19, %zmm7
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm12 = xmm7[0,1,2,3,6,5,6,4]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm12[4],xmm0[5,6],xmm12[7]
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm12
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm13
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2],ymm0[3],ymm13[4],ymm0[5,6],ymm13[7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,16,17,28,29,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3,4,5,6,7],ymm13[8,9,10],ymm12[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm0[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm20
+; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm10 ^ (zmm12 & (zmm11 ^ zmm10))
+; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} zmm19 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm19 & (zmm20 ^ zmm11))
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm15[u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,1,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm10[1,2],xmm6[3],xmm10[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [2,3,2,3,2,3,2,3,8,9,10,11,14,15,14,15]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm14, %xmm11
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,5,6,5]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm11[0,1,2,3],xmm5[4],xmm11[5,6],xmm5[7]
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm5
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,1,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3],xmm2[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,7,5,6,5]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5,6],xmm3[7]
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,18,19,30,31,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm5 ^ (zmm12 & (zmm2 ^ zmm5))
; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm18
-; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm16 ^ (zmm0 & (zmm9 ^ zmm16))
-; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} zmm17 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm18 = zmm18 ^ (zmm17 & (zmm18 ^ zmm9))
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm9 = xmm15[3,1,2,1,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1,2],xmm9[3],xmm8[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm14, %xmm11
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,5]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm11[0,1,2,3],xmm7[4],xmm11[5,6],xmm7[7]
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm7, %zmm7
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,1,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,2,3,6,7,u,u,14,15,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1,2],xmm5[3],xmm4[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm3[5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm6, %xmm4
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4],xmm4[5,6],xmm2[7]
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm7 ^ (zmm0 & (zmm3 ^ zmm7))
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm16
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm17 & (zmm16 ^ zmm3))
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm1
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm18 = zmm18 ^ (zmm19 & (zmm18 ^ zmm2))
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,2,3,0,1,12,13,8,9,12,13,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm1
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm0
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm1
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[2,2,2,2,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm4
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,3,2,1]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm4, %xmm2
-; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm8
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0,1,2,3],xmm2[4],xmm8[5],xmm2[6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm0
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[2,2,2,2,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm2
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm7
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [0,3,2,9]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm7, %xmm2
+; AVX512DQ-FCP-NEXT: vpermd %zmm16, %zmm10, %zmm1
+; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm11
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm11[0,1,2,3],xmm2[4],xmm11[5],xmm2[6,7]
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm12[1],ymm10[2,3,4,5],ymm12[6],ymm10[7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm8
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm13[2],ymm8[3,4],ymm13[5],ymm8[6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm8, %xmm5
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm9[2,2,2,2,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm11[1],xmm5[2,3],xmm11[4],xmm5[5,6,7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = (ymm5 & ymm13) | ymm11
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0],ymm9[1],ymm8[2,3,4,5],ymm9[6],ymm8[7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm8
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm9
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm11, %xmm6
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm13 = xmm12[2,2,2,2,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm13[1],xmm6[2,3],xmm13[4],xmm6[5,6,7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[4,5,0,1,12,13,24,25,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = (ymm6 & ymm14) | ymm13
; AVX512DQ-FCP-NEXT: movw $31, %ax
; AVX512DQ-FCP-NEXT: kmovw %eax, %k1
-; AVX512DQ-FCP-NEXT: vinserti32x8 $0, %ymm5, %zmm0, %zmm0 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm5
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm11
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0],ymm11[1],ymm5[2,3],ymm11[4],ymm5[5,6],ymm11[7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm5
-; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm5, %xmm14
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,3,2,1]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm11, %xmm7
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm14[4],xmm7[5],xmm14[6,7]
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm14
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm10
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0],ymm10[1],ymm14[2,3,4,5],ymm10[6],ymm14[7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm14[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0,1,2,3,4],ymm7[5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7
+; AVX512DQ-FCP-NEXT: vinserti32x8 $0, %ymm6, %zmm0, %zmm0 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm9
+; AVX512DQ-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # ymm6 = mem[0],ymm9[1],mem[2,3],ymm9[4],mem[5,6],ymm9[7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6
+; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm6, %xmm13
+; AVX512DQ-FCP-NEXT: vpermd %zmm17, %zmm10, %zmm10
+; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm10, %xmm5
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm13[4],xmm5[5],xmm13[6,7]
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm9
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm13
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0],ymm9[1],ymm13[2,3,4,5],ymm9[6],ymm13[7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm13[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4],ymm5[5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm9, %xmm9
-; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm8, %xmm8
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3],xmm9[4],xmm8[5,6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm12, %xmm12
+; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm11, %xmm11
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3],xmm12[4],xmm11[5,6,7]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm8 & ymm13)
-; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm11 & ymm14)
+; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm4, %xmm4
; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3],xmm1[4],xmm3[5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm4, %xmm4
-; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm6
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4],xmm6[5],xmm4[6,7]
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm4, %zmm1
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm7, %xmm7
+; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm7[4],xmm1[5],xmm7[6,7]
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm3, %zmm1, %zmm1
; AVX512DQ-FCP-NEXT: vinserti32x8 $0, %ymm2, %zmm0, %zmm1 {%k1}
-; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm2
-; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm11, %xmm3
+; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm2
+; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm10, %xmm3
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5],xmm2[6,7]
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm14[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm13[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
@@ -6062,13 +6023,13 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm3, (%rdx)
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm17 & (zmm7 ^ zmm0))
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm17 & (zmm2 ^ zmm1))
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, (%rcx)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, (%r8)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, (%r9)
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm19 & (zmm5 ^ zmm0))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm19 & (zmm2 ^ zmm1))
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, (%rcx)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, (%r8)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%r9)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rax)
-; AVX512DQ-FCP-NEXT: popq %rax
+; AVX512DQ-FCP-NEXT: addq $40, %rsp
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
@@ -8608,7 +8569,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
;
; AVX2-LABEL: load_i16_stride6_vf64:
; AVX2: # %bb.0:
-; AVX2-NEXT: subq $1272, %rsp # imm = 0x4F8
+; AVX2-NEXT: subq $1224, %rsp # imm = 0x4C8
; AVX2-NEXT: vmovdqa 64(%rdi), %ymm0
; AVX2-NEXT: vmovdqa 96(%rdi), %ymm1
; AVX2-NEXT: vmovaps 672(%rdi), %ymm2
@@ -8621,58 +8582,58 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqa 480(%rdi), %ymm6
; AVX2-NEXT: vmovdqa 448(%rdi), %ymm7
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm7[2,3],ymm6[2,3]
-; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[0,1],ymm6[0,1]
-; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm5[2,3],ymm4[2,3]
-; AVX2-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm7[2,3],ymm6[2,3]
+; AVX2-NEXT: vmovdqu %ymm11, (%rsp) # 32-byte Spill
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm7[0,1],ymm6[0,1]
+; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm5[2,3],ymm4[2,3]
+; AVX2-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm15 = ymm5[0,1],ymm4[0,1]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm3[2,3],ymm2[2,3]
; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm3[0,1],ymm2[0,1]
; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm0[2,3],ymm1[2,3]
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm0[2,3],ymm1[2,3]
+; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm0[0,1],ymm1[0,1]
; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm0[0,1],ymm1[0,1]
-; AVX2-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29]
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
-; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm0
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,2,3,0,1,12,13,8,9,12,13,12,13,14,15]
+; AVX2-NEXT: vpshufb %xmm7, %xmm1, %xmm0
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm5
; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[2,2,2,2,4,5,6,7]
; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0],ymm7[1],ymm10[2,3,4,5],ymm7[6],ymm10[7]
-; AVX2-NEXT: vpshufb %ymm2, %ymm4, %ymm7
+; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0],ymm6[1],ymm11[2,3,4,5],ymm6[6],ymm11[7]
+; AVX2-NEXT: vpshufb %ymm2, %ymm4, %ymm6
; AVX2-NEXT: vpmovsxbw {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0]
-; AVX2-NEXT: vpblendvb %ymm0, %ymm3, %ymm7, %ymm3
+; AVX2-NEXT: vpblendvb %ymm0, %ymm3, %ymm6, %ymm3
; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqa (%rdi), %ymm3
; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovdqa 32(%rdi), %ymm7
-; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1],ymm3[2],ymm7[3,4],ymm3[5],ymm7[6,7]
-; AVX2-NEXT: vpshufb %xmm6, %xmm9, %xmm3
-; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm11
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm11[2,2,2,2,4,5,6,7]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm7[1],xmm3[2,3],xmm7[4],xmm3[5,6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7]
-; AVX2-NEXT: vpshufb %ymm2, %ymm10, %ymm7
-; AVX2-NEXT: vpblendvb %ymm0, %ymm3, %ymm7, %ymm3
+; AVX2-NEXT: vmovdqa 32(%rdi), %ymm6
+; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm3[2],ymm6[3,4],ymm3[5],ymm6[6,7]
+; AVX2-NEXT: vpshufb %xmm7, %xmm6, %xmm3
+; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm11
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm11[2,2,2,2,4,5,6,7]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2,3],xmm8[4],xmm3[5,6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0],ymm12[1],ymm10[2,3,4,5],ymm12[6],ymm10[7]
+; AVX2-NEXT: vpshufb %ymm2, %ymm9, %ymm8
+; AVX2-NEXT: vpblendvb %ymm0, %ymm3, %ymm8, %ymm3
; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovdqa 224(%rdi), %ymm7
-; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovdqa 224(%rdi), %ymm8
+; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqa 192(%rdi), %ymm3
; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm3[2],ymm7[3,4],ymm3[5],ymm7[6,7]
-; AVX2-NEXT: vpshufb %xmm6, %xmm3, %xmm7
-; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm8
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm12 = xmm8[2,2,2,2,4,5,6,7]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm12 = xmm7[0],xmm12[1],xmm7[2,3],xmm12[4],xmm7[5,6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm3[2],ymm8[3,4],ymm3[5],ymm8[6,7]
+; AVX2-NEXT: vpshufb %xmm7, %xmm3, %xmm8
+; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm10
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm12 = xmm10[2,2,2,2,4,5,6,7]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm12 = xmm8[0],xmm12[1],xmm8[2,3],xmm12[4],xmm8[5,6,7]
; AVX2-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm13[0],ymm15[1],ymm13[2,3,4,5],ymm15[6],ymm13[7]
-; AVX2-NEXT: vpshufb %ymm2, %ymm7, %ymm13
+; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0],ymm15[1],ymm14[2,3,4,5],ymm15[6],ymm14[7]
+; AVX2-NEXT: vpshufb %ymm2, %ymm8, %ymm13
; AVX2-NEXT: vpblendvb %ymm0, %ymm12, %ymm13, %ymm12
; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqa 608(%rdi), %ymm13
@@ -8680,9 +8641,9 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vmovdqa 576(%rdi), %ymm12
; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7]
-; AVX2-NEXT: vpshufb %xmm6, %xmm12, %xmm13
-; AVX2-NEXT: vextracti128 $1, %ymm12, %xmm6
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm14 = xmm6[2,2,2,2,4,5,6,7]
+; AVX2-NEXT: vpshufb %xmm7, %xmm12, %xmm13
+; AVX2-NEXT: vextracti128 $1, %ymm12, %xmm7
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm14 = xmm7[2,2,2,2,4,5,6,7]
; AVX2-NEXT: vpblendw {{.*#+}} xmm14 = xmm13[0],xmm14[1],xmm13[2,3],xmm14[4],xmm13[5,6,7]
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
; AVX2-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
@@ -8691,31 +8652,31 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vpblendvb %ymm0, %ymm14, %ymm2, %ymm2
; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15]
-; AVX2-NEXT: vpshufb %xmm2, %xmm9, %xmm9
+; AVX2-NEXT: vpshufb %xmm2, %xmm6, %xmm6
; AVX2-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,1,2,3]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm11 = xmm9[0],xmm11[1],xmm9[2,3],xmm11[4],xmm9[5,6,7]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31]
-; AVX2-NEXT: vpshufb %ymm9, %ymm10, %ymm10
-; AVX2-NEXT: vpblendvb %ymm0, %ymm11, %ymm10, %ymm10
-; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vpblendw {{.*#+}} xmm11 = xmm6[0],xmm11[1],xmm6[2,3],xmm11[4],xmm6[5,6,7]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31]
+; AVX2-NEXT: vpshufb %ymm6, %ymm9, %ymm9
+; AVX2-NEXT: vpblendvb %ymm0, %ymm11, %ymm9, %ymm9
+; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,3]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5]
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3],xmm5[4],xmm1[5,6,7]
-; AVX2-NEXT: vpshufb %ymm9, %ymm4, %ymm4
+; AVX2-NEXT: vpshufb %ymm6, %ymm4, %ymm4
; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm1
; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[1,1,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[1,1,2,3]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5]
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6,7]
-; AVX2-NEXT: vpshufb %ymm9, %ymm7, %ymm3
+; AVX2-NEXT: vpshufb %ymm6, %ymm8, %ymm3
; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm1
; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpshufb %ymm9, %ymm13, %ymm1
+; AVX2-NEXT: vpshufb %ymm6, %ymm13, %ymm1
; AVX2-NEXT: vpshufb %xmm2, %xmm12, %xmm2
-; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[1,1,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[1,1,2,3]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5]
; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7]
; AVX2-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm0
@@ -8727,11 +8688,11 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,3]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,12,13,12,13,4,5,8,9,10,11,12,13,14,15]
; AVX2-NEXT: vpshufb %xmm10, %xmm5, %xmm1
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload
; AVX2-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX2-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3,4,5],ymm1[6],mem[7]
; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -8845,7 +8806,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,0,4,5,6,7]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1,2],ymm13[3,4,5,6,7]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [6,7,2,3,4,5,6,7,2,3,14,15,12,13,6,7]
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX2-NEXT: vpshufb %xmm5, %xmm15, %xmm15
; AVX2-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,5,5,5]
@@ -8906,324 +8867,326 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm5 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm3 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
-; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm1 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[0,1,2,1]
+; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,0,3]
+; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,6,5,6,4]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [20,21,16,17,28,29,0,0,20,21,16,17,28,29,0,0,20,21,16,17,28,29,0,0,20,21,16,17,28,29,0,0]
+; AVX2-NEXT: vpshufb %ymm6, %ymm5, %ymm2
+; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
-; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; AVX2-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[2,1,2,3]
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,4]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7]
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[2,1,2,3]
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[0,3,2,1]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,0,2,3,4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,1,2,0,4,5,6,7]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
-; AVX2-NEXT: vpshufb %ymm8, %ymm3, %ymm2
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[0,3,2,1]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm8[0,0,2,3,4,5,6,7]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,6,6,6]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[2,1,2,0,4,5,6,7]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3],xmm2[4,5,6,7]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm10 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9]
+; AVX2-NEXT: vpshufb %xmm10, %xmm5, %xmm3
+; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
; AVX2-NEXT: # ymm3 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7]
-; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
-; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3]
+; AVX2-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm1 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[0,1,2,1]
; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,4]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7]
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,2,3]
-; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,0,3]
; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,0,4,5,6,7]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb %ymm8, %ymm3, %ymm2
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,6,5,6,4]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-NEXT: vpshufb %ymm6, %ymm3, %ymm2
+; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[2,1,2,3]
+; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
+; AVX2-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,6,6,6]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[2,1,2,0,4,5,6,7]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1,2],xmm4[3],xmm2[4,5,6,7]
+; AVX2-NEXT: vpshufb %xmm10, %xmm3, %xmm4
+; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm4[5,6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm9 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
+; AVX2-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm12 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,1,2,1]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,0,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[0,0,0,0,4,5,6,7]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,6,5,6,4]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7]
+; AVX2-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm1 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,1,2,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[2,1,0,3]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,0,0,0,4,5,6,7]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,6,5,6,4]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-NEXT: vpshufb %ymm6, %ymm12, %ymm2
+; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,1,2,3]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,3,2,1]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm12 = xmm1[0,0,2,3,4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,3,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm13 = xmm2[2,1,2,0,4,5,6,7]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0],xmm12[1,2],xmm13[3],xmm12[4,5,6,7]
-; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX2-NEXT: vpshufb %ymm8, %ymm9, %ymm13
-; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm13[0,1,2],ymm4[3,4,5,6,7],ymm13[8,9,10],ymm4[11,12,13,14,15]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,6,5,4]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm13[5,6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm1 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,2,3]
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm11 = xmm1[0,0,2,3,4,5,6,7]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,6,6,6,6]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm14 = xmm2[2,1,2,0,4,5,6,7]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm11 = xmm14[0],xmm11[1,2],xmm14[3],xmm11[4,5,6,7]
+; AVX2-NEXT: vpshufb %xmm10, %xmm12, %xmm14
+; AVX2-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3,4],xmm14[5,6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm7[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm4 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
-; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm12
-; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,1]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm14 = xmm4[2,1,0,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm14[0,0,0,0,4,5,6,7]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,4,6,7]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm13 = xmm12[0,1,2,3,6,5,6,4]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm13[4],xmm4[5,6],xmm13[7]
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm15 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm15 = mem[0,1],ymm4[2],mem[3],ymm4[4],mem[5,6],ymm4[7]
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm4 = ymm4[0,1],mem[2],ymm4[3,4],mem[5],ymm4[6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm13 = xmm4[2,1,2,3]
-; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm4
-; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,1]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm10 = xmm4[0,0,2,3,4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,3,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm11 = xmm13[2,1,2,0,4,5,6,7]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1,2],xmm11[3],xmm10[4,5,6,7]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb %ymm8, %ymm15, %ymm8
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6,7],ymm8[8,9,10],ymm0[11,12,13,14,15]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,5,4]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2,3,4],xmm8[5,6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm7 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
+; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm11
+; AVX2-NEXT: vpshufd {{.*#+}} xmm14 = xmm11[0,1,2,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm15 = xmm7[2,1,0,3]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm15[0,0,0,0,4,5,6,7]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,6,7]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm11 = xmm14[0,1,2,3,6,5,6,4]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm11[4],xmm7[5,6],xmm11[7]
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm11 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
+; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX2-NEXT: vpshufb %ymm6, %ymm11, %ymm6
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2],ymm7[3,4,5,6,7],ymm6[8,9,10],ymm7[11,12,13,14,15]
+; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX2-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm6 = ymm6[0,1],mem[2],ymm6[3,4],mem[5],ymm6[6,7]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[2,1,2,3]
+; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm6
+; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,1]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm13 = xmm6[0,0,2,3,4,5,6,7]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,6,6,6,6]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm10 = xmm7[2,1,2,0,4,5,6,7]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm13[1,2],xmm10[3],xmm13[4,5,6,7]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm13 = xmm11[4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm13[5,6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[0,1,2,3,7,5,6,5]
-; AVX2-NEXT: vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
-; AVX2-NEXT: # xmm8 = mem[1,1,1,1,4,5,6,7]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,7,7]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1,2,3],xmm0[4],xmm8[5,6],xmm0[7]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,3,4,5,6,7]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2],xmm7[3],xmm6[4,5,6,7]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19]
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX2-NEXT: vpshufb %ymm7, %ymm8, %ymm8
+; AVX2-NEXT: vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
+; AVX2-NEXT: # xmm10 = mem[1,1,1,1,4,5,6,7]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,7,7]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0,1,2,3],xmm0[4],xmm10[5,6],xmm0[7]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6,7],ymm8[8,9,10],ymm0[11,12,13,14,15]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,3,2]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm8[5,6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm13 = [22,23,18,19,30,31,0,0,22,23,18,19,30,31,0,0,22,23,18,19,30,31,0,0,22,23,18,19,30,31,0,0]
+; AVX2-NEXT: vpshufb %ymm13, %ymm5, %ymm10
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7],ymm10[8,9,10],ymm0[11,12,13,14,15]
+; AVX2-NEXT: vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
+; AVX2-NEXT: # xmm10 = mem[3,1,2,1,4,5,6,7]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,3,3,4,5,6,7]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,7,7,7]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm8[1,2],xmm10[3],xmm8[4,5,6,7]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11]
+; AVX2-NEXT: vpshufb %xmm8, %xmm5, %xmm5
+; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0,1,2,3,4],xmm5[5,6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,7,5,6,5]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,1,1,1,4,5,6,7]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4],xmm3[5,6],xmm0[7]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,7,5,6,5]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[1,1,1,1,4,5,6,7]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4],xmm4[5,6],xmm0[7]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb %ymm13, %ymm12, %ymm4
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7],ymm4[8,9,10],ymm0[11,12,13,14,15]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,1,4,5,6,7]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,3,4,5,6,7]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7]
-; AVX2-NEXT: vpshufb %ymm7, %ymm9, %ymm2
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX2-NEXT: # xmm0 = mem[0,1,2,3,7,5,6,5]
-; AVX2-NEXT: vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; AVX2-NEXT: # xmm1 = mem[1,1,1,1,4,5,6,7]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,7]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6],xmm0[7]
-; AVX2-NEXT: vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; AVX2-NEXT: # xmm1 = mem[3,1,2,1,4,5,6,7]
-; AVX2-NEXT: vpshuflw $244, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
-; AVX2-NEXT: # xmm2 = mem[0,1,3,3,4,5,6,7]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2],xmm1[3],xmm2[4,5,6,7]
-; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-NEXT: vpshufb %ymm7, %ymm2, %ymm2
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2]
+; AVX2-NEXT: vpshufb %xmm8, %xmm12, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm12[0,1,2,3,7,5,6,5]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[1,1,1,1,4,5,6,7]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,7]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6],xmm0[7]
-; AVX2-NEXT: vpshufb %ymm7, %ymm15, %ymm1
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[3,1,2,1,4,5,6,7]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[0,1,3,3,4,5,6,7]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3],xmm3[4,5,6,7]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; AVX2-NEXT: # xmm1 = mem[0,1,2,3,7,5,6,5]
+; AVX2-NEXT: vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; AVX2-NEXT: # xmm2 = mem[1,1,1,1,4,5,6,7]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4],xmm2[5,6],xmm1[7]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-NEXT: vpshufb %ymm13, %ymm3, %ymm2
+; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
+; AVX2-NEXT: vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; AVX2-NEXT: # xmm2 = mem[3,1,2,1,4,5,6,7]
+; AVX2-NEXT: vpshuflw $244, (%rsp), %xmm4 # 16-byte Folded Reload
+; AVX2-NEXT: # xmm4 = mem[0,1,3,3,4,5,6,7]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,7,7,7]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2],xmm2[3],xmm4[4,5,6,7]
+; AVX2-NEXT: vpshufb %xmm8, %xmm3, %xmm3
+; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm14[0,1,2,3,7,5,6,5]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[1,1,1,1,4,5,6,7]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7]
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX2-NEXT: vpshufb %ymm13, %ymm11, %ymm3
+; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15]
+; AVX2-NEXT: vpshufb %xmm8, %xmm11, %xmm3
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[3,1,2,1,4,5,6,7]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[0,1,3,3,4,5,6,7]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2],xmm4[3],xmm5[4,5,6,7]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3,4],xmm3[5,6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm2[4,5,6,7]
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[0,3,2,1]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[0,1,0,2,4,5,6,7]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
-; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm9 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13]
-; AVX2-NEXT: vpshufb %xmm9, %xmm1, %xmm3
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5],xmm3[6,7]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm3 = mem[0,1,2,3,4],ymm0[5,6,7]
+; AVX2-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm2 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm6
+; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,3,2,1]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[0,1,0,2,4,5,6,7]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,6,6,6]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,8,9,12,13,0,1,12,13]
+; AVX2-NEXT: vpshufb %xmm8, %xmm6, %xmm4
+; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4],xmm2[5],xmm4[6,7]
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm4 = mem[0,1,2,3,4],ymm2[5,6,7]
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm11
-; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[0,3,2,1]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm12[0,1,0,2,4,5,6,7]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
-; AVX2-NEXT: vpshufb %xmm9, %xmm11, %xmm4
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4],xmm0[5],xmm4[6,7]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm4 = mem[0,1,2,3,4],ymm0[5,6,7]
+; AVX2-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm2 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm11
+; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[0,3,2,1]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[0,1,0,2,4,5,6,7]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,6,6,6]
+; AVX2-NEXT: vpshufb %xmm8, %xmm11, %xmm5
+; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4],xmm2[5],xmm5[6,7]
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm5 = mem[0,1,2,3,4],ymm2[5,6,7]
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm13
-; AVX2-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[0,3,2,1]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm14[0,1,0,2,4,5,6,7]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
-; AVX2-NEXT: vpshufb %xmm9, %xmm13, %xmm10
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm10[4],xmm0[5],xmm10[6,7]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm2 = mem[0,1,2,3,4],ymm0[5,6,7]
+; AVX2-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm2 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
+; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm13
+; AVX2-NEXT: vpshufd {{.*#+}} xmm14 = xmm2[0,3,2,1]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[0,1,0,2,4,5,6,7]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,6,6,6]
+; AVX2-NEXT: vpshufb %xmm8, %xmm13, %xmm9
+; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm9[4],xmm2[5],xmm9[6,7]
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7]
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm10 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
-; AVX2-NEXT: vextracti128 $1, %ymm10, %xmm15
-; AVX2-NEXT: vpshufb %xmm9, %xmm15, %xmm9
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[0,3,2,1]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm10 = xmm0[0,1,0,2,4,5,6,7]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,6,6,6]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3],xmm9[4],xmm10[5],xmm9[6,7]
-; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm10 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm10 = mem[0,1,2,3,4],ymm9[5,6,7]
-; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm9 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15]
-; AVX2-NEXT: vpshufb %xmm9, %xmm11, %xmm11
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,1,3,4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,3,3]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3],xmm11[4],xmm12[5],xmm11[6,7]
-; AVX2-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm11 = mem[0,1,2,3,4],ymm11[5,6,7]
-; AVX2-NEXT: vpshufb %xmm9, %xmm1, %xmm1
+; AVX2-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm9 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
+; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm15
+; AVX2-NEXT: vpshufb %xmm8, %xmm15, %xmm8
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[0,3,2,1]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[0,1,0,2,4,5,6,7]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,6,6,6]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2,3],xmm8[4],xmm9[5],xmm8[6,7]
+; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm9 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm9 = mem[0,1,2,3,4],ymm8[5,6,7]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,10,11,10,11,2,3,14,15]
+; AVX2-NEXT: vpshufb %xmm1, %xmm11, %xmm8
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm11 = xmm12[0,1,1,3,4,5,6,7]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,3]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm11[0,1,2,3],xmm8[4],xmm11[5],xmm8[6,7]
+; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm8 = mem[0,1,2,3,4],ymm8[5,6,7]
+; AVX2-NEXT: vpshufb %xmm1, %xmm6, %xmm6
; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7]
; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,3,3]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2,3],xmm1[4],xmm7[5],xmm1[6,7]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-NEXT: vpshufb %xmm9, %xmm13, %xmm7
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm12 = xmm14[0,1,1,3,4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,3,3]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm7 = xmm12[0,1,2,3],xmm7[4],xmm12[5],xmm7[6,7]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4],xmm7[5],xmm6[6,7]
+; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
+; AVX2-NEXT: # ymm6 = mem[0,1,2,3,4],ymm6[5,6,7]
+; AVX2-NEXT: vpshufb %xmm1, %xmm13, %xmm7
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm11 = xmm14[0,1,1,3,4,5,6,7]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,3]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm7 = xmm11[0,1,2,3],xmm7[4],xmm11[5],xmm7[6,7]
; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
; AVX2-NEXT: # ymm7 = mem[0,1,2,3,4],ymm7[5,6,7]
-; AVX2-NEXT: vpshufb %xmm9, %xmm15, %xmm9
+; AVX2-NEXT: vpshufb %xmm1, %xmm15, %xmm1
; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,3]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm9[4],xmm0[5],xmm9[6,7]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6,7]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7]
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm9, 96(%rsi)
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm9, 32(%rsi)
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm9, 64(%rsi)
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm9, (%rsi)
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm9, 96(%rdx)
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm9, 32(%rdx)
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm9, 64(%rdx)
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm9, (%rdx)
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm9, 32(%rcx)
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm9, 96(%rcx)
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm9, 64(%rcx)
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm9, (%rcx)
-; AVX2-NEXT: vmovdqa %ymm6, 96(%r8)
-; AVX2-NEXT: vmovdqa %ymm8, 32(%r8)
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm6, 64(%r8)
-; AVX2-NEXT: vmovdqa %ymm5, (%r8)
-; AVX2-NEXT: vmovdqa %ymm10, 96(%r9)
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm1, 96(%rsi)
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm1, 32(%rsi)
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm1, 64(%rsi)
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm1, (%rsi)
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm1, 96(%rdx)
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm1, 32(%rdx)
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm1, 64(%rdx)
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm1, (%rdx)
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm1, 32(%rcx)
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm1, 96(%rcx)
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm1, 64(%rcx)
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm1, (%rcx)
+; AVX2-NEXT: vmovdqa %ymm10, 96(%r8)
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm1, 32(%r8)
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm1, 64(%r8)
+; AVX2-NEXT: vmovdqa %ymm3, (%r8)
+; AVX2-NEXT: vmovdqa %ymm9, 96(%r9)
; AVX2-NEXT: vmovdqa %ymm2, 32(%r9)
-; AVX2-NEXT: vmovdqa %ymm4, (%r9)
-; AVX2-NEXT: vmovdqa %ymm3, 64(%r9)
+; AVX2-NEXT: vmovdqa %ymm5, (%r9)
+; AVX2-NEXT: vmovdqa %ymm4, 64(%r9)
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT: vmovdqa %ymm0, 96(%rax)
; AVX2-NEXT: vmovdqa %ymm7, 32(%rax)
-; AVX2-NEXT: vmovdqa %ymm1, 64(%rax)
-; AVX2-NEXT: vmovdqa %ymm11, (%rax)
-; AVX2-NEXT: addq $1272, %rsp # imm = 0x4F8
+; AVX2-NEXT: vmovdqa %ymm6, 64(%rax)
+; AVX2-NEXT: vmovdqa %ymm8, (%rax)
+; AVX2-NEXT: addq $1224, %rsp # imm = 0x4C8
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX2-FP-LABEL: load_i16_stride6_vf64:
; AVX2-FP: # %bb.0:
-; AVX2-FP-NEXT: subq $1304, %rsp # imm = 0x518
+; AVX2-FP-NEXT: subq $1272, %rsp # imm = 0x4F8
; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm0
; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm1
; AVX2-FP-NEXT: vmovaps 672(%rdi), %ymm2
@@ -9237,7 +9200,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: vmovdqa 480(%rdi), %ymm6
; AVX2-FP-NEXT: vmovdqa 448(%rdi), %ymm7
; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm7[2,3],ymm6[2,3]
-; AVX2-FP-NEXT: vmovdqu %ymm10, (%rsp) # 32-byte Spill
+; AVX2-FP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[0,1],ymm6[0,1]
; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm5[2,3],ymm4[2,3]
@@ -9251,98 +9214,98 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm0[0,1],ymm1[0,1]
; AVX2-FP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
-; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm0
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,2,3,0,1,12,13,8,9,12,13,12,13,14,15]
+; AVX2-FP-NEXT: vpshufb %xmm2, %xmm1, %xmm0
; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm5
-; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[2,2,2,2,4,5,6,7]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0],xmm4[1],xmm0[2,3],xmm4[4],xmm0[5,6,7]
+; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[2,2,2,2,4,5,6,7]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0],ymm7[1],ymm10[2,3,4,5],ymm7[6],ymm10[7]
-; AVX2-FP-NEXT: vpshufb %ymm2, %ymm4, %ymm7
+; AVX2-FP-NEXT: vpshufb %ymm6, %ymm4, %ymm7
; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0]
-; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm6, %ymm7, %ymm6
-; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm6
-; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm3, %ymm7, %ymm3
+; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm3
+; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm7
; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7]
-; AVX2-FP-NEXT: vpshufb %xmm3, %xmm7, %xmm6
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm3[2],ymm7[3,4],ymm3[5],ymm7[6,7]
+; AVX2-FP-NEXT: vpshufb %xmm2, %xmm7, %xmm3
; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm11
; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm11[2,2,2,2,4,5,6,7]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2,3],xmm8[4],xmm6[5,6,7]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2,3],xmm8[4],xmm3[5,6,7]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7]
-; AVX2-FP-NEXT: vpshufb %ymm2, %ymm10, %ymm8
-; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm6, %ymm8, %ymm6
-; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vpshufb %ymm6, %ymm10, %ymm8
+; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm3, %ymm8, %ymm3
+; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm8
; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm6
-; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2],ymm8[3,4],ymm6[5],ymm8[6,7]
-; AVX2-FP-NEXT: vpshufb %xmm3, %xmm6, %xmm8
-; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm9
+; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm3
+; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm3[2],ymm8[3,4],ymm3[5],ymm8[6,7]
+; AVX2-FP-NEXT: vpshufb %xmm2, %xmm3, %xmm8
+; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm9
; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm9[2,2,2,2,4,5,6,7]
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm12 = xmm8[0],xmm12[1],xmm8[2,3],xmm12[4],xmm8[5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0],ymm15[1],ymm13[2,3,4,5],ymm15[6],ymm13[7]
-; AVX2-FP-NEXT: vpshufb %ymm2, %ymm8, %ymm13
+; AVX2-FP-NEXT: vpshufb %ymm6, %ymm8, %ymm13
; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm12, %ymm13, %ymm12
; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqa 608(%rdi), %ymm13
; AVX2-FP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqa 576(%rdi), %ymm12
-; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vmovdqu %ymm12, (%rsp) # 32-byte Spill
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7]
-; AVX2-FP-NEXT: vpshufb %xmm3, %xmm12, %xmm13
-; AVX2-FP-NEXT: vextracti128 $1, %ymm12, %xmm3
-; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm14 = xmm3[2,2,2,2,4,5,6,7]
+; AVX2-FP-NEXT: vpshufb %xmm2, %xmm12, %xmm13
+; AVX2-FP-NEXT: vextracti128 $1, %ymm12, %xmm2
+; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm14 = xmm2[2,2,2,2,4,5,6,7]
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm14 = xmm13[0],xmm14[1],xmm13[2,3],xmm14[4],xmm13[5,6,7]
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
; AVX2-FP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm13 = mem[0],ymm13[1],mem[2,3,4,5],ymm13[6],mem[7]
-; AVX2-FP-NEXT: vpshufb %ymm2, %ymm13, %ymm2
-; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm14, %ymm2, %ymm2
-; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
-; AVX2-FP-NEXT: vpshufb %xmm2, %xmm11, %xmm11
-; AVX2-FP-NEXT: vpshufb %xmm2, %xmm7, %xmm7
+; AVX2-FP-NEXT: vpshufb %ymm6, %ymm13, %ymm6
+; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm14, %ymm6, %ymm6
+; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
+; AVX2-FP-NEXT: vpshufb %xmm6, %xmm11, %xmm11
+; AVX2-FP-NEXT: vpshufb %xmm6, %xmm7, %xmm7
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm11 = xmm7[0],xmm11[1],xmm7[2,3],xmm11[4],xmm7[5,6,7]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31]
; AVX2-FP-NEXT: vpshufb %ymm7, %ymm10, %ymm10
; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm11, %ymm10, %ymm10
; AVX2-FP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vpshufb %xmm2, %xmm5, %xmm5
-; AVX2-FP-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-FP-NEXT: vpshufb %xmm6, %xmm5, %xmm5
+; AVX2-FP-NEXT: vpshufb %xmm6, %xmm1, %xmm1
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3],xmm5[4],xmm1[5,6,7]
; AVX2-FP-NEXT: vpshufb %ymm7, %ymm4, %ymm4
; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm1
; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vpshufb %xmm2, %xmm9, %xmm1
-; AVX2-FP-NEXT: vpshufb %xmm2, %xmm6, %xmm4
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1],xmm4[2,3],xmm1[4],xmm4[5,6,7]
-; AVX2-FP-NEXT: vpshufb %ymm7, %ymm8, %ymm4
-; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm1
+; AVX2-FP-NEXT: vpshufb %xmm6, %xmm9, %xmm1
+; AVX2-FP-NEXT: vpshufb %xmm6, %xmm3, %xmm3
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3],xmm1[4],xmm3[5,6,7]
+; AVX2-FP-NEXT: vpshufb %ymm7, %ymm8, %ymm3
+; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm1
; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpshufb %ymm7, %ymm13, %ymm1
-; AVX2-FP-NEXT: vpshufb %xmm2, %xmm3, %xmm3
-; AVX2-FP-NEXT: vpshufb %xmm2, %xmm12, %xmm2
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7]
+; AVX2-FP-NEXT: vpshufb %xmm6, %xmm2, %xmm2
+; AVX2-FP-NEXT: vpshufb %xmm6, %xmm12, %xmm3
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6,7]
; AVX2-FP-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm0
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
-; AVX2-FP-NEXT: # ymm7 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
-; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm0
+; AVX2-FP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
+; AVX2-FP-NEXT: # ymm8 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm0
; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,1,0,3]
; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15]
-; AVX2-FP-NEXT: vpshufb %xmm1, %xmm7, %xmm0
+; AVX2-FP-NEXT: vpshufb %xmm1, %xmm8, %xmm0
; AVX2-FP-NEXT: vpshufb %xmm1, %xmm2, %xmm2
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
-; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload
+; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3,4,5],ymm2[6],mem[7]
; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -9364,12 +9327,12 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload
-; AVX2-FP-NEXT: # ymm6 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7]
-; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm2
+; AVX2-FP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm7 # 32-byte Folded Reload
+; AVX2-FP-NEXT: # ymm7 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm2
; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,1,0,3]
; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FP-NEXT: vpshufb %xmm1, %xmm6, %xmm2
+; AVX2-FP-NEXT: vpshufb %xmm1, %xmm7, %xmm2
; AVX2-FP-NEXT: vpshufb %xmm1, %xmm3, %xmm3
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7]
; AVX2-FP-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload
@@ -9382,9 +9345,9 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqa 320(%rdi), %ymm4
; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
-; AVX2-FP-NEXT: vpshufb %xmm0, %xmm8, %xmm3
-; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm4
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
+; AVX2-FP-NEXT: vpshufb %xmm0, %xmm6, %xmm3
+; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm4
; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vpshufb %xmm0, %xmm4, %xmm4
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3],xmm4[4,5],xmm3[6],xmm4[7]
@@ -9393,7 +9356,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload
+; AVX2-FP-NEXT: vpblendd $109, (%rsp), %ymm2, %ymm5 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm5 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm2
; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,1,0,3]
@@ -9447,14 +9410,14 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm12[0,1,2],ymm9[3,4,5,6,7],ymm12[8,9,10],ymm9[11,12,13,14,15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm9[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm12 = [6,7,2,3,12,13,14,15,6,7,2,3,12,13,14,15]
-; AVX2-FP-NEXT: vpshufb %xmm12, %xmm7, %xmm7
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm12 = [6,7,2,3,12,13,14,15,8,9,10,11,12,13,14,15]
+; AVX2-FP-NEXT: vpshufb %xmm12, %xmm8, %xmm8
; AVX2-FP-NEXT: vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
; AVX2-FP-NEXT: # xmm9 = mem[1,1,1,1,4,5,6,7]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm15 = xmm7[0,1],xmm9[2],xmm7[3,4],xmm9[5],xmm7[6],xmm9[7]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm15 = xmm8[0,1],xmm9[2],xmm8[3,4],xmm9[5],xmm8[6],xmm9[7]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT: vpshufb %ymm7, %ymm0, %ymm0
+; AVX2-FP-NEXT: vpshufb %ymm8, %ymm0, %ymm0
; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm9 = [2,3,6,7,4,5,0,1,10,11,14,15,12,13,14,15]
; AVX2-FP-NEXT: vpshufb %xmm9, %xmm15, %xmm15
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3,4,5,6,7]
@@ -9467,17 +9430,17 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vpshufb %xmm12, %xmm6, %xmm0
+; AVX2-FP-NEXT: vpshufb %xmm12, %xmm7, %xmm0
; AVX2-FP-NEXT: vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX2-FP-NEXT: # xmm1 = mem[1,1,1,1,4,5,6,7]
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6],xmm1[7]
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT: vpshufb %ymm7, %ymm1, %ymm1
+; AVX2-FP-NEXT: vpshufb %ymm8, %ymm1, %ymm1
; AVX2-FP-NEXT: vpshufb %xmm9, %xmm0, %xmm0
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FP-NEXT: vpshufb %xmm15, %xmm1, %xmm1
-; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm8[0,1,2,3,5,5,5,5]
+; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5]
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm6[3],xmm1[4,5],xmm6[6],xmm1[7]
; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
@@ -9488,7 +9451,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: # xmm1 = mem[1,1,1,1,4,5,6,7]
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6],xmm1[7]
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT: vpshufb %ymm7, %ymm1, %ymm1
+; AVX2-FP-NEXT: vpshufb %ymm8, %ymm1, %ymm1
; AVX2-FP-NEXT: vpshufb %xmm9, %xmm0, %xmm0
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
@@ -9502,7 +9465,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: vpshufb %xmm15, %xmm13, %xmm0
; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,5,5,5,5]
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
-; AVX2-FP-NEXT: vpshufb %ymm7, %ymm10, %ymm1
+; AVX2-FP-NEXT: vpshufb %ymm8, %ymm10, %ymm1
; AVX2-FP-NEXT: vpshufb %xmm12, %xmm2, %xmm2
; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm14[1,1,1,1,4,5,6,7]
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6],xmm3[7]
@@ -9512,44 +9475,9 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
-; AVX2-FP-NEXT: # ymm3 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
-; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-FP-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,0,3]
-; AVX2-FP-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill
-; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,2,1]
-; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u]
-; AVX2-FP-NEXT: vpshufb %xmm12, %xmm1, %xmm0
-; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,6,5,6,4]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7]
-; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-FP-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
-; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[0,3,2,1]
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[2,1,2,3]
-; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm10 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0]
-; AVX2-FP-NEXT: vpshufb %xmm10, %xmm9, %xmm1
-; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,1,2,0,4,5,6,7]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7]
-; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
-; AVX2-FP-NEXT: vpshufb %ymm11, %ymm3, %ymm2
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
-; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
-; AVX2-FP-NEXT: # ymm3 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7]
-; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
+; AVX2-FP-NEXT: # ymm14 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
@@ -9558,252 +9486,293 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,2,1]
; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,0,1,0,1,0,1,8,9,8,9,12,13,14,15]
; AVX2-FP-NEXT: vpshufb %xmm12, %xmm1, %xmm0
; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,6,5,6,4]
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7]
-; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-FP-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
-; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
-; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,2,3]
-; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FP-NEXT: vpshufb %xmm10, %xmm2, %xmm1
-; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[2,1,2,0,4,5,6,7]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7]
; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FP-NEXT: vpshufb %ymm11, %ymm3, %ymm2
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
-; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm9 = [20,21,16,17,28,29,0,0,20,21,16,17,28,29,0,0,20,21,16,17,28,29,0,0,20,21,16,17,28,29,0,0]
+; AVX2-FP-NEXT: vpshufb %ymm9, %ymm14, %ymm1
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
-; AVX2-FP-NEXT: # ymm13 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
+; AVX2-FP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
+; AVX2-FP-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm0
+; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
+; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,1,2,3]
+; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13]
+; AVX2-FP-NEXT: vpshufb %xmm5, %xmm0, %xmm2
+; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,0,4,5,6,7]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3],xmm2[4,5,6,7]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9]
+; AVX2-FP-NEXT: vpshufb %xmm13, %xmm14, %xmm3
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-FP-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,1,0,3]
-; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,1,2,1]
-; AVX2-FP-NEXT: vpshufb %xmm12, %xmm8, %xmm0
-; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,6,5,6,4]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7]
+; AVX2-FP-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
+; AVX2-FP-NEXT: # ymm7 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7]
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX2-FP-NEXT: # ymm1 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
-; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,3,2,1]
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
-; AVX2-FP-NEXT: vpshufb %xmm10, %xmm4, %xmm3
-; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[2,1,2,0,4,5,6,7]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1,2],xmm5[3],xmm3[4,5,6,7]
+; AVX2-FP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
+; AVX2-FP-NEXT: # ymm2 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
+; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[2,1,0,3]
+; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm2
+; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,1,2,1]
+; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FP-NEXT: vpshufb %xmm12, %xmm0, %xmm2
+; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,6,5,6,4]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5,6],xmm3[7]
; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-FP-NEXT: vpshufb %ymm11, %ymm13, %ymm5
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7],ymm5[8,9,10],ymm2[11,12,13,14,15]
-; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,4]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7]
+; AVX2-FP-NEXT: vpshufb %ymm9, %ymm7, %ymm3
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15]
+; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT: vpblendd $219, (%rsp), %ymm0, %ymm3 # 32-byte Folded Reload
+; AVX2-FP-NEXT: # ymm3 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[0,3,2,1]
+; AVX2-FP-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[2,1,2,3]
+; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FP-NEXT: vpshufb %xmm5, %xmm0, %xmm3
+; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[2,1,2,0,4,5,6,7]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2],xmm4[3],xmm3[4,5,6,7]
+; AVX2-FP-NEXT: vpshufb %xmm13, %xmm7, %xmm4
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm4[5,6,7]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
+; AVX2-FP-NEXT: # ymm8 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
+; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm2 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm14 = xmm2[2,1,0,3]
-; AVX2-FP-NEXT: vpshufb %xmm12, %xmm14, %xmm3
+; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm15 = xmm2[2,1,0,3]
; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm2
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[0,1,2,1]
-; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,6,5,6,4]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7]
+; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm10 = xmm2[0,1,2,1]
+; AVX2-FP-NEXT: vpshufb %xmm12, %xmm15, %xmm2
+; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm10[0,1,2,3,6,5,6,4]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5,6],xmm3[7]
+; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX2-FP-NEXT: vpshufb %ymm9, %ymm8, %ymm3
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15]
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
-; AVX2-FP-NEXT: # ymm15 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
+; AVX2-FP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
+; AVX2-FP-NEXT: # ymm3 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm2
+; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
+; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3]
+; AVX2-FP-NEXT: vpshufb %xmm5, %xmm2, %xmm6
+; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm3[2,1,2,0,4,5,6,7]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm11[0],xmm6[1,2],xmm11[3],xmm6[4,5,6,7]
+; AVX2-FP-NEXT: vpshufb %xmm13, %xmm8, %xmm11
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm11[5,6,7]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
-; AVX2-FP-NEXT: # ymm3 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
-; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm5
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,1]
-; AVX2-FP-NEXT: vpshufb %xmm10, %xmm5, %xmm0
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm10 = xmm3[2,1,2,3]
-; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[2,1,2,0,4,5,6,7]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2],xmm3[3],xmm0[4,5,6,7]
-; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-FP-NEXT: vpshufb %ymm11, %ymm15, %ymm3
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15]
-; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,4]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-FP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
+; AVX2-FP-NEXT: # ymm4 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
+; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,1,0,3]
+; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FP-NEXT: vpshufb %xmm12, %xmm0, %xmm6
+; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm4
+; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[0,1,2,1]
+; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,6,5,6,4]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4],xmm6[5,6],xmm4[7]
+; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload
+; AVX2-FP-NEXT: # ymm12 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
+; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX2-FP-NEXT: vpshufb %ymm9, %ymm12, %ymm6
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3,4,5,6,7],ymm6[8,9,10],ymm4[11,12,13,14,15]
+; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
+; AVX2-FP-NEXT: # ymm6 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm6, %xmm9
+; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,3,2,1]
+; AVX2-FP-NEXT: vpshufb %xmm5, %xmm9, %xmm5
+; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3]
+; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[2,1,2,0,4,5,6,7]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1,2],xmm0[3],xmm5[4,5,6,7]
+; AVX2-FP-NEXT: vpshufb %xmm13, %xmm12, %xmm5
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm5[5,6,7]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u]
-; AVX2-FP-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; AVX2-FP-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-FP-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
-; AVX2-FP-NEXT: # xmm3 = mem[0,1,2,3,7,5,6,5]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5,6],xmm3[7]
-; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0]
-; AVX2-FP-NEXT: vpshufb %xmm3, %xmm9, %xmm9
-; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm9 = xmm7[0],xmm9[1,2],xmm7[3],xmm9[4,5,6,7]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19]
-; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-FP-NEXT: vpshufb %ymm7, %ymm11, %ymm11
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm13 = [2,3,2,3,2,3,2,3,8,9,10,11,14,15,14,15]
+; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX2-FP-NEXT: vpshufb %xmm13, %xmm0, %xmm0
+; AVX2-FP-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
+; AVX2-FP-NEXT: # xmm4 = mem[0,1,2,3,7,5,6,5]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4],xmm0[5,6],xmm4[7]
; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7],ymm11[8,9,10],ymm0[11,12,13,14,15]
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,2]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm11[5,6,7]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FP-NEXT: vpbroadcastq {{.*#+}} ymm4 = [22,23,18,19,30,31,0,0,22,23,18,19,30,31,0,0,22,23,18,19,30,31,0,0,22,23,18,19,30,31,0,0]
+; AVX2-FP-NEXT: vpshufb %ymm4, %ymm14, %ymm5
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7],ymm5[8,9,10],ymm0[11,12,13,14,15]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15]
+; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm1
+; AVX2-FP-NEXT: vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
+; AVX2-FP-NEXT: # xmm11 = mem[3,1,2,1,4,5,6,7]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm1[1,2],xmm11[3],xmm1[4,5,6,7]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11]
+; AVX2-FP-NEXT: vpshufb %xmm1, %xmm14, %xmm14
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3,4],xmm14[5,6,7]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FP-NEXT: vpshufb %xmm2, %xmm8, %xmm0
-; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,5]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4],xmm0[5,6],xmm6[7]
-; AVX2-FP-NEXT: vpshufb %xmm3, %xmm4, %xmm4
-; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,1,4,5,6,7]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2],xmm1[3],xmm4[4,5,6,7]
-; AVX2-FP-NEXT: vpshufb %ymm7, %ymm13, %ymm4
+; AVX2-FP-NEXT: vpshufb %xmm13, %xmm15, %xmm0
+; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,7,5,6,5]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm10[4],xmm0[5,6],xmm10[7]
; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7],ymm4[8,9,10],ymm0[11,12,13,14,15]
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,2]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm4[5,6,7]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FP-NEXT: vpshufb %ymm4, %ymm8, %ymm10
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7],ymm10[8,9,10],ymm0[11,12,13,14,15]
+; AVX2-FP-NEXT: vpshufb %xmm5, %xmm2, %xmm2
+; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,1,4,5,6,7]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3],xmm2[4,5,6,7]
+; AVX2-FP-NEXT: vpshufb %xmm1, %xmm8, %xmm3
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-FP-NEXT: vpshufb %xmm2, %xmm0, %xmm1
-; AVX2-FP-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
-; AVX2-FP-NEXT: # xmm4 = mem[0,1,2,3,7,5,6,5]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5,6],xmm4[7]
+; AVX2-FP-NEXT: vpshufb %xmm13, %xmm0, %xmm0
+; AVX2-FP-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; AVX2-FP-NEXT: # xmm2 = mem[0,1,2,3,7,5,6,5]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5,6],xmm2[7]
+; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FP-NEXT: vpshufb %ymm4, %ymm7, %ymm2
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
+; AVX2-FP-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload
+; AVX2-FP-NEXT: vpshufb %xmm5, %xmm2, %xmm2
+; AVX2-FP-NEXT: vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
+; AVX2-FP-NEXT: # xmm3 = mem[3,1,2,1,4,5,6,7]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3],xmm2[4,5,6,7]
+; AVX2-FP-NEXT: vpshufb %xmm1, %xmm7, %xmm3
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-FP-NEXT: vpshufb %xmm3, %xmm0, %xmm4
-; AVX2-FP-NEXT: vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
-; AVX2-FP-NEXT: # xmm6 = mem[3,1,2,1,4,5,6,7]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0],xmm4[1,2],xmm6[3],xmm4[4,5,6,7]
-; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT: vpshufb %ymm7, %ymm0, %ymm6
-; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7],ymm6[8,9,10],ymm1[11,12,13,14,15]
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,2]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm6[5,6,7]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FP-NEXT: vpshufb %xmm2, %xmm14, %xmm1
-; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,7,5,6,5]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7]
-; AVX2-FP-NEXT: vpshufb %ymm7, %ymm15, %ymm2
-; AVX2-FP-NEXT: vpshufb %xmm3, %xmm5, %xmm3
-; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[3,1,2,1,4,5,6,7]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2],xmm4[3],xmm3[4,5,6,7]
-; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5,6,7]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-FP-NEXT: vpshufb %xmm13, %xmm0, %xmm0
+; AVX2-FP-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; AVX2-FP-NEXT: # xmm2 = mem[0,1,2,3,7,5,6,5]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5,6],xmm2[7]
+; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FP-NEXT: vpshufb %ymm4, %ymm12, %ymm2
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
+; AVX2-FP-NEXT: vpshufb %xmm1, %xmm12, %xmm1
+; AVX2-FP-NEXT: vpshufb %xmm5, %xmm9, %xmm2
+; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[3,1,2,1,4,5,6,7]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3],xmm2[4,5,6,7]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6,7]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX2-FP-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
-; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[0,3,2,1]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13]
-; AVX2-FP-NEXT: vpshufb %xmm7, %xmm2, %xmm1
-; AVX2-FP-NEXT: vpshufb %xmm7, %xmm6, %xmm4
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4],xmm4[5],xmm1[6,7]
+; AVX2-FP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-FP-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm7
+; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,3,2,1]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13]
+; AVX2-FP-NEXT: vpshufb %xmm0, %xmm7, %xmm1
+; AVX2-FP-NEXT: vpshufb %xmm0, %xmm8, %xmm2
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4],xmm2[5],xmm1[6,7]
; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm4 = mem[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX2-FP-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
-; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm11
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[0,3,2,1]
-; AVX2-FP-NEXT: vpshufb %xmm7, %xmm11, %xmm1
-; AVX2-FP-NEXT: vpshufb %xmm7, %xmm12, %xmm5
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4],xmm5[5],xmm1[6,7]
+; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX2-FP-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6],ymm1[7]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm6
+; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
+; AVX2-FP-NEXT: vpshufb %xmm0, %xmm6, %xmm2
+; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm5
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4],xmm5[5],xmm2[6,7]
+; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX2-FP-NEXT: # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7]
+; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX2-FP-NEXT: # ymm5 = mem[0],ymm5[1],mem[2,3],ymm5[4],mem[5,6],ymm5[7]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm11
+; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[0,3,2,1]
+; AVX2-FP-NEXT: vpshufb %xmm0, %xmm11, %xmm5
+; AVX2-FP-NEXT: vpshufb %xmm0, %xmm12, %xmm9
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm9[0,1,2,3],xmm5[4],xmm9[5],xmm5[6,7]
+; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX2-FP-NEXT: # ymm5 = mem[0,1,2,3,4],ymm5[5,6,7]
+; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
+; AVX2-FP-NEXT: # ymm9 = mem[0],ymm9[1],mem[2,3],ymm9[4],mem[5,6],ymm9[7]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm9, %xmm13
+; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm15 = xmm9[0,3,2,1]
+; AVX2-FP-NEXT: vpshufb %xmm0, %xmm13, %xmm9
+; AVX2-FP-NEXT: vpshufb %xmm0, %xmm15, %xmm0
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm9[4],xmm0[5],xmm9[6,7]
+; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
+; AVX2-FP-NEXT: # ymm9 = mem[0,1,2,3,4],ymm0[5,6,7]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15]
+; AVX2-FP-NEXT: vpshufb %xmm0, %xmm6, %xmm6
+; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm1
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4],xmm1[5],xmm6[6,7]
; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload
-; AVX2-FP-NEXT: # ymm5 = mem[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX2-FP-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
-; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm13
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[0,3,2,1]
-; AVX2-FP-NEXT: vpshufb %xmm7, %xmm13, %xmm1
-; AVX2-FP-NEXT: vpshufb %xmm7, %xmm14, %xmm10
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0,1,2,3],xmm1[4],xmm10[5],xmm1[6,7]
+; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
+; AVX2-FP-NEXT: # ymm6 = mem[0,1,2,3,4],ymm1[5,6,7]
+; AVX2-FP-NEXT: vpshufb %xmm0, %xmm7, %xmm1
+; AVX2-FP-NEXT: vpshufb %xmm0, %xmm8, %xmm7
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2,3],xmm1[4],xmm7[5],xmm1[6,7]
; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
-; AVX2-FP-NEXT: # ymm10 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
-; AVX2-FP-NEXT: vextracti128 $1, %ymm10, %xmm15
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[0,3,2,1]
-; AVX2-FP-NEXT: vpshufb %xmm7, %xmm15, %xmm10
-; AVX2-FP-NEXT: vpshufb %xmm7, %xmm0, %xmm7
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm10[4],xmm7[5],xmm10[6,7]
+; AVX2-FP-NEXT: vpshufb %xmm0, %xmm11, %xmm7
+; AVX2-FP-NEXT: vpshufb %xmm0, %xmm12, %xmm8
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3],xmm7[4],xmm8[5],xmm7[6,7]
; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm10 # 32-byte Folded Reload
-; AVX2-FP-NEXT: # ymm10 = mem[0,1,2,3,4],ymm7[5,6,7]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15]
-; AVX2-FP-NEXT: vpshufb %xmm7, %xmm11, %xmm11
-; AVX2-FP-NEXT: vpshufb %xmm7, %xmm12, %xmm12
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3],xmm11[4],xmm12[5],xmm11[6,7]
-; AVX2-FP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
-; AVX2-FP-NEXT: # ymm11 = mem[0,1,2,3,4],ymm11[5,6,7]
-; AVX2-FP-NEXT: vpshufb %xmm7, %xmm2, %xmm2
-; AVX2-FP-NEXT: vpshufb %xmm7, %xmm6, %xmm6
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1,2,3],xmm2[4],xmm6[5],xmm2[6,7]
-; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX2-FP-NEXT: # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-FP-NEXT: vpshufb %xmm7, %xmm13, %xmm6
-; AVX2-FP-NEXT: vpshufb %xmm7, %xmm14, %xmm12
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0,1,2,3],xmm6[4],xmm12[5],xmm6[6,7]
-; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
-; AVX2-FP-NEXT: # ymm6 = mem[0,1,2,3,4],ymm6[5,6,7]
-; AVX2-FP-NEXT: vpshufb %xmm7, %xmm15, %xmm12
-; AVX2-FP-NEXT: vpshufb %xmm7, %xmm0, %xmm0
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm12[4],xmm0[5],xmm12[6,7]
+; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
+; AVX2-FP-NEXT: # ymm7 = mem[0,1,2,3,4],ymm7[5,6,7]
+; AVX2-FP-NEXT: vpshufb %xmm0, %xmm13, %xmm8
+; AVX2-FP-NEXT: vpshufb %xmm0, %xmm15, %xmm0
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm8[4],xmm0[5],xmm8[6,7]
; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7]
-; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FP-NEXT: vmovaps %ymm7, 96(%rsi)
-; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FP-NEXT: vmovaps %ymm7, 32(%rsi)
-; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FP-NEXT: vmovaps %ymm7, 64(%rsi)
-; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FP-NEXT: vmovaps %ymm7, (%rsi)
-; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FP-NEXT: vmovaps %ymm7, 96(%rdx)
-; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FP-NEXT: vmovaps %ymm7, 32(%rdx)
-; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FP-NEXT: vmovaps %ymm7, 64(%rdx)
-; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FP-NEXT: vmovaps %ymm7, (%rdx)
-; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FP-NEXT: vmovaps %ymm7, 32(%rcx)
-; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FP-NEXT: vmovaps %ymm7, 96(%rcx)
-; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FP-NEXT: vmovaps %ymm7, 64(%rcx)
-; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FP-NEXT: vmovaps %ymm7, (%rcx)
-; AVX2-FP-NEXT: vmovdqa %ymm8, 96(%r8)
-; AVX2-FP-NEXT: vmovdqa %ymm9, 32(%r8)
-; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FP-NEXT: vmovaps %ymm7, 64(%r8)
+; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FP-NEXT: vmovaps %ymm8, 96(%rsi)
+; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FP-NEXT: vmovaps %ymm8, 32(%rsi)
+; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FP-NEXT: vmovaps %ymm8, 64(%rsi)
+; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FP-NEXT: vmovaps %ymm8, (%rsi)
+; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FP-NEXT: vmovaps %ymm8, 96(%rdx)
+; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FP-NEXT: vmovaps %ymm8, 32(%rdx)
+; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FP-NEXT: vmovaps %ymm8, 64(%rdx)
+; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FP-NEXT: vmovaps %ymm8, (%rdx)
+; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FP-NEXT: vmovaps %ymm8, 32(%rcx)
+; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FP-NEXT: vmovaps %ymm8, 96(%rcx)
+; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FP-NEXT: vmovaps %ymm8, 64(%rcx)
+; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FP-NEXT: vmovaps %ymm8, (%rcx)
+; AVX2-FP-NEXT: vmovdqa %ymm10, 96(%r8)
+; AVX2-FP-NEXT: vmovdqa %ymm14, 32(%r8)
+; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FP-NEXT: vmovaps %ymm8, 64(%r8)
; AVX2-FP-NEXT: vmovdqa %ymm3, (%r8)
-; AVX2-FP-NEXT: vmovdqa %ymm10, 96(%r9)
-; AVX2-FP-NEXT: vmovdqa %ymm1, 32(%r9)
-; AVX2-FP-NEXT: vmovdqa %ymm5, (%r9)
+; AVX2-FP-NEXT: vmovdqa %ymm9, 96(%r9)
+; AVX2-FP-NEXT: vmovdqa %ymm5, 32(%r9)
+; AVX2-FP-NEXT: vmovdqa %ymm2, (%r9)
; AVX2-FP-NEXT: vmovdqa %ymm4, 64(%r9)
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FP-NEXT: vmovdqa %ymm0, 96(%rax)
-; AVX2-FP-NEXT: vmovdqa %ymm6, 32(%rax)
-; AVX2-FP-NEXT: vmovdqa %ymm2, 64(%rax)
-; AVX2-FP-NEXT: vmovdqa %ymm11, (%rax)
-; AVX2-FP-NEXT: addq $1304, %rsp # imm = 0x518
+; AVX2-FP-NEXT: vmovdqa %ymm7, 32(%rax)
+; AVX2-FP-NEXT: vmovdqa %ymm1, 64(%rax)
+; AVX2-FP-NEXT: vmovdqa %ymm6, (%rax)
+; AVX2-FP-NEXT: addq $1272, %rsp # imm = 0x4F8
; AVX2-FP-NEXT: vzeroupper
; AVX2-FP-NEXT: retq
;
@@ -9819,16 +9788,16 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %ymm8
; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %ymm9
-; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa 480(%rdi), %ymm6
; AVX2-FCP-NEXT: vmovdqa 448(%rdi), %ymm7
; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm10 = ymm7[2,3],ymm6[2,3]
; AVX2-FCP-NEXT: vmovdqu %ymm10, (%rsp) # 32-byte Spill
; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[0,1],ymm6[0,1]
; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm5[2,3],ymm4[2,3]
+; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm15 = ymm5[2,3],ymm4[2,3]
+; AVX2-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm13 = ymm5[0,1],ymm4[0,1]
; AVX2-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm15 = ymm5[0,1],ymm4[0,1]
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm3[2,3],ymm2[2,3]
; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm3[0,1],ymm2[0,1]
@@ -9837,43 +9806,43 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm14 = ymm0[0,1],ymm1[0,1]
; AVX2-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29]
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29]
+; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7]
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm0
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm5
-; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[2,2,2,2,4,5,6,7]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0],xmm4[1],xmm0[2,3],xmm4[4],xmm0[5,6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0],ymm7[1],ymm10[2,3,4,5],ymm7[6],ymm10[7]
-; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm7
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,2,3,0,1,12,13,8,9,12,13,12,13,14,15]
+; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm0
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4
+; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[2,2,2,2,4,5,6,7]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm7[1],ymm10[2,3,4,5],ymm7[6],ymm10[7]
+; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm7
; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0]
-; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm6, %ymm7, %ymm6
-; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm6
-; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm7
+; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm5, %ymm7, %ymm5
+; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm7
; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7]
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm6
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm11
+; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm5
+; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm7[2],ymm5[3,4],ymm7[5],ymm5[6,7]
+; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm7
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm11
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm11[2,2,2,2,4,5,6,7]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2,3],xmm8[4],xmm6[5,6,7]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2,3],xmm8[4],xmm7[5,6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0],ymm14[1],ymm12[2,3,4,5],ymm14[6],ymm12[7]
-; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm10, %ymm8
-; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm6, %ymm8, %ymm6
-; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm10, %ymm8
+; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm7, %ymm8, %ymm7
+; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm8
; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm6
-; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2],ymm8[3,4],ymm6[5],ymm8[6,7]
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm8
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm9
+; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm7
+; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7]
+; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm8
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm9
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm9[2,2,2,2,4,5,6,7]
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm8[0],xmm12[1],xmm8[2,3],xmm12[4],xmm8[5,6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0],ymm15[1],ymm13[2,3,4,5],ymm15[6],ymm13[7]
-; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm8, %ymm13
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0],ymm13[1],ymm15[2,3,4,5],ymm13[6],ymm15[7]
+; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm8, %ymm13
; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm12, %ymm13, %ymm12
; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa 608(%rdi), %ymm13
@@ -9881,67 +9850,67 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FCP-NEXT: vmovdqa 576(%rdi), %ymm12
; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7]
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm12, %xmm13
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm12, %xmm3
-; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm14 = xmm3[2,2,2,2,4,5,6,7]
+; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm12, %xmm13
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm12, %xmm6
+; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm14 = xmm6[2,2,2,2,4,5,6,7]
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm13[0],xmm14[1],xmm13[2,3],xmm14[4],xmm13[5,6,7]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX2-FCP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm13 = mem[0],ymm13[1],mem[2,3,4,5],ymm13[6],mem[7]
-; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm13, %ymm2
-; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm14, %ymm2, %ymm2
-; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
-; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm11, %xmm11
-; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm7, %xmm7
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm7[0],xmm11[1],xmm7[2,3],xmm11[4],xmm7[5,6,7]
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31]
-; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm10, %ymm10
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX2-FCP-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm13 = ymm15[0],mem[1],ymm15[2,3,4,5],mem[6],ymm15[7]
+; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm13, %ymm3
+; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm14, %ymm3, %ymm3
+; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
+; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm11, %xmm11
+; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm5
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm5[0],xmm11[1],xmm5[2,3],xmm11[4],xmm5[5,6,7]
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31]
+; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm10, %ymm10
; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm11, %ymm10, %ymm10
; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm5
-; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3],xmm5[4],xmm1[5,6,7]
-; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm4
-; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm1
+; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm4, %xmm4
+; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3],xmm4[4],xmm1[5,6,7]
+; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm2
+; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1
; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm9, %xmm1
-; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm6, %xmm4
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1],xmm4[2,3],xmm1[4],xmm4[5,6,7]
-; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm8, %ymm4
-; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm1
+; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm1
+; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm2
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6,7]
+; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm8, %ymm2
+; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1
; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm13, %ymm1
-; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm3
-; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm12, %xmm2
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7]
+; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm13, %ymm1
+; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm2
+; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm12, %xmm3
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6,7]
; AVX2-FCP-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm0
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm12 = [6,5,4,7]
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm7 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm0
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,1,0,3]
-; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FCP-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm8 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7]
+; AVX2-FCP-NEXT: vpermd %ymm8, %ymm12, %ymm0
+; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15]
-; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm7, %xmm0
-; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm2
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7]
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
+; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm8, %xmm2
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7]
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload
; AVX2-FCP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3,4,5],ymm2[6],mem[7]
; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm2
+; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm2
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7]
; AVX2-FCP-NEXT: vmovdqa 544(%rdi), %ymm0
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa 512(%rdi), %ymm3
; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7]
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5]
-; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm11, %xmm3
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm11, %xmm4
+; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm13, %xmm3
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm13, %xmm4
; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm4
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3],xmm4[4,5],xmm3[6],xmm4[7]
@@ -9950,27 +9919,26 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm6 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7]
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm2
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,1,0,3]
-; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm6, %xmm2
-; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm3, %xmm3
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7]
-; AVX2-FCP-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm3 = ymm15[0],mem[1],ymm15[2,3,4,5],mem[6],ymm15[7]
+; AVX2-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm7 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm7 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7]
+; AVX2-FCP-NEXT: vpermd %ymm7, %ymm12, %ymm2
+; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm2
+; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm7, %xmm3
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3],xmm2[4,5],xmm3[6,7]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FCP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3,4,5],ymm3[6],mem[7]
; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm3
-; AVX2-FCP-NEXT: vmovdqa %ymm5, %ymm15
+; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm3
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm3
; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm4
; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
-; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm3
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm8, %xmm4
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
+; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm6, %xmm3
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm4
; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm4
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3],xmm4[4,5],xmm3[6],xmm4[7]
@@ -9981,17 +9949,16 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm5 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7]
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm2
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,1,0,3]
-; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm2
-; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm3, %xmm3
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FCP-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4,5],mem[6],ymm3[7]
+; AVX2-FCP-NEXT: vpermd %ymm5, %ymm12, %ymm2
+; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm2
+; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm3
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3],xmm2[4,5],xmm3[6,7]
+; AVX2-FCP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm3 = mem[0],ymm15[1],mem[2,3,4,5],ymm15[6],mem[7]
; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm3, %ymm3
+; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm3
+; AVX2-FCP-NEXT: vmovdqa %ymm9, %ymm15
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
; AVX2-FCP-NEXT: vmovdqa 736(%rdi), %ymm3
; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -10012,70 +9979,69 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm3
; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
-; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm9
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm13
-; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm13, %xmm0
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0,1,2],xmm9[3],xmm0[4,5],xmm9[6],xmm0[7]
+; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm10
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm14
+; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm14, %xmm0
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0,1,2],xmm10[3],xmm0[4,5],xmm10[6],xmm0[7]
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm2 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7]
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm10
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm14 = xmm10[2,1,0,3]
+; AVX2-FCP-NEXT: vpermd %ymm2, %ymm12, %ymm10
; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm12
-; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm14, %xmm1
+; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm1
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm1[2],xmm12[3],xmm1[4,5],xmm12[6,7]
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm10 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7]
-; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm10, %ymm15
+; AVX2-FCP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm9 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7]
+; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm9, %ymm15
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm15[3,4,5,6,7]
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm12[0,1,2],ymm9[3,4,5,6,7],ymm12[8,9,10],ymm9[11,12,13,14,15]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm9[4,5,6,7]
+; AVX2-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1,2],ymm11[3,4,5,6,7],ymm12[8,9,10],ymm11[11,12,13,14,15]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm11[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm12 = [6,7,2,3,12,13,14,15,6,7,2,3,12,13,14,15]
-; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm7, %xmm7
-; AVX2-FCP-NEXT: vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
-; AVX2-FCP-NEXT: # xmm9 = mem[1,1,1,1,4,5,6,7]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm15 = xmm7[0,1],xmm9[2],xmm7[3,4],xmm9[5],xmm7[6],xmm9[7]
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
+; AVX2-FCP-NEXT: vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
+; AVX2-FCP-NEXT: # xmm12 = mem[1,1,1,1,4,5,6,7]
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [6,7,2,3,12,13,14,15,8,9,10,11,12,13,14,15]
+; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm8
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm15 = xmm8[0,1],xmm12[2],xmm8[3,4],xmm12[5],xmm8[6],xmm12[7]
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm0, %ymm0
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [2,3,6,7,4,5,0,1,10,11,14,15,12,13,14,15]
-; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm15, %xmm15
+; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm0
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [2,3,6,7,4,5,0,1,10,11,14,15,12,13,14,15]
+; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm15, %xmm15
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3,4,5,6,7]
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm1, %xmm1
-; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm11[3],xmm1[4,5],xmm11[6],xmm1[7]
+; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,5,5,5,5]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm13[3],xmm1[4,5],xmm13[6],xmm1[7]
; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm6, %xmm0
-; AVX2-FCP-NEXT: vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; AVX2-FCP-NEXT: # xmm1 = mem[1,1,1,1,4,5,6,7]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6],xmm1[7]
+; AVX2-FCP-NEXT: vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX2-FCP-NEXT: # xmm0 = mem[1,1,1,1,4,5,6,7]
+; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm7, %xmm1
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6],xmm0[7]
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm1
-; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm0
+; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm1, %ymm1
+; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm0
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm1, %xmm1
-; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm8[0,1,2,3,5,5,5,5]
+; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5]
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm6[3],xmm1[4,5],xmm6[6],xmm1[7]
; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm5, %xmm0
-; AVX2-FCP-NEXT: vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; AVX2-FCP-NEXT: # xmm1 = mem[1,1,1,1,4,5,6,7]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6],xmm1[7]
+; AVX2-FCP-NEXT: vpshuflw $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX2-FCP-NEXT: # xmm0 = mem[1,1,1,1,4,5,6,7]
+; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm5, %xmm1
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6],xmm0[7]
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm1
-; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm0
+; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm1, %ymm1
+; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm0
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm1, %xmm1
@@ -10085,310 +10051,317 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm13, %xmm0
+; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm14, %xmm0
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,5,5,5,5]
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
-; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm10, %ymm1
-; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm2
-; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm14[1,1,1,1,4,5,6,7]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6],xmm3[7]
-; AVX2-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm2
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
+; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm1
+; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[1,1,1,1,4,5,6,7]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6],xmm2[7]
+; AVX2-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm1
+; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm9, %ymm2
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm3 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
-; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm11 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,0,3]
-; AVX2-FCP-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,2,1]
+; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,5,6,5]
+; AVX2-FCP-NEXT: vpermd %ymm0, %ymm2, %ymm1
+; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm6
+; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,1,0,3]
; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u]
-; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm0
-; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,6,5,6,4]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[0,3,2,1]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[2,1,2,3]
-; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm10 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0]
-; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm9, %xmm1
-; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,1,2,0,4,5,6,7]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7]
+; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,6,5,6,4]
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,0,1,0,1,0,1,8,9,8,9,12,13,14,15]
+; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm1
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6],xmm0[7]
; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
-; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm2
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
-; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm3 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7]
-; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm12 = [20,21,16,17,28,29,0,0,20,21,16,17,28,29,0,0,20,21,16,17,28,29,0,0,20,21,16,17,28,29,0,0]
+; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm11, %ymm1
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,1,0,3]
-; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,2,1]
-; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm0
-; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,6,5,6,4]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
-; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,2,3]
-; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm1
-; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[2,1,2,0,4,5,6,7]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7]
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm2
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
-; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [4,7,6,5]
+; AVX2-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm2
+; AVX2-FCP-NEXT: vmovdqa %ymm3, %ymm4
+; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
+; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13]
+; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm2
+; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[2,1,2,0,4,5,6,7]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3],xmm2[4,5,6,7]
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9]
+; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm11, %xmm3
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm13 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,1,0,3]
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,1,2,1]
-; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm8, %xmm0
-; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,6,5,6,4]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm1 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,3,2,1]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
-; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm3
-; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[2,1,2,0,4,5,6,7]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1,2],xmm5[3],xmm3[4,5,6,7]
+; AVX2-FCP-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm5 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm2 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
+; AVX2-FCP-NEXT: vpermd %ymm2, %ymm6, %ymm0
+; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,1,0,3]
+; AVX2-FCP-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill
+; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,6,5,6,4]
+; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm1, %xmm3
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7]
; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm13, %ymm5
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7],ymm5[8,9,10],ymm2[11,12,13,14,15]
-; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,5,4]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7]
+; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm5, %ymm3
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm3 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
+; AVX2-FCP-NEXT: vpermd %ymm3, %ymm4, %ymm0
+; AVX2-FCP-NEXT: vmovdqa %ymm4, %ymm8
+; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[2,1,2,3]
+; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm3
+; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[2,1,2,0,4,5,6,7]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2],xmm4[3],xmm3[4,5,6,7]
+; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm5, %xmm4
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm4[5,6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm9 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm2 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
+; AVX2-FCP-NEXT: vmovdqa %ymm6, %ymm1
+; AVX2-FCP-NEXT: vpermd %ymm2, %ymm6, %ymm6
; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm14 = xmm2[2,1,0,3]
-; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm14, %xmm3
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[0,1,2,1]
-; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,6,5,6,4]
+; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,6,5,6,4]
+; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm14, %xmm3
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5,6],xmm2[7]
+; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm9, %ymm3
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15]
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm15 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
+; AVX2-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm3 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
+; AVX2-FCP-NEXT: vpermd %ymm3, %ymm8, %ymm2
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3]
+; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm8
+; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[2,1,2,0,4,5,6,7]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0],xmm8[1,2],xmm10[3],xmm8[4,5,6,7]
+; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm9, %xmm10
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4],xmm10[5,6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm3 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,3,2,1]
-; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm0
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm3[2,1,2,3]
-; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[2,1,2,0,4,5,6,7]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2],xmm3[3],xmm0[4,5,6,7]
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm15, %ymm3
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15]
-; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,4]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm4 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
+; AVX2-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm0
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u]
-; AVX2-FCP-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-FCP-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
-; AVX2-FCP-NEXT: # xmm3 = mem[0,1,2,3,7,5,6,5]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5,6],xmm3[7]
-; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0]
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm9
-; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm7[0],xmm9[1,2],xmm7[3],xmm9[4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm11, %ymm11
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7],ymm11[8,9,10],ymm0[11,12,13,14,15]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,3,2]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm11[5,6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[2,1,0,3]
+; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm1, %xmm4
+; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm13 = xmm0[0,1,2,3,6,5,6,4]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm13[4],xmm4[5,6],xmm13[7]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm13 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
+; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm13, %ymm12
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm12[0,1,2],ymm4[3,4,5,6,7],ymm12[8,9,10],ymm4[11,12,13,14,15]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
+; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,7,6,5]
+; AVX2-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm12
+; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm12, %xmm7
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,1,2,3]
+; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[2,1,2,0,4,5,6,7]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm7[1,2],xmm0[3],xmm7[4,5,6,7]
+; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm13, %xmm7
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm7[5,6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm8, %xmm0
-; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,5]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4],xmm0[5,6],xmm6[7]
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm4, %xmm4
-; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,1,4,5,6,7]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2],xmm1[3],xmm4[4,5,6,7]
-; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm13, %ymm4
+; AVX2-FCP-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX2-FCP-NEXT: # xmm0 = mem[0,1,2,3,7,5,6,5]
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [2,3,2,3,2,3,2,3,8,9,10,11,14,15,14,15]
+; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm4
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4],xmm4[5,6],xmm0[7]
; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm15 = [22,23,18,19,30,31,0,0,22,23,18,19,30,31,0,0,22,23,18,19,30,31,0,0,22,23,18,19,30,31,0,0]
+; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm11, %ymm4
; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7],ymm4[8,9,10],ymm0[11,12,13,14,15]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,3,2]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm4[5,6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm1
-; AVX2-FCP-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
-; AVX2-FCP-NEXT: # xmm4 = mem[0,1,2,3,7,5,6,5]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5,6],xmm4[7]
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm1
+; AVX2-FCP-NEXT: vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
+; AVX2-FCP-NEXT: # xmm10 = mem[3,1,2,1,4,5,6,7]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm1[1,2],xmm10[3],xmm1[4,5,6,7]
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11]
+; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm11, %xmm11
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm11[5,6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,7,5,6,5]
+; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm14, %xmm6
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1,2,3],xmm0[4],xmm6[5,6],xmm0[7]
+; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm9, %ymm6
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7],ymm6[8,9,10],ymm0[11,12,13,14,15]
+; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,1,4,5,6,7]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3],xmm2[4,5,6,7]
+; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm9, %xmm3
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
+; AVX2-FCP-NEXT: # xmm0 = mem[0,1,2,3,7,5,6,5]
+; AVX2-FCP-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload
+; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm2
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4],xmm2[5,6],xmm0[7]
+; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm5, %ymm2
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-FCP-NEXT: vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
+; AVX2-FCP-NEXT: # xmm3 = mem[3,1,2,1,4,5,6,7]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3],xmm2[4,5,6,7]
+; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm3
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm4
-; AVX2-FCP-NEXT: vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
-; AVX2-FCP-NEXT: # xmm6 = mem[3,1,2,1,4,5,6,7]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0],xmm4[1,2],xmm6[3],xmm4[4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm0, %ymm6
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7],ymm6[8,9,10],ymm1[11,12,13,14,15]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,2]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm6[5,6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm14, %xmm1
-; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,7,5,6,5]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7]
-; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm15, %ymm2
-; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm3
-; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[3,1,2,1,4,5,6,7]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2],xmm4[3],xmm3[4,5,6,7]
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5,6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm0
+; AVX2-FCP-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; AVX2-FCP-NEXT: # xmm2 = mem[0,1,2,3,7,5,6,5]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5,6],xmm2[7]
+; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FCP-NEXT: vpshufb %ymm15, %ymm13, %ymm2
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
+; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm13, %xmm1
+; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm12, %xmm2
+; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm8[3,1,2,1,4,5,6,7]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2],xmm3[3],xmm2[4,5,6,7]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[0,3,2,1]
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13]
-; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm1
-; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm4
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4],xmm4[5],xmm1[6,7]
+; AVX2-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm7
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,3,2,1]
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13]
+; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm7, %xmm1
+; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm2
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4],xmm2[5],xmm1[6,7]
; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm4 = mem[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm11
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[0,3,2,1]
-; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm11, %xmm1
-; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm12, %xmm5
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4],xmm5[5],xmm1[6,7]
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm5 = mem[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm13
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm14 = xmm1[0,3,2,1]
-; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm13, %xmm1
-; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm14, %xmm10
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0,1,2,3],xmm1[4],xmm10[5],xmm1[6,7]
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm10 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm10, %xmm15
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[0,3,2,1]
-; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm15, %xmm10
-; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm7
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm10[4],xmm7[5],xmm10[6,7]
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm10 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm10 = mem[0,1,2,3,4],ymm7[5,6,7]
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15]
-; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm11, %xmm11
-; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm12, %xmm12
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3],xmm11[4],xmm12[5],xmm11[6,7]
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm11 = mem[0,1,2,3,4],ymm11[5,6,7]
-; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm2, %xmm2
-; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm6
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1,2,3],xmm2[4],xmm6[5],xmm2[6,7]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6],ymm1[7]
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm6
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
+; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm6, %xmm2
+; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm5
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4],xmm5[5],xmm2[6,7]
; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7]
-; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm13, %xmm6
-; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm14, %xmm12
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0,1,2,3],xmm6[4],xmm12[5],xmm6[6,7]
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm6 = mem[0,1,2,3,4],ymm6[5,6,7]
-; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm15, %xmm12
-; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm0
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm12[4],xmm0[5],xmm12[6,7]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm5 = mem[0],ymm5[1],mem[2,3],ymm5[4],mem[5,6],ymm5[7]
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm10
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[0,3,2,1]
+; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm10, %xmm5
+; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm9
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm9[0,1,2,3],xmm5[4],xmm9[5],xmm5[6,7]
+; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm5 = mem[0,1,2,3,4],ymm5[5,6,7]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm9 = mem[0],ymm9[1],mem[2,3],ymm9[4],mem[5,6],ymm9[7]
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm9, %xmm13
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm14 = xmm9[0,3,2,1]
+; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm13, %xmm9
+; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm14, %xmm0
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm9[4],xmm0[5],xmm9[6,7]
+; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm9 = mem[0,1,2,3,4],ymm0[5,6,7]
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15]
+; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm6, %xmm0
+; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm1, %xmm1
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5],xmm0[6,7]
+; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm6 = mem[0,1,2,3,4],ymm0[5,6,7]
+; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm7, %xmm0
+; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm8, %xmm1
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5],xmm0[6,7]
; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7]
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FCP-NEXT: vmovaps %ymm7, 96(%rsi)
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FCP-NEXT: vmovaps %ymm7, 32(%rsi)
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FCP-NEXT: vmovaps %ymm7, 64(%rsi)
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FCP-NEXT: vmovaps %ymm7, (%rsi)
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FCP-NEXT: vmovaps %ymm7, 96(%rdx)
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FCP-NEXT: vmovaps %ymm7, 32(%rdx)
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FCP-NEXT: vmovaps %ymm7, 64(%rdx)
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FCP-NEXT: vmovaps %ymm7, (%rdx)
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FCP-NEXT: vmovaps %ymm7, 32(%rcx)
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FCP-NEXT: vmovaps %ymm7, 96(%rcx)
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FCP-NEXT: vmovaps %ymm7, 64(%rcx)
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FCP-NEXT: vmovaps %ymm7, (%rcx)
-; AVX2-FCP-NEXT: vmovdqa %ymm8, 96(%r8)
-; AVX2-FCP-NEXT: vmovdqa %ymm9, 32(%r8)
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FCP-NEXT: vmovaps %ymm7, 64(%r8)
+; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm10, %xmm1
+; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm12, %xmm7
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2,3],xmm1[4],xmm7[5],xmm1[6,7]
+; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm1 = mem[0,1,2,3,4],ymm1[5,6,7]
+; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm13, %xmm7
+; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm14, %xmm8
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3],xmm7[4],xmm8[5],xmm7[6,7]
+; AVX2-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX2-FCP-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm7 = mem[0,1,2,3,4],ymm7[5,6,7]
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FCP-NEXT: vmovaps %ymm8, 96(%rsi)
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FCP-NEXT: vmovaps %ymm8, 32(%rsi)
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FCP-NEXT: vmovaps %ymm8, 64(%rsi)
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FCP-NEXT: vmovaps %ymm8, (%rsi)
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FCP-NEXT: vmovaps %ymm8, 96(%rdx)
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FCP-NEXT: vmovaps %ymm8, 32(%rdx)
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FCP-NEXT: vmovaps %ymm8, 64(%rdx)
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FCP-NEXT: vmovaps %ymm8, (%rdx)
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FCP-NEXT: vmovaps %ymm8, 32(%rcx)
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FCP-NEXT: vmovaps %ymm8, 96(%rcx)
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FCP-NEXT: vmovaps %ymm8, 64(%rcx)
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FCP-NEXT: vmovaps %ymm8, (%rcx)
+; AVX2-FCP-NEXT: vmovdqa %ymm11, 96(%r8)
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FCP-NEXT: vmovaps %ymm8, 32(%r8)
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FCP-NEXT: vmovaps %ymm8, 64(%r8)
; AVX2-FCP-NEXT: vmovdqa %ymm3, (%r8)
-; AVX2-FCP-NEXT: vmovdqa %ymm10, 96(%r9)
-; AVX2-FCP-NEXT: vmovdqa %ymm1, 32(%r9)
-; AVX2-FCP-NEXT: vmovdqa %ymm5, (%r9)
+; AVX2-FCP-NEXT: vmovdqa %ymm9, 96(%r9)
+; AVX2-FCP-NEXT: vmovdqa %ymm5, 32(%r9)
+; AVX2-FCP-NEXT: vmovdqa %ymm2, (%r9)
; AVX2-FCP-NEXT: vmovdqa %ymm4, 64(%r9)
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-FCP-NEXT: vmovdqa %ymm0, 96(%rax)
-; AVX2-FCP-NEXT: vmovdqa %ymm6, 32(%rax)
-; AVX2-FCP-NEXT: vmovdqa %ymm2, 64(%rax)
-; AVX2-FCP-NEXT: vmovdqa %ymm11, (%rax)
+; AVX2-FCP-NEXT: vmovdqa %ymm7, 96(%rax)
+; AVX2-FCP-NEXT: vmovdqa %ymm1, 32(%rax)
+; AVX2-FCP-NEXT: vmovdqa %ymm0, 64(%rax)
+; AVX2-FCP-NEXT: vmovdqa %ymm6, (%rax)
; AVX2-FCP-NEXT: addq $1304, %rsp # imm = 0x518
; AVX2-FCP-NEXT: vzeroupper
; AVX2-FCP-NEXT: retq
@@ -10396,7 +10369,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-LABEL: load_i16_stride6_vf64:
; AVX512: # %bb.0:
; AVX512-NEXT: subq $1480, %rsp # imm = 0x5C8
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,12,13,12,13,4,5,8,9,10,11,12,13,14,15]
; AVX512-NEXT: vmovdqa 608(%rdi), %ymm0
; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT: vmovdqa 576(%rdi), %ymm1
@@ -10408,17 +10381,18 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm20[0,2,0,3]
; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5]
; AVX512-NEXT: vmovdqa 544(%rdi), %ymm1
; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT: vmovdqa 512(%rdi), %ymm2
; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
-; AVX512-NEXT: vextracti128 $1, %ymm13, %xmm2
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm13[2,2,2,2,4,5,6,7]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7]
-; AVX512-NEXT: vmovdqa64 %xmm2, %xmm21
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,4,5,4,5,6,7,0,1,12,13,8,9,4,5]
-; AVX512-NEXT: vpshufb %xmm10, %xmm1, %xmm1
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
+; AVX512-NEXT: vpshufb %xmm10, %xmm2, %xmm1
+; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512-NEXT: vmovdqa64 %ymm2, %ymm21
+; AVX512-NEXT: vpshufb %xmm10, %xmm3, %xmm2
+; AVX512-NEXT: vmovdqa64 %xmm3, %xmm22
+; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7]
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -10427,10 +10401,10 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT: vmovdqa 384(%rdi), %ymm2
; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7]
-; AVX512-NEXT: vpshufb %xmm9, %xmm12, %xmm1
-; AVX512-NEXT: vextracti32x4 $1, %ymm12, %xmm22
-; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm22[0,2,0,3]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm14 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7]
+; AVX512-NEXT: vpshufb %xmm9, %xmm14, %xmm1
+; AVX512-NEXT: vextracti32x4 $1, %ymm14, %xmm23
+; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm23[0,2,0,3]
; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7]
; AVX512-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],mem[2,3]
@@ -10438,55 +10412,56 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vinserti128 $1, 480(%rdi), %ymm0, %ymm0
; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3,4,5],ymm2[6],ymm0[7]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
-; AVX512-NEXT: vpshufb %ymm5, %ymm2, %ymm0
-; AVX512-NEXT: vmovdqa64 %ymm2, %ymm23
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
+; AVX512-NEXT: vpshufb %ymm4, %ymm2, %ymm0
+; AVX512-NEXT: vmovdqa64 %ymm2, %ymm28
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa 640(%rdi), %ymm0
-; AVX512-NEXT: vmovdqa 736(%rdi), %ymm1
+; AVX512-NEXT: vmovdqa 736(%rdi), %ymm0
+; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vmovdqa 704(%rdi), %ymm1
; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vmovdqa 704(%rdi), %ymm2
-; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
-; AVX512-NEXT: vextracti128 $1, %ymm11, %xmm2
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm11[2,2,2,2,4,5,6,7]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7]
-; AVX512-NEXT: vmovdqa64 %xmm2, %xmm28
-; AVX512-NEXT: vpshufb %xmm10, %xmm1, %xmm1
-; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],mem[2,3]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
+; AVX512-NEXT: vpshufb %xmm10, %xmm13, %xmm0
+; AVX512-NEXT: vextracti128 $1, %ymm13, %xmm2
+; AVX512-NEXT: vpshufb %xmm10, %xmm2, %xmm1
+; AVX512-NEXT: vmovdqa64 %xmm2, %xmm29
+; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7]
+; AVX512-NEXT: vmovdqa 640(%rdi), %ymm1
+; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3],mem[2,3]
; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vinserti128 $1, 672(%rdi), %ymm0, %ymm0
-; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3,4,5],ymm2[6],ymm0[7]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u]
-; AVX512-NEXT: vpshufb %ymm3, %ymm2, %ymm0
-; AVX512-NEXT: vmovdqa64 %ymm3, %ymm17
-; AVX512-NEXT: vmovdqa64 %ymm2, %ymm29
-; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-NEXT: vinserti128 $1, 672(%rdi), %ymm1, %ymm1
+; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0],ymm2[1],ymm1[2,3,4,5],ymm2[6],ymm1[7]
+; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm2 = [16,17,28,29,24,25,0,0,16,17,28,29,24,25,0,0,16,17,28,29,24,25,0,0,16,17,28,29,24,25,0,0]
+; AVX512-NEXT: vpshufb %ymm2, %ymm12, %ymm1
+; AVX512-NEXT: vmovdqa64 %ymm2, %ymm17
+; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5]
+; AVX512-NEXT: vpshufb %xmm2, %xmm12, %xmm1
+; AVX512-NEXT: vmovdqa64 %xmm2, %xmm18
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT: vmovdqa 224(%rdi), %ymm0
; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT: vmovdqa 192(%rdi), %ymm1
; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
-; AVX512-NEXT: vextracti128 $1, %ymm14, %xmm15
-; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[0,2,0,3]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
+; AVX512-NEXT: vextracti128 $1, %ymm15, %xmm8
+; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[0,2,0,3]
; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; AVX512-NEXT: vpshufb %xmm9, %xmm14, %xmm1
+; AVX512-NEXT: vpshufb %xmm9, %xmm15, %xmm1
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7]
; AVX512-NEXT: vmovdqa 160(%rdi), %ymm1
; AVX512-NEXT: vmovdqa 128(%rdi), %ymm2
; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
; AVX512-NEXT: vmovdqa64 %ymm1, %ymm30
-; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm7
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[2,2,2,2,4,5,6,7]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2],xmm1[3,4],xmm7[5,6,7]
-; AVX512-NEXT: vpshufb %xmm10, %xmm1, %xmm1
+; AVX512-NEXT: vpshufb %xmm10, %xmm5, %xmm1
+; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm7
+; AVX512-NEXT: vpshufb %xmm10, %xmm7, %xmm2
+; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7]
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -10496,99 +10471,101 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
; AVX512-NEXT: vpshufb %xmm9, %xmm3, %xmm0
-; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm9
-; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[0,2,0,3]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7]
-; AVX512-NEXT: vmovdqa 64(%rdi), %ymm2
-; AVX512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],mem[2,3]
-; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vinserti128 $1, 96(%rdi), %ymm2, %ymm2
+; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm11
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[0,2,0,3]
+; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7]
+; AVX512-NEXT: vmovdqa 64(%rdi), %ymm1
+; AVX512-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3],mem[2,3]
; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0],ymm1[1],ymm2[2,3,4,5],ymm1[6],ymm2[7]
-; AVX512-NEXT: vpshufb %ymm5, %ymm8, %ymm2
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
+; AVX512-NEXT: vinserti128 $1, 96(%rdi), %ymm1, %ymm1
+; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0],ymm2[1],ymm1[2,3,4,5],ymm2[6],ymm1[7]
+; AVX512-NEXT: vpshufb %ymm4, %ymm6, %ymm1
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa 352(%rdi), %ymm0
; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT: vmovdqa 320(%rdi), %ymm1
; AVX512-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill
-; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
-; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm6
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[2,2,2,2,4,5,6,7]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3,4],xmm6[5,6,7]
-; AVX512-NEXT: vpshufb %xmm10, %xmm2, %xmm10
-; AVX512-NEXT: vmovdqa 256(%rdi), %ymm2
-; AVX512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],mem[2,3]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
+; AVX512-NEXT: vpshufb %xmm10, %xmm2, %xmm0
+; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm4
+; AVX512-NEXT: vpshufb %xmm10, %xmm4, %xmm10
+; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm0[3],xmm10[4,5],xmm0[6],xmm10[7]
+; AVX512-NEXT: vmovdqa 256(%rdi), %ymm0
+; AVX512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],mem[2,3]
; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vinserti128 $1, 288(%rdi), %ymm2, %ymm0
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7]
+; AVX512-NEXT: vinserti128 $1, 288(%rdi), %ymm0, %ymm0
+; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7]
; AVX512-NEXT: vmovdqa64 %ymm0, %ymm31
; AVX512-NEXT: vmovdqa64 %ymm17, %ymm0
-; AVX512-NEXT: vpshufb %ymm0, %ymm2, %ymm0
+; AVX512-NEXT: vpshufb %ymm0, %ymm9, %ymm0
; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15]
+; AVX512-NEXT: vmovdqa64 %xmm18, %xmm1
+; AVX512-NEXT: vpshufb %xmm1, %xmm9, %xmm10
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT: vmovdqa {{.*#+}} xmm10 = [2,3,14,15,10,11,6,7,2,3,14,15,12,13,14,15]
-; AVX512-NEXT: vpshufb %xmm10, %xmm15, %xmm0
-; AVX512-NEXT: vpshufb %xmm10, %xmm14, %xmm14
-; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1],xmm0[2],xmm14[3],xmm0[4,5],xmm14[6,7]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm14 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
-; AVX512-NEXT: vpshufb %xmm14, %xmm7, %xmm7
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3],xmm7[4,5],xmm4[6],xmm7[7]
-; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm0
-; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vpshufb %xmm10, %xmm9, %xmm0
+; AVX512-NEXT: vpshufb %xmm10, %xmm8, %xmm0
+; AVX512-NEXT: vpshufb %xmm10, %xmm15, %xmm8
+; AVX512-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm0[2],xmm8[3],xmm0[4,5],xmm8[6,7]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,2,3,4,5,6,7,2,3,14,15,12,13,6,7]
+; AVX512-NEXT: vpshufb %xmm0, %xmm7, %xmm7
+; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3],xmm7[4,5],xmm5[6],xmm7[7]
+; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512-NEXT: vinserti32x4 $2, %xmm8, %zmm5, %zmm5
+; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vpshufb %xmm10, %xmm11, %xmm5
; AVX512-NEXT: vpshufb %xmm10, %xmm3, %xmm3
-; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3],xmm0[4,5],xmm3[6,7]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
-; AVX512-NEXT: vpshufb %ymm3, %ymm8, %ymm4
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7]
-; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vpshufb %xmm14, %xmm6, %xmm0
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,5,5,5,5]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23]
-; AVX512-NEXT: vpshufb %ymm1, %ymm2, %ymm2
-; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT: vmovdqa64 %ymm0, %ymm27
-; AVX512-NEXT: vmovdqa64 %xmm20, %xmm0
-; AVX512-NEXT: vpshufb %xmm10, %xmm0, %xmm0
+; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2],xmm3[3],xmm5[4,5],xmm3[6,7]
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
+; AVX512-NEXT: vpshufb %ymm5, %ymm6, %ymm6
+; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3,4,5,6,7]
+; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vpshufb %xmm0, %xmm4, %xmm3
+; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5],xmm2[6],xmm3[7]
+; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm3 = [18,19,30,31,26,27,0,0,18,19,30,31,26,27,0,0,18,19,30,31,26,27,0,0,18,19,30,31,26,27,0,0]
+; AVX512-NEXT: vpshufb %ymm3, %ymm9, %ymm4
+; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7],ymm4[8,9,10],ymm2[11,12,13,14,15]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7]
+; AVX512-NEXT: vpshufb %xmm4, %xmm9, %xmm1
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-NEXT: vmovdqa64 %ymm1, %ymm27
+; AVX512-NEXT: vmovdqa64 %xmm20, %xmm1
+; AVX512-NEXT: vpshufb %xmm10, %xmm1, %xmm1
; AVX512-NEXT: vmovdqa64 %ymm16, %ymm2
; AVX512-NEXT: vpshufb %xmm10, %xmm2, %xmm2
-; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7]
-; AVX512-NEXT: vmovdqa64 %xmm21, %xmm2
-; AVX512-NEXT: vpshufb %xmm14, %xmm2, %xmm2
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm13[0,1,2,3,5,5,5,5]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3],xmm2[4,5],xmm4[6],xmm2[7]
-; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0
-; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa64 %ymm23, %ymm0
-; AVX512-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4,5],xmm2[6,7]
; AVX512-NEXT: vmovdqa64 %xmm22, %xmm2
+; AVX512-NEXT: vpshufb %xmm0, %xmm2, %xmm2
+; AVX512-NEXT: vmovdqa64 %ymm21, %ymm6
+; AVX512-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm6[3],xmm2[4,5],xmm6[6],xmm2[7]
+; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1
+; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 %ymm28, %ymm1
+; AVX512-NEXT: vpshufb %ymm5, %ymm1, %ymm1
+; AVX512-NEXT: vmovdqa64 %xmm23, %xmm2
; AVX512-NEXT: vpshufb %xmm10, %xmm2, %xmm2
-; AVX512-NEXT: vpshufb %xmm10, %xmm12, %xmm3
-; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3],xmm2[4,5],xmm3[6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
-; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa64 %ymm29, %ymm0
-; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vmovdqa64 %xmm28, %xmm1
-; AVX512-NEXT: vpshufb %xmm14, %xmm1, %xmm1
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,5,5,5,5]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7]
-; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-NEXT: vpshufb %xmm10, %xmm14, %xmm5
+; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3],xmm2[4,5],xmm5[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
+; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 %xmm29, %xmm1
+; AVX512-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,5,5,5,5]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
+; AVX512-NEXT: vpshufb %ymm3, %ymm12, %ymm1
+; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
+; AVX512-NEXT: vpshufb %xmm4, %xmm12, %xmm1
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512-NEXT: vmovdqa64 %ymm0, %ymm26
; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX512-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
@@ -10640,21 +10617,23 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vpblendd $36, (%rsp), %ymm3, %ymm3 # 32-byte Folded Reload
; AVX512-NEXT: # ymm3 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7]
; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX512-NEXT: vpshufd {{.*#+}} xmm15 = xmm3[2,1,0,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,0,0,0,4,5,6,7]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm12 = xmm3[2,1,0,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[0,0,0,0,4,5,6,7]
; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[0,1,2,1]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm10[0,1,2,3,6,5,6,4]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm11 = xmm4[0,1,2,1]
+; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm11[0,1,2,3,6,5,6,4]
; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7]
; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512-NEXT: vmovdqa64 %ymm31, %ymm4
-; AVX512-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm13 # 32-byte Folded Reload
-; AVX512-NEXT: # ymm13 = ymm4[0,1],mem[2],ymm4[3],mem[4],ymm4[5,6],mem[7]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
-; AVX512-NEXT: vpshufb %ymm5, %ymm13, %ymm4
+; AVX512-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm8 # 32-byte Folded Reload
+; AVX512-NEXT: # ymm8 = ymm4[0,1],mem[2],ymm4[3],mem[4],ymm4[5,6],mem[7]
+; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm5 = [20,21,16,17,28,29,0,0,20,21,16,17,28,29,0,0,20,21,16,17,28,29,0,0,20,21,16,17,28,29,0,0]
+; AVX512-NEXT: vpshufb %ymm5, %ymm8, %ymm4
; AVX512-NEXT: vmovdqa64 %ymm5, %ymm24
; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,5,4]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9]
+; AVX512-NEXT: vpshufb %xmm5, %xmm8, %xmm4
+; AVX512-NEXT: vmovdqa64 %xmm5, %xmm25
; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
@@ -10666,21 +10645,21 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX512-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7]
; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[2,1,2,3]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[0,3,2,1]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm12[0,0,2,3,4,5,6,7]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm13 = xmm1[2,1,2,3]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm15 = xmm2[0,3,2,1]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[0,0,2,3,4,5,6,7]
; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm11[2,1,2,0,4,5,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[2,1,2,0,4,5,6,7]
; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7]
; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX512-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX512-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7]
; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[2,1,0,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[0,0,0,0,4,5,6,7]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm10 = xmm2[2,1,0,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[0,0,0,0,4,5,6,7]
; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[0,1,2,1]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,6,5,6,4]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[0,1,2,1]
+; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm9[0,1,2,3,6,5,6,4]
; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5,6],xmm3[7]
; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm23
@@ -10699,109 +10678,111 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: # ymm5 = ymm2[0,1],mem[2],ymm2[3],mem[4],ymm2[5,6],mem[7]
; AVX512-NEXT: vpshufb %ymm0, %ymm5, %ymm0
; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm0[5,6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX512-NEXT: # ymm1 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
-; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7]
+; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,0,3]
; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[0,0,0,0,4,5,6,7]
; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,2,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,2,1]
; AVX512-NEXT: vpshufhw {{.*#+}} xmm14 = xmm3[0,1,2,3,6,5,6,4]
; AVX512-NEXT: vpblendw {{.*#+}} xmm14 = xmm1[0,1,2,3],xmm14[4],xmm1[5,6],xmm14[7]
-; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX512-NEXT: # ymm1 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
-; AVX512-NEXT: vmovdqa64 %ymm24, %ymm0
-; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm0
+; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
+; AVX512-NEXT: # ymm2 = mem[0,1],ymm1[2],mem[3],ymm1[4],mem[5,6],ymm1[7]
+; AVX512-NEXT: vmovdqa64 %ymm24, %ymm1
+; AVX512-NEXT: vpshufb %ymm1, %ymm2, %ymm1
; AVX512-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0,1,2],ymm14[3,4,5,6,7],ymm0[8,9,10],ymm14[11,12,13,14,15]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7]
-; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm23 ^ (zmm29 & (zmm2 ^ zmm23))
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm22 & (zmm0 ^ zmm2))
-; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm14[3,4,5,6,7],ymm1[8,9,10],ymm14[11,12,13,14,15]
+; AVX512-NEXT: vmovdqa64 %xmm25, %xmm14
+; AVX512-NEXT: vpshufb %xmm14, %xmm2, %xmm14
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm23 ^ (zmm29 & (zmm0 ^ zmm23))
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm22 & (zmm1 ^ zmm0))
+; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %xmm21, %xmm0
; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7]
-; AVX512-NEXT: vmovdqa64 %xmm28, %xmm2
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7]
-; AVX512-NEXT: vmovdqa64 %xmm19, %xmm2
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5]
+; AVX512-NEXT: vmovdqa64 %xmm28, %xmm1
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,3,4,5,6,7]
+; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5,6,7]
+; AVX512-NEXT: vmovdqa64 %xmm19, %xmm1
+; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,5]
; AVX512-NEXT: vmovdqa64 %xmm20, %xmm14
; AVX512-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[1,1,1,1,4,5,6,7]
; AVX512-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,7,7]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0,1,2,3],xmm2[4],xmm14[5,6],xmm2[7]
-; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm28
+; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0,1,2,3],xmm1[4],xmm14[5,6],xmm1[7]
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm28
; AVX512-NEXT: vmovdqa64 %xmm16, %xmm0
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[3,1,2,1,4,5,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[3,1,2,1,4,5,6,7]
; AVX512-NEXT: vmovdqa64 %xmm18, %xmm0
; AVX512-NEXT: vpshuflw {{.*#+}} xmm14 = xmm0[0,1,3,3,4,5,6,7]
; AVX512-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,7,7,7]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm14[1,2],xmm2[3],xmm14[4,5,6,7]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm14[1,2],xmm1[3],xmm14[4,5,6,7]
; AVX512-NEXT: vmovdqa {{.*#+}} ymm14 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27]
; AVX512-NEXT: vmovdqa64 %ymm17, %ymm0
; AVX512-NEXT: vpshufb %ymm14, %ymm0, %ymm0
-; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm0[5,6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,7,5,6,5]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm15[1,1,1,1,4,5,6,7]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,7,7]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0,1,2,3],xmm2[4],xmm10[5,6],xmm2[7]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19]
-; AVX512-NEXT: vpshufb %ymm10, %ymm13, %ymm13
-; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm13[0,1,2],ymm2[3,4,5,6,7],ymm13[8,9,10],ymm2[11,12,13,14,15]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,7,4,5]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 32-byte Folded Reload
-; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 32-byte Folded Reload
-; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vinserti64x4 $1, %ymm27, %zmm0, %zmm13
-; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vinserti64x4 $1, %ymm26, %zmm0, %zmm13
-; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm25
+; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm0[5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,7,5,6,5]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm11 = xmm12[1,1,1,1,4,5,6,7]
+; AVX512-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,7,7]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm11[0,1,2,3],xmm1[4],xmm11[5,6],xmm1[7]
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm11 = [22,23,18,19,30,31,0,0,22,23,18,19,30,31,0,0,22,23,18,19,30,31,0,0,22,23,18,19,30,31,0,0]
+; AVX512-NEXT: vpshufb %ymm11, %ymm8, %ymm12
+; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0,1,2],ymm1[3,4,5,6,7],ymm12[8,9,10],ymm1[11,12,13,14,15]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm12 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11]
+; AVX512-NEXT: vpshufb %xmm12, %xmm8, %xmm8
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 32-byte Folded Reload
+; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 32-byte Folded Reload
+; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vinserti64x4 $1, %ymm27, %zmm0, %zmm8
+; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vinserti64x4 $1, %ymm26, %zmm0, %zmm8
+; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm25
; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm28 ^ (zmm29 & (zmm0 ^ zmm28))
; AVX512-NEXT: vpternlogq {{.*#+}} zmm25 = zmm25 ^ (zmm22 & (zmm25 ^ zmm0))
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[3,1,2,1,4,5,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[0,1,3,3,4,5,6,7]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,7,5,6,5]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm9[1,1,1,1,4,5,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm13[3,1,2,1,4,5,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[0,1,3,3,4,5,6,7]
+; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5,6,7]
+; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm9[0,1,2,3,7,5,6,5]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm10[1,1,1,1,4,5,6,7]
; AVX512-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,7,7]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0,1,2,3],xmm2[4],xmm8[5,6],xmm2[7]
-; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[3,1,2,1,4,5,6,7]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0,1,2,3],xmm1[4],xmm8[5,6],xmm1[7]
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[3,1,2,1,4,5,6,7]
; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[0,1,3,3,4,5,6,7]
; AVX512-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm6[1,2],xmm2[3],xmm6[4,5,6,7]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm6[1,2],xmm1[3],xmm6[4,5,6,7]
; AVX512-NEXT: vpshufb %ymm14, %ymm5, %ymm5
-; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm5[5,6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
-; AVX512-NEXT: vpshufb %ymm10, %ymm1, %ymm1
+; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,5]
; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,1,1,1,4,5,6,7]
; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7]
; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4],xmm4[5,6],xmm3[7]
+; AVX512-NEXT: vpshufb %ymm11, %ymm2, %ymm4
; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7],ymm1[8,9,10],ymm3[11,12,13,14,15]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
-; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm28
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm0 ^ (zmm29 & (zmm2 ^ zmm0))
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm28 = zmm28 ^ (zmm22 & (zmm28 ^ zmm2))
+; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15]
+; AVX512-NEXT: vpshufb %xmm12, %xmm2, %xmm2
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm28
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm29 & (zmm1 ^ zmm0))
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm28 = zmm28 ^ (zmm22 & (zmm28 ^ zmm1))
; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX512-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
; AVX512-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,2,3,0,1,12,13,8,9,12,13,12,13,14,15]
; AVX512-NEXT: vpshufb %xmm1, %xmm2, %xmm0
; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm4
; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[2,2,2,2,4,5,6,7]
@@ -10815,7 +10796,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[0,1,0,2,4,5,6,7]
; AVX512-NEXT: vmovdqa64 %xmm4, %xmm22
; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,6,6,6]
-; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm13 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,4,5,6,7,8,9,12,13,0,1,12,13]
; AVX512-NEXT: vpshufb %xmm13, %xmm5, %xmm4
; AVX512-NEXT: vmovdqa64 %xmm5, %xmm20
; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6,7]
@@ -10910,12 +10891,12 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm1[5,6,7]
; AVX512-NEXT: vmovdqa64 %ymm0, %ymm16
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [10,11,2,3,2,3,14,15,14,15,10,11,12,13,14,15]
; AVX512-NEXT: vpshufb %xmm1, %xmm2, %xmm2
; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm24[1,1,2,3]
; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5]
; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7]
-; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm6 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,6,7,10,11,10,11,2,3,14,15]
; AVX512-NEXT: vmovdqa64 %xmm20, %xmm0
; AVX512-NEXT: vpshufb %xmm6, %xmm0, %xmm3
; AVX512-NEXT: vmovdqa64 %xmm22, %xmm0
@@ -11022,30 +11003,31 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
;
; AVX512-FCP-LABEL: load_i16_stride6_vf64:
; AVX512-FCP: # %bb.0:
-; AVX512-FCP-NEXT: subq $1416, %rsp # imm = 0x588
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15]
+; AVX512-FCP-NEXT: subq $1800, %rsp # imm = 0x708
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm16 = [6,5,4,7]
; AVX512-FCP-NEXT: vmovdqa 608(%rdi), %ymm0
; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-FCP-NEXT: vmovdqa 576(%rdi), %ymm1
; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
-; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm0
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm1
-; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm20
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3]
-; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm1
-; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm21
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7]
+; AVX512-FCP-NEXT: vpermd %ymm2, %ymm16, %ymm1
+; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm0
+; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm20
+; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm1
+; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm21
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5]
; AVX512-FCP-NEXT: vmovdqa 544(%rdi), %ymm1
; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-FCP-NEXT: vmovdqa 512(%rdi), %ymm2
; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5]
-; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm13, %xmm1
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm3
-; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm2
-; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm22
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
+; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm1
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm22
+; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm2
+; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm23
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7]
; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
@@ -11055,31 +11037,30 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-FCP-NEXT: vmovdqa 384(%rdi), %ymm2
; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7]
-; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm12, %xmm1
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm2
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,1,0,3]
-; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm2
-; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm23
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3],xmm2[4,5],xmm1[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7]
+; AVX512-FCP-NEXT: vpermd %ymm15, %ymm16, %ymm2
+; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm1
+; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm24
+; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm15, %xmm2
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4,5],xmm2[6,7]
; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],mem[2,3]
-; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill
; AVX512-FCP-NEXT: vinserti128 $1, 480(%rdi), %ymm0, %ymm0
; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3,4,5],ymm2[6],ymm0[7]
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm0
-; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm24
+; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm25
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa 736(%rdi), %ymm0
; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-FCP-NEXT: vmovdqa 704(%rdi), %ymm1
; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
-; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm9, %xmm0
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm2
-; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm1
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
+; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm14, %xmm0
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm14, %xmm2
+; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm1
; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm26
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7]
; AVX512-FCP-NEXT: vmovdqa 640(%rdi), %ymm1
@@ -11088,33 +11069,33 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-FCP-NEXT: vinserti128 $1, 672(%rdi), %ymm1, %ymm1
; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3,4,5],ymm2[6],ymm1[7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u]
-; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm1
-; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm16
-; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm29
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0],ymm2[1],ymm1[2,3,4,5],ymm2[6],ymm1[7]
+; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [16,17,28,29,24,25,0,0,16,17,28,29,24,25,0,0,16,17,28,29,24,25,0,0,16,17,28,29,24,25,0,0]
+; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm13, %ymm1
+; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm17
; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5]
+; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm13, %xmm1
+; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm18
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %ymm0
; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm1
; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm14, %xmm0
-; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm14, %xmm1
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[2,1,0,3]
-; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm15, %xmm0
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
+; AVX512-FCP-NEXT: vpermd %ymm11, %ymm16, %ymm8
+; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm8, %xmm0
+; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm11, %xmm1
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7]
; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm1
; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm2
+; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm30
-; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm1
+; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm5, %xmm1
; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm7
-; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm7, %xmm2
+; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm7, %xmm2
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7]
; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
@@ -11124,516 +11105,511 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm0
-; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm2
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,1,0,3]
-; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm0
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7]
-; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm2
-; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],mem[2,3]
-; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vinserti128 $1, 96(%rdi), %ymm2, %ymm2
+; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm0
+; AVX512-FCP-NEXT: vpermd %ymm3, %ymm16, %ymm12
+; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm12, %xmm1
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7]
+; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm1
+; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3],mem[2,3]
; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0],ymm1[1],ymm2[2,3,4,5],ymm1[6],ymm2[7]
-; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm11, %ymm2
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
+; AVX512-FCP-NEXT: vinserti128 $1, 96(%rdi), %ymm1, %ymm1
+; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0],ymm2[1],ymm1[2,3,4,5],ymm2[6],ymm1[7]
+; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm1
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa 352(%rdi), %ymm0
-; AVX512-FCP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
+; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-FCP-NEXT: vmovdqa 320(%rdi), %ymm1
; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
-; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm2
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm6
-; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm6, %xmm10
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm2[3],xmm10[4,5],xmm2[6],xmm10[7]
-; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %ymm2
-; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],mem[2,3]
-; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vinserti128 $1, 288(%rdi), %ymm2, %ymm0
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm31
-; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm0
-; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm0
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
+; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm0
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4
+; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm4, %xmm9
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm0[3],xmm9[4,5],xmm0[6],xmm9[7]
+; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %ymm0
+; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],mem[2,3]
+; AVX512-FCP-NEXT: vinserti128 $1, 288(%rdi), %ymm0, %ymm0
; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [2,3,14,15,2,3,6,7,10,11,14,15,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm14, %xmm0
-; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm15, %xmm14
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm14[2],xmm0[3],xmm14[4,5],xmm0[6,7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
-; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm7, %xmm7
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm31
+; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm0
+; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm10, %ymm0
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm9[3,4,5,6,7],ymm0[8,9,10],ymm9[11,12,13,14,15]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm18, %xmm1
+; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm9
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [2,3,14,15,2,3,6,7,10,11,14,15,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm8, %xmm0
+; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm11, %xmm8
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm0[2],xmm8[3],xmm0[4,5],xmm8[6,7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,2,3,4,5,6,7,2,3,14,15,12,13,6,7]
+; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm7, %xmm7
; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5]
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3],xmm7[4,5],xmm5[6],xmm7[7]
; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm5, %zmm0
-; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm0
-; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm8, %xmm3
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2],xmm0[3],xmm3[4,5],xmm0[6,7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
-; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm11, %ymm5
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm6, %xmm0
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,5,5,5,5]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23]
-; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm2
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm28
-; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm0
-; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm0, %xmm0
-; AVX512-FCP-NEXT: vmovdqa64 %xmm21, %xmm2
-; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm2
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm2
-; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm2, %xmm2
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm13[0,1,2,3,5,5,5,5]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3],xmm2[4,5],xmm4[6],xmm2[7]
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm5, %zmm5
+; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm3
+; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm12, %xmm5
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2],xmm3[3],xmm5[4,5],xmm3[6,7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
+; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm6, %ymm6
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3,4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm3
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5],xmm2[6],xmm3[7]
; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0
-; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm0
-; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm12, %xmm2
-; AVX512-FCP-NEXT: vmovdqa64 %xmm23, %xmm3
-; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm3
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %ymm29, %ymm0
-; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm3 = [18,19,30,31,26,27,0,0,18,19,30,31,26,27,0,0,18,19,30,31,26,27,0,0,18,19,30,31,26,27,0,0]
+; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm10, %ymm4
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7],ymm4[8,9,10],ymm2[11,12,13,14,15]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7]
+; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm10, %xmm1
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm1
+; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm1
+; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm2
+; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm2
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4,5],xmm2[6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm23, %xmm2
+; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm2
+; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm6
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm6[3],xmm2[4,5],xmm6[6],xmm2[7]
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1
+; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm1
+; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm1
+; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm2
+; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm2
+; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm15, %xmm5
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3],xmm2[4,5],xmm5[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %xmm26, %xmm1
-; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm1, %xmm1
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,5,5,5,5]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7]
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm27
+; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,5,5,5,5]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
+; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm13, %ymm1
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm13, %xmm1
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm27 = [4,7,6,5]
; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,2,3]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,3,2,1]
-; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm1
-; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm24
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,1,2,0,4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm23
+; AVX512-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
+; AVX512-FCP-NEXT: vpermd %ymm0, %ymm27, %ymm15
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm28 = [10,1,10,3]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,0,1,4,5,6,7,12,13,12,13,12,13,12,13]
+; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm15, %xmm1
+; AVX512-FCP-NEXT: vpermd 192(%rdi), %zmm28, %zmm2
+; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,0,4,5,6,7]
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm2
-; AVX512-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,1,0,3]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u]
-; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm4, %xmm2
-; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm21
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,1,2,1]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,6,5,6,4]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm20
+; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm24
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm22 = [2,9,8,11]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm23 = [12,5,14,5]
+; AVX512-FCP-NEXT: vpermd %zmm24, %zmm22, %zmm2
+; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,0,1,0,1,0,1,8,9,8,9,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm2
+; AVX512-FCP-NEXT: vpermd %zmm24, %zmm23, %zmm3
+; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,4]
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5,6],xmm3[7]
; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm3
; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX512-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX512-FCP-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,1,2,3]
-; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm2 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,1]
-; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm1
-; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm19
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[2,1,2,0,4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm5, %xmm18
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm1[1,2],xmm4[3],xmm1[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX512-FCP-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm6 = ymm1[0,1],mem[2],ymm1[3],mem[4],ymm1[5,6],mem[7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25]
-; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm6, %ymm5
-; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm17
+; AVX512-FCP-NEXT: vpermd %ymm1, %ymm27, %ymm4
+; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm1 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0]
+; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm4, %xmm2
+; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm19
+; AVX512-FCP-NEXT: vpermd (%rdi), %zmm28, %zmm4
+; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,1,2,0,4,5,6,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm2[1,2],xmm4[3],xmm2[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512-FCP-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: # ymm6 = mem[0,1],ymm2[2],mem[3],ymm2[4],mem[5,6],ymm2[7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25]
+; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm6, %ymm5
+; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm18
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX512-FCP-NEXT: vpblendd $219, (%rsp), %ymm5, %ymm5 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm5 = mem[0,1],ymm5[2],mem[3,4],ymm5[5],mem[6,7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[2,1,0,3]
-; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm7, %xmm5
-; AVX512-FCP-NEXT: vmovdqa64 %xmm7, %xmm16
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm14 = xmm6[0,1,2,1]
+; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm25
+; AVX512-FCP-NEXT: vpermd %zmm25, %zmm22, %zmm5
+; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm5, %xmm5
+; AVX512-FCP-NEXT: vpermd %zmm25, %zmm23, %zmm14
; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm14[0,1,2,3,6,5,6,4]
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5,6],xmm6[7]
; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512-FCP-NEXT: vmovdqa64 %ymm31, %ymm6
-; AVX512-FCP-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm15 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm15 = ymm6[0,1],mem[2],ymm6[3],mem[4],ymm6[5,6],mem[7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm31, %ymm7
+; AVX512-FCP-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: # ymm8 = mem[0,1],ymm7[2],mem[3],ymm7[4],mem[5,6],ymm7[7]
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
-; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm15, %ymm6
-; AVX512-FCP-NEXT: vmovdqa64 %ymm7, %ymm22
+; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm8, %ymm6
+; AVX512-FCP-NEXT: vmovdqa64 %ymm7, %ymm30
; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3,4,5,6,7],ymm6[8,9,10],ymm5[11,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9]
+; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm8, %xmm6
+; AVX512-FCP-NEXT: vmovdqa64 %xmm7, %xmm21
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5
; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm3 ^ (zmm29 & (zmm4 ^ zmm3))
-; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} zmm26 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm26 & (zmm5 ^ zmm4))
+; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} zmm20 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm20 & (zmm5 ^ zmm4))
; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX512-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
; AVX512-FCP-NEXT: # ymm3 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[0,3,2,1]
-; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm10, %xmm0
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm3[2,1,2,3]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[2,1,2,0,4,5,6,7]
+; AVX512-FCP-NEXT: vpermd %ymm3, %ymm27, %ymm3
+; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm17
+; AVX512-FCP-NEXT: vpermd 576(%rdi), %zmm28, %zmm11
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[2,1,2,0,4,5,6,7]
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2],xmm3[3],xmm0[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX512-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm3 = mem[0,1],ymm3[2],mem[3,4],ymm3[5],mem[6,7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[2,1,0,3]
-; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm9, %xmm3
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm4[0,1,2,1]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm8[0,1,2,3,6,5,6,4]
+; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm26
+; AVX512-FCP-NEXT: vpermd %zmm26, %zmm22, %zmm10
+; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm10, %xmm3
+; AVX512-FCP-NEXT: vpermd %zmm26, %zmm23, %zmm9
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm9[0,1,2,3,6,5,6,4]
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7]
; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm11
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm16
; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[0,3,2,1]
-; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm7, %xmm2
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,1,2,3]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm6[2,1,2,0,4,5,6,7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512-FCP-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm5 = ymm2[0,1],mem[2],ymm2[3],mem[4],ymm2[5,6],mem[7]
-; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm5, %ymm1
+; AVX512-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
+; AVX512-FCP-NEXT: vpermd %ymm0, %ymm27, %ymm7
+; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm7, %xmm0
+; AVX512-FCP-NEXT: vpermd 384(%rdi), %zmm28, %zmm6
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[2,1,2,0,4,5,6,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload
+; AVX512-FCP-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: # ymm5 = mem[0,1],ymm1[2],mem[3],ymm1[4],mem[5,6],ymm1[7]
+; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm5, %ymm1
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,1,0,3]
-; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm4, %xmm0
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,2,1]
+; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm28
+; AVX512-FCP-NEXT: vpermd %zmm28, %zmm22, %zmm4
+; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm4, %xmm0
+; AVX512-FCP-NEXT: vpermd %zmm28, %zmm23, %zmm3
; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm13 = xmm3[0,1,2,3,6,5,6,4]
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm0[0,1,2,3],xmm13[4],xmm0[5,6],xmm13[7]
; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX512-FCP-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
; AVX512-FCP-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm0
-; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm0
+; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm0
+; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm12
; AVX512-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0,1,2],ymm13[3,4,5,6,7],ymm0[8,9,10],ymm13[11,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm25
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm11 ^ (zmm29 & (zmm1 ^ zmm11))
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm25 = zmm25 ^ (zmm26 & (zmm25 ^ zmm1))
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm0
-; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm22
-; AVX512-FCP-NEXT: vmovdqa64 %xmm23, %xmm1
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,1,4,5,6,7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm21, %xmm11
-; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm11, %xmm11
-; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm13
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,7,5,6,5]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm13[4],xmm11[5,6],xmm13[7]
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm11, %zmm24
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1,2],ymm13[3,4,5,6,7],ymm12[8,9,10],ymm13[11,12,13,14,15]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm21, %xmm0
+; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm13
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm0
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm16 ^ (zmm29 & (zmm1 ^ zmm16))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm20 & (zmm0 ^ zmm1))
+; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15]
+; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm15, %xmm1
+; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm21
+; AVX512-FCP-NEXT: vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
+; AVX512-FCP-NEXT: # xmm12 = mem[3,1,2,1,4,5,6,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm1[1,2],xmm12[3],xmm1[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,2,3,2,3,2,3,8,9,10,11,14,15,14,15]
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm13
+; AVX512-FCP-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
+; AVX512-FCP-NEXT: # xmm15 = mem[0,1,2,3,7,5,6,5]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3],xmm15[4],xmm13[5,6],xmm15[7]
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm12, %zmm13, %zmm16
; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm13 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm0
-; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm11
+; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm0
+; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm12
; AVX512-FCP-NEXT: vmovdqa64 %xmm13, %xmm19
-; AVX512-FCP-NEXT: vmovdqa64 %xmm18, %xmm0
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm13 = xmm0[3,1,2,1,4,5,6,7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm11[1,2],xmm13[3],xmm11[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm0
-; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4],xmm0[5,6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm16, %xmm13
-; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm13, %xmm13
+; AVX512-FCP-NEXT: vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
+; AVX512-FCP-NEXT: # xmm15 = mem[3,1,2,1,4,5,6,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0],xmm12[1,2],xmm15[3],xmm12[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm0
+; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1,2,3,4],xmm0[5,6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm13, %xmm15
; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,5,6,5]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3],xmm14[4],xmm13[5,6],xmm14[7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2,3],xmm14[4],xmm15[5,6],xmm14[7]
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm15
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19]
-; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm15, %ymm15
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7],ymm15[8,9,10],ymm13[11,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,4,5]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm23
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm24 ^ (zmm29 & (zmm0 ^ zmm24))
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm23 = zmm23 ^ (zmm26 & (zmm23 ^ zmm0))
-; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm0
-; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm10, %xmm0
-; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm28, %zmm0, %zmm10
-; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm27, %zmm0, %zmm10
-; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm12[3,1,2,1,4,5,6,7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0],xmm0[1,2],xmm10[3],xmm0[4,5,6,7]
-; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm9, %xmm9
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,5,6,5]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2,3],xmm8[4],xmm9[5,6],xmm8[7]
+; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm8, %ymm13
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3,4,5,6,7],ymm13[8,9,10],ymm15[11,12,13,14,15]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11]
+; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm8, %xmm8
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm13[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm23
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm16 ^ (zmm29 & (zmm0 ^ zmm16))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm23 = zmm23 ^ (zmm20 & (zmm23 ^ zmm0))
+; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm0
+; AVX512-FCP-NEXT: vmovdqa64 %xmm21, %xmm8
+; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm0
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm11[3,1,2,1,4,5,6,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0],xmm0[1,2],xmm8[3],xmm0[4,5,6,7]
+; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm8
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,5]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm9[4],xmm8[5,6],xmm9[7]
; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm8, %zmm0
; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm8
; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm7
; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,1,4,5,6,7]
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1,2],xmm6[3],xmm7[4,5,6,7]
-; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm5, %ymm5
+; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm5, %ymm5
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm5[5,6,7]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
-; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2
; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm4, %xmm1
; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,5]
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5,6],xmm3[7]
+; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm3
; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,4,5]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm2, %xmm2
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm28
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm27
; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm0 ^ (zmm29 & (zmm5 ^ zmm0))
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm28 = zmm28 ^ (zmm26 & (zmm28 ^ zmm5))
-; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm11 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm11, %xmm0
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm11, %xmm3
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,2,2,2,4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm24
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm2
-; AVX512-FCP-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5,6],mem[7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,3,2,1]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13]
-; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm2
-; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm22
-; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm4, %xmm3
-; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm21
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5],xmm2[6,7]
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm30
-; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512-FCP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm4 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm27 = zmm27 ^ (zmm20 & (zmm27 ^ zmm5))
; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm10 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
-; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm0
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm14
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[2,2,2,2,4,5,6,7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm29 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm3
-; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm17
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm29) | ymm3
-; AVX512-FCP-NEXT: movw $31, %ax
-; AVX512-FCP-NEXT: kmovw %eax, %k1
-; AVX512-FCP-NEXT: vmovdqa32 %zmm2, %zmm30 {%k1}
-; AVX512-FCP-NEXT: vmovdqa64 %ymm31, %ymm2
-; AVX512-FCP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm4 = mem[0],ymm2[1],mem[2,3,4,5],ymm2[6],mem[7]
+; AVX512-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: # ymm15 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,2,3,0,1,12,13,8,9,12,13,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm15, %xmm1
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm15, %xmm2
+; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6,7]
; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512-FCP-NEXT: vpblendd $146, (%rsp), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5,6],mem[7]
+; AVX512-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7]
; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,3,2,1]
-; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm2
-; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm20
-; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm5, %xmm3
-; AVX512-FCP-NEXT: vmovdqa64 %xmm5, %xmm19
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm29 = [0,3,2,9]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13]
+; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm3, %xmm2
+; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm22
+; AVX512-FCP-NEXT: vpermd %zmm24, %zmm29, %zmm3
+; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm3, %xmm3
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5],xmm2[6,7]
; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29]
-; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm3
-; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm16
-; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm18
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm27
-; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512-FCP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm15 = mem[0],ymm2[1],mem[2,3,4,5],ymm2[6],mem[7]
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm16
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512-FCP-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: # ymm5 = ymm1[0],mem[1],ymm1[2,3,4,5],mem[6],ymm1[7]
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7]
+; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm2
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm12
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[2,2,2,2,4,5,6,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7]
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm30 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm5, %ymm4
+; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm18
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm30) | ymm4
+; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: movw $31, %ax
+; AVX512-FCP-NEXT: kmovw %eax, %k1
+; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm16 {%k1}
+; AVX512-FCP-NEXT: vmovdqa64 %ymm31, %ymm4
+; AVX512-FCP-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: # ymm5 = ymm4[0],mem[1],ymm4[2,3,4,5],mem[6],ymm4[7]
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX512-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3],ymm3[4],mem[5,6],ymm3[7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm4, %xmm3
+; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm20
+; AVX512-FCP-NEXT: vpermd %zmm25, %zmm29, %zmm4
+; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm4, %xmm4
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4],xmm4[5],xmm3[6,7]
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29]
+; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm4
+; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm24
+; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm19
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm21
+; AVX512-FCP-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload
+; AVX512-FCP-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: # ymm5 = ymm3[0],mem[1],ymm3[2,3,4,5],mem[6],ymm3[7]
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX512-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: # ymm13 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7]
+; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm13, %xmm3
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm11
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[2,2,2,2,4,5,6,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm25 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm5, %ymm2
+; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm17
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm30) | ymm2
; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX512-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload
; AVX512-FCP-NEXT: # ymm6 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7]
-; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm6, %xmm3
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm13
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[2,2,2,2,4,5,6,7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7]
-; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm15, %ymm0
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm29) | ymm0
-; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
-; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm1
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm7
+; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm6, %xmm0
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7
; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[2,2,2,2,4,5,6,7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3],xmm4[4],xmm1[5,6,7]
-; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512-FCP-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm4 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm9
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm4[0,3,2,1]
-; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm9, %xmm4
-; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm8, %xmm5
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3],xmm4[4],xmm0[5,6,7]
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: # ymm4 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm10
+; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm10, %xmm4
+; AVX512-FCP-NEXT: vpermd %zmm26, %zmm29, %zmm9
+; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm9, %xmm5
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4],xmm5[5],xmm4[6,7]
; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm4, %zmm31
-; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm31 {%k1}
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm8
+; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm8 {%k1}
; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX512-FCP-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
; AVX512-FCP-NEXT: # ymm5 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7]
; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
-; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm4
-; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm12
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm12[0,1,2,3],xmm4[4],xmm12[5],xmm4[6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm0
-; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm12
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm4[5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm16
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm14, %xmm14
-; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm10, %xmm10
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm10[0],xmm14[1],xmm10[2,3],xmm14[4],xmm10[5,6,7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm0
-; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm10
-; AVX512-FCP-NEXT: vmovdqa64 %ymm12, %ymm17
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = (ymm14 & ymm29) | ymm10
-; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm0
-; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm10
-; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm11, %xmm11
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm10[1],xmm11[2,3],xmm10[4],xmm11[5,6,7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15]
+; AVX512-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
+; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm26 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm31 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4
+; AVX512-FCP-NEXT: vpermd %zmm28, %zmm29, %zmm3
+; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm4, %xmm0
+; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm3, %xmm14
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1,2,3],xmm0[4],xmm14[5],xmm0[6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm2
+; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm5, %ymm14
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4],ymm0[5,6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm28
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm12, %xmm12
+; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm1[0],xmm12[1],xmm1[2,3],xmm12[4],xmm1[5,6,7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm0
+; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm0, %ymm1
+; AVX512-FCP-NEXT: vmovdqa64 %ymm14, %ymm18
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = (ymm12 & ymm30) | ymm1
+; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm1
+; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm15, %xmm14
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm1[1],xmm14[2,3],xmm1[4],xmm14[5,6,7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15]
; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm0
-; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm0, %xmm0
-; AVX512-FCP-NEXT: vmovdqa64 %xmm21, %xmm12
-; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm12, %xmm12
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0,1,2,3],xmm0[4],xmm12[5],xmm0[6,7]
+; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm15, %xmm15
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1,2,3],xmm0[4],xmm15[5],xmm0[6,7]
; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm11, %zmm0, %zmm0
-; AVX512-FCP-NEXT: vmovdqa32 %zmm14, %zmm0 {%k1}
-; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm11
-; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm11, %xmm11
-; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm12
-; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm12, %xmm12
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3],xmm11[4],xmm12[5],xmm11[6,7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31]
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm14, %zmm0, %zmm0
+; AVX512-FCP-NEXT: vmovdqa32 %zmm12, %zmm0 {%k1}
+; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm12
+; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm12, %xmm12
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm14, %xmm14
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0,1,2,3],xmm12[4],xmm14[5],xmm12[6,7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm15
+; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm15, %ymm15
+; AVX512-FCP-NEXT: vmovdqa64 %ymm14, %ymm19
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3,4],ymm12[5,6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm15
; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm14
-; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm14, %ymm14
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3,4],ymm11[5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm14
-; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm15, %ymm14
-; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm13, %xmm13
-; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm6
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm13[1],xmm6[2,3],xmm13[4],xmm6[5,6,7]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = (ymm6 & ymm29) | ymm14
-; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm7, %xmm7
-; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm15, %ymm15
+; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm11, %xmm11
+; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm13, %xmm13
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0],xmm11[1],xmm13[2,3],xmm11[4],xmm13[5,6,7]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = (ymm11 & ymm30) | ymm15
+; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm7, %xmm7
+; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm6, %xmm2
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm7[1],xmm2[2,3],xmm7[4],xmm2[5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm27, %zmm0, %zmm4
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm7
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm11
-; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm9, %xmm9
-; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm8, %xmm8
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm9[4],xmm8[5],xmm9[6,7]
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm8, %zmm2
-; AVX512-FCP-NEXT: vmovdqa32 %zmm6, %zmm2 {%k1}
-; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm5, %ymm5
-; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm3
-; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm1
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm21, %zmm0, %zmm6
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm28, %zmm0, %zmm7
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm12
+; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm10
+; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm9, %xmm9
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm10[4],xmm9[5],xmm10[6,7]
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm9, %zmm2
+; AVX512-FCP-NEXT: vmovdqa32 %zmm11, %zmm2 {%k1}
+; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm9
+; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm5, %ymm5
+; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm4, %xmm4
+; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm3, %xmm1
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5],xmm4[6,7]
; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4],ymm1[5,6,7]
; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1
; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm5 # 64-byte Folded Reload
-; AVX512-FCP-NEXT: # zmm5 = mem ^ (zmm3 & (zmm5 ^ mem))
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: # zmm4 = mem ^ (zmm3 & (zmm4 ^ mem))
; AVX512-FCP-NEXT: movw $-2048, %ax # imm = 0xF800
; AVX512-FCP-NEXT: kmovw %eax, %k1
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa32 %zmm6, %zmm5 {%k1}
-; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rsi)
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm5 # 64-byte Folded Reload
-; AVX512-FCP-NEXT: # zmm5 = mem ^ (zmm3 & (zmm5 ^ mem))
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa32 %zmm6, %zmm5 {%k1}
-; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 64(%rsi)
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm5 # 64-byte Folded Reload
-; AVX512-FCP-NEXT: # zmm5 = mem ^ (zmm3 & (zmm5 ^ mem))
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa32 %zmm6, %zmm5 {%k1}
-; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 64(%rdx)
+; AVX512-FCP-NEXT: vmovdqa32 %zmm25, %zmm4 {%k1}
+; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rsi)
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: # zmm4 = mem ^ (zmm3 & (zmm4 ^ mem))
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm5 # 64-byte Folded Reload
-; AVX512-FCP-NEXT: # zmm5 = mem ^ (zmm3 & (zmm5 ^ mem))
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm5 {%k1}
-; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rdx)
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm26 & (zmm4 ^ zmm30))
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm26 & (zmm7 ^ zmm31))
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm26 & (zmm11 ^ zmm0))
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm26 & (zmm1 ^ zmm2))
-; AVX512-FCP-NEXT: vmovdqa64 %zmm25, 64(%rcx)
+; AVX512-FCP-NEXT: vmovdqa32 %zmm5, %zmm4 {%k1}
+; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 64(%rsi)
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: # zmm4 = mem ^ (zmm3 & (zmm4 ^ mem))
+; AVX512-FCP-NEXT: vmovdqa32 %zmm31, %zmm4 {%k1}
+; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 64(%rdx)
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: # zmm4 = mem ^ (zmm3 & (zmm4 ^ mem))
+; AVX512-FCP-NEXT: vmovdqa32 %zmm26, %zmm4 {%k1}
+; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rdx)
+; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} zmm3 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm3 & (zmm6 ^ zmm16))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm3 & (zmm7 ^ zmm8))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (zmm3 & (zmm12 ^ zmm0))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm3 & (zmm1 ^ zmm2))
+; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovaps %zmm0, 64(%rcx)
; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovaps %zmm0, (%rcx)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm28, 64(%r8)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm27, 64(%r8)
; AVX512-FCP-NEXT: vmovdqa64 %zmm23, (%r8)
; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 64(%r9)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%r9)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm6, (%r9)
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm11, (%rax)
-; AVX512-FCP-NEXT: addq $1416, %rsp # imm = 0x588
+; AVX512-FCP-NEXT: vmovdqa64 %zmm12, (%rax)
+; AVX512-FCP-NEXT: addq $1800, %rsp # imm = 0x708
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: load_i16_stride6_vf64:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: subq $840, %rsp # imm = 0x348
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,12,13,12,13,4,5,8,9,10,11,12,13,14,15]
; AVX512DQ-NEXT: vmovdqa 608(%rdi), %ymm0
; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-NEXT: vmovdqa 576(%rdi), %ymm1
@@ -11645,18 +11621,18 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm24[0,2,0,3]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5]
; AVX512DQ-NEXT: vmovdqa 544(%rdi), %ymm1
; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-NEXT: vmovdqa 512(%rdi), %ymm2
; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
+; AVX512DQ-NEXT: vpshufb %xmm10, %xmm2, %xmm1
; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[2,2,2,2,4,5,6,7]
; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm23
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4],xmm3[5,6,7]
+; AVX512DQ-NEXT: vpshufb %xmm10, %xmm3, %xmm2
; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm22
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,4,5,4,5,6,7,0,1,12,13,8,9,4,5]
-; AVX512DQ-NEXT: vpshufb %xmm10, %xmm1, %xmm1
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7]
; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
; AVX512DQ-NEXT: vmovdqa 448(%rdi), %ymm1
@@ -11664,10 +11640,11 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-NEXT: vmovdqa 384(%rdi), %ymm3
; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm15 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7]
-; AVX512DQ-NEXT: vpshufb %xmm9, %xmm15, %xmm2
-; AVX512DQ-NEXT: vextracti32x4 $1, %ymm15, %xmm21
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm21[0,2,0,3]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7]
+; AVX512DQ-NEXT: vpshufb %xmm9, %xmm3, %xmm2
+; AVX512DQ-NEXT: vextracti32x4 $1, %ymm3, %xmm20
+; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm21
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm20[0,2,0,3]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7]
; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],mem[2,3]
@@ -11675,34 +11652,35 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vinserti128 $1, 480(%rdi), %ymm1, %ymm1
; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm3[1],ymm1[2,3,4,5],ymm3[6],ymm1[7]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
-; AVX512DQ-NEXT: vpshufb %ymm5, %ymm3, %ymm1
-; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm20
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
+; AVX512DQ-NEXT: vpshufb %ymm7, %ymm3, %ymm1
+; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm19
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2],ymm1[3,4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa 640(%rdi), %ymm1
-; AVX512DQ-NEXT: vmovdqa 736(%rdi), %ymm2
+; AVX512DQ-NEXT: vmovdqa 736(%rdi), %ymm1
+; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vmovdqa 704(%rdi), %ymm2
; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vmovdqa 704(%rdi), %ymm4
-; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm12, %xmm4
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[2,2,2,2,4,5,6,7]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3,4],xmm4[5,6,7]
-; AVX512DQ-NEXT: vmovdqa64 %xmm4, %xmm19
-; AVX512DQ-NEXT: vpshufb %xmm10, %xmm2, %xmm2
-; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm1[2,3],mem[2,3]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm15 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
+; AVX512DQ-NEXT: vpshufb %xmm10, %xmm15, %xmm1
+; AVX512DQ-NEXT: vextracti128 $1, %ymm15, %xmm4
+; AVX512DQ-NEXT: vpshufb %xmm10, %xmm4, %xmm2
+; AVX512DQ-NEXT: vmovdqa64 %xmm4, %xmm18
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7]
+; AVX512DQ-NEXT: vmovdqa 640(%rdi), %ymm2
+; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm2[2,3],mem[2,3]
; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vinserti128 $1, 672(%rdi), %ymm1, %ymm1
-; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0],ymm4[1],ymm1[2,3,4,5],ymm4[6],ymm1[7]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u]
-; AVX512DQ-NEXT: vpshufb %ymm6, %ymm4, %ymm1
-; AVX512DQ-NEXT: vmovdqa64 %ymm6, %ymm27
-; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm18
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,6]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-NEXT: vinserti128 $1, 672(%rdi), %ymm2, %ymm2
+; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0],ymm4[1],ymm2[2,3,4,5],ymm4[6],ymm2[7]
+; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm4 = [16,17,28,29,24,25,0,0,16,17,28,29,24,25,0,0,16,17,28,29,24,25,0,0,16,17,28,29,24,25,0,0]
+; AVX512DQ-NEXT: vpshufb %ymm4, %ymm13, %ymm2
+; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm27
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5]
+; AVX512DQ-NEXT: vpshufb %xmm4, %xmm13, %xmm2
+; AVX512DQ-NEXT: vmovdqa64 %xmm4, %xmm28
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm0 ^ (zmm16 & (zmm3 ^ zmm0))
; AVX512DQ-NEXT: movw $-2048, %ax # imm = 0xF800
@@ -11713,21 +11691,21 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm1
; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm13, %xmm14
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[0,2,0,3]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm14, %xmm8
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[0,2,0,3]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; AVX512DQ-NEXT: vpshufb %xmm9, %xmm13, %xmm1
+; AVX512DQ-NEXT: vpshufb %xmm9, %xmm14, %xmm1
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7]
; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm1
; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm2
; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm11
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[2,2,2,2,4,5,6,7]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm11[0,1,2],xmm1[3,4],xmm11[5,6,7]
-; AVX512DQ-NEXT: vpshufb %xmm10, %xmm1, %xmm1
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
+; AVX512DQ-NEXT: vpshufb %xmm10, %xmm5, %xmm1
+; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm12
+; AVX512DQ-NEXT: vpshufb %xmm10, %xmm12, %xmm2
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7]
; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm17
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
@@ -11736,102 +11714,104 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
; AVX512DQ-NEXT: vpshufb %xmm9, %xmm3, %xmm0
-; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm9
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[0,2,0,3]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3],xmm2[4,5],xmm0[6,7]
-; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm2
-; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],mem[2,3]
-; AVX512DQ-NEXT: vinserti128 $1, 96(%rdi), %ymm2, %ymm2
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0],ymm1[1],ymm2[2,3,4,5],ymm1[6],ymm2[7]
-; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm31
-; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm30
-; AVX512DQ-NEXT: vpshufb %ymm5, %ymm6, %ymm2
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1,2],ymm2[3,4,5,6,7]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm11
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[0,2,0,3]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7]
+; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm1
+; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3],mem[2,3]
+; AVX512DQ-NEXT: vinserti128 $1, 96(%rdi), %ymm1, %ymm1
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0],ymm2[1],ymm1[2,3,4,5],ymm2[6],ymm1[7]
+; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm31
+; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm30
+; AVX512DQ-NEXT: vpshufb %ymm7, %ymm6, %ymm1
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX512DQ-NEXT: vmovdqa 352(%rdi), %ymm0
; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-NEXT: vmovdqa 320(%rdi), %ymm1
; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm8
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[2,2,2,2,4,5,6,7]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0,1,2],xmm2[3,4],xmm8[5,6,7]
-; AVX512DQ-NEXT: vpshufb %xmm10, %xmm2, %xmm10
-; AVX512DQ-NEXT: vmovdqa 256(%rdi), %ymm2
-; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],mem[2,3]
-; AVX512DQ-NEXT: vinserti128 $1, 288(%rdi), %ymm2, %ymm0
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
+; AVX512DQ-NEXT: vpshufb %xmm10, %xmm2, %xmm0
+; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm4
+; AVX512DQ-NEXT: vpshufb %xmm10, %xmm4, %xmm10
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm0[3],xmm10[4,5],xmm0[6],xmm10[7]
+; AVX512DQ-NEXT: vmovdqa 256(%rdi), %ymm0
+; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],mem[2,3]
+; AVX512DQ-NEXT: vinserti128 $1, 288(%rdi), %ymm0, %ymm0
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7]
; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm26
; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm29
; AVX512DQ-NEXT: vmovdqa64 %ymm27, %ymm0
-; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm0
+; AVX512DQ-NEXT: vpshufb %ymm0, %ymm9, %ymm0
; AVX512DQ-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15]
+; AVX512DQ-NEXT: vmovdqa64 %xmm28, %xmm1
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm9, %xmm10
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = zmm17 ^ (zmm16 & (zmm7 ^ zmm17))
; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm7 {%k1}
; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm10 = [2,3,14,15,10,11,6,7,2,3,14,15,12,13,14,15]
-; AVX512DQ-NEXT: vpshufb %xmm10, %xmm14, %xmm0
-; AVX512DQ-NEXT: vpshufb %xmm10, %xmm13, %xmm13
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1],xmm0[2],xmm13[3],xmm0[4,5],xmm13[6,7]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm13 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
-; AVX512DQ-NEXT: vpshufb %xmm13, %xmm11, %xmm7
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3],xmm7[4,5],xmm4[6],xmm7[7]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm4, %zmm0
-; AVX512DQ-NEXT: vpshufb %xmm10, %xmm9, %xmm4
+; AVX512DQ-NEXT: vpshufb %xmm10, %xmm8, %xmm0
+; AVX512DQ-NEXT: vpshufb %xmm10, %xmm14, %xmm8
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm0[2],xmm8[3],xmm0[4,5],xmm8[6,7]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,2,3,4,5,6,7,2,3,14,15,12,13,6,7]
+; AVX512DQ-NEXT: vpshufb %xmm0, %xmm12, %xmm7
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3],xmm7[4,5],xmm5[6],xmm7[7]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512DQ-NEXT: vinserti32x4 $2, %xmm8, %zmm5, %zmm5
+; AVX512DQ-NEXT: vpshufb %xmm10, %xmm11, %xmm7
; AVX512DQ-NEXT: vpshufb %xmm10, %xmm3, %xmm3
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3],xmm4[4,5],xmm3[6,7]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
-; AVX512DQ-NEXT: vpshufb %ymm4, %ymm6, %ymm6
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm7[2],xmm3[3],xmm7[4,5],xmm3[6,7]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
+; AVX512DQ-NEXT: vpshufb %ymm7, %ymm6, %ymm6
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2],ymm6[3,4,5,6,7]
-; AVX512DQ-NEXT: vpshufb %xmm13, %xmm8, %xmm3
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,5,5,5,5]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23]
-; AVX512DQ-NEXT: vpshufb %ymm3, %ymm2, %ymm2
-; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = zmm0 ^ (zmm16 & (zmm6 ^ zmm0))
+; AVX512DQ-NEXT: vpshufb %xmm0, %xmm4, %xmm3
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5],xmm2[6],xmm3[7]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm3 = [18,19,30,31,26,27,0,0,18,19,30,31,26,27,0,0,18,19,30,31,26,27,0,0,18,19,30,31,26,27,0,0]
+; AVX512DQ-NEXT: vpshufb %ymm3, %ymm9, %ymm4
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7],ymm4[8,9,10],ymm2[11,12,13,14,15]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm4 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7]
+; AVX512DQ-NEXT: vpshufb %xmm4, %xmm9, %xmm1
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = zmm5 ^ (zmm16 & (zmm6 ^ zmm5))
; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm6 {%k1}
; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm0
-; AVX512DQ-NEXT: vpshufb %xmm10, %xmm0, %xmm0
-; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm1
+; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm1
; AVX512DQ-NEXT: vpshufb %xmm10, %xmm1, %xmm1
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7]
-; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm1
-; AVX512DQ-NEXT: vpshufb %xmm13, %xmm1, %xmm1
-; AVX512DQ-NEXT: vmovdqa64 %ymm23, %ymm2
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
-; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm1
-; AVX512DQ-NEXT: vpshufb %ymm4, %ymm1, %ymm1
-; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm2
+; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm2
; AVX512DQ-NEXT: vpshufb %xmm10, %xmm2, %xmm2
-; AVX512DQ-NEXT: vpshufb %xmm10, %xmm15, %xmm4
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3],xmm2[4,5],xmm4[6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2],ymm1[3,4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa64 %ymm18, %ymm1
-; AVX512DQ-NEXT: vpshufb %ymm3, %ymm1, %ymm1
-; AVX512DQ-NEXT: vmovdqa64 %xmm19, %xmm2
-; AVX512DQ-NEXT: vpshufb %xmm13, %xmm2, %xmm2
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,5,5,5,5]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5],xmm3[6],xmm2[7]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4,5],xmm2[6,7]
+; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm2
+; AVX512DQ-NEXT: vpshufb %xmm0, %xmm2, %xmm2
+; AVX512DQ-NEXT: vmovdqa64 %ymm23, %ymm5
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm5[3],xmm2[4,5],xmm5[6],xmm2[7]
; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm0 ^ (zmm16 & (zmm4 ^ zmm0))
-; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm4 {%k1}
-; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1
+; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm2
+; AVX512DQ-NEXT: vpshufb %ymm7, %ymm2, %ymm2
+; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm5
+; AVX512DQ-NEXT: vpshufb %xmm10, %xmm5, %xmm5
+; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm6
+; AVX512DQ-NEXT: vpshufb %xmm10, %xmm6, %xmm6
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2],xmm6[3],xmm5[4,5],xmm6[6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm2[3,4,5,6,7]
+; AVX512DQ-NEXT: vmovdqa64 %xmm18, %xmm2
+; AVX512DQ-NEXT: vpshufb %xmm0, %xmm2, %xmm0
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm15[0,1,2,3,5,5,5,5]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5],xmm2[6],xmm0[7]
+; AVX512DQ-NEXT: vpshufb %ymm3, %ymm13, %ymm2
+; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
+; AVX512DQ-NEXT: vpshufb %xmm4, %xmm13, %xmm2
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm1 ^ (zmm16 & (zmm5 ^ zmm1))
+; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm5 {%k1}
+; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX512DQ-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
@@ -11882,21 +11862,23 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm3 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7]
; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm15 = xmm3[2,1,0,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,0,0,0,4,5,6,7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm3[2,1,0,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[0,0,0,0,4,5,6,7]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,6,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[0,1,2,1]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm10[0,1,2,3,6,5,6,4]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm11 = xmm4[0,1,2,1]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm11[0,1,2,3,6,5,6,4]
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7]
; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512DQ-NEXT: vmovdqa64 %ymm26, %ymm4
; AVX512DQ-NEXT: vmovdqa64 %ymm29, %ymm5
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1],ymm5[2],ymm4[3],ymm5[4],ymm4[5,6],ymm5[7]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
-; AVX512DQ-NEXT: vpshufb %ymm5, %ymm13, %ymm4
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1],ymm5[2],ymm4[3],ymm5[4],ymm4[5,6],ymm5[7]
+; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm5 = [20,21,16,17,28,29,0,0,20,21,16,17,28,29,0,0,20,21,16,17,28,29,0,0,20,21,16,17,28,29,0,0]
+; AVX512DQ-NEXT: vpshufb %ymm5, %ymm8, %ymm4
; AVX512DQ-NEXT: vmovdqa64 %ymm5, %ymm27
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,5,4]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9]
+; AVX512DQ-NEXT: vpshufb %xmm5, %xmm8, %xmm4
+; AVX512DQ-NEXT: vmovdqa64 %xmm5, %xmm28
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3
; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
@@ -11908,21 +11890,21 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[2,1,2,3]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[0,3,2,1]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm12[0,0,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm13 = xmm1[2,1,2,3]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm15 = xmm2[0,3,2,1]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[0,0,2,3,4,5,6,7]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,6,6]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm11[2,1,2,0,4,5,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[2,1,2,0,4,5,6,7]
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7]
; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX512DQ-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm2 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7]
; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[2,1,0,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[0,0,0,0,4,5,6,7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm10 = xmm2[2,1,0,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[0,0,0,0,4,5,6,7]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,6,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[0,1,2,1]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm8[0,1,2,3,6,5,6,4]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[0,1,2,1]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm9[0,1,2,3,6,5,6,4]
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5,6],xmm3[7]
; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512DQ-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm19
@@ -11941,100 +11923,102 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: # ymm5 = ymm2[0,1],mem[2],ymm2[3],mem[4],ymm2[5,6],mem[7]
; AVX512DQ-NEXT: vpshufb %ymm0, %ymm5, %ymm0
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm0[5,6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQ-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX512DQ-NEXT: # ymm1 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512DQ-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQ-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,1,0,3]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[0,0,0,0,4,5,6,7]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,6,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,2,1]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,1,2,1]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm14 = xmm3[0,1,2,3,6,5,6,4]
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm14 = xmm1[0,1,2,3],xmm14[4],xmm1[5,6],xmm14[7]
-; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQ-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX512DQ-NEXT: # ymm1 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7]
-; AVX512DQ-NEXT: vmovdqa64 %ymm27, %ymm0
-; AVX512DQ-NEXT: vpshufb %ymm0, %ymm1, %ymm0
+; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512DQ-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
+; AVX512DQ-NEXT: # ymm2 = mem[0,1],ymm1[2],mem[3],ymm1[4],mem[5,6],ymm1[7]
+; AVX512DQ-NEXT: vmovdqa64 %ymm27, %ymm1
+; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm1
; AVX512DQ-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0,1,2],ymm14[3,4,5,6,7],ymm0[8,9,10],ymm14[11,12,13,14,15]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm28
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm19 ^ (zmm21 & (zmm2 ^ zmm19))
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm28 = zmm28 ^ (zmm18 & (zmm28 ^ zmm2))
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm14[3,4,5,6,7],ymm1[8,9,10],ymm14[11,12,13,14,15]
+; AVX512DQ-NEXT: vmovdqa64 %xmm28, %xmm14
+; AVX512DQ-NEXT: vpshufb %xmm14, %xmm2, %xmm14
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm28
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm19 ^ (zmm21 & (zmm0 ^ zmm19))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm28 = zmm28 ^ (zmm18 & (zmm28 ^ zmm0))
; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm0
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm2
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm2
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5]
+; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm1
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,3,4,5,6,7]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5,6,7]
+; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm1
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,5]
; AVX512DQ-NEXT: vmovdqa64 %xmm23, %xmm14
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[1,1,1,1,4,5,6,7]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,7,7]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0,1,2,3],xmm2[4],xmm14[5,6],xmm2[7]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm20
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0,1,2,3],xmm1[4],xmm14[5,6],xmm1[7]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm20
; AVX512DQ-NEXT: vmovdqa64 %xmm16, %xmm0
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[3,1,2,1,4,5,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[3,1,2,1,4,5,6,7]
; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm0
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm14 = xmm0[0,1,3,3,4,5,6,7]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,7,7,7]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm14[1,2],xmm2[3],xmm14[4,5,6,7]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm14[1,2],xmm1[3],xmm14[4,5,6,7]
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm14 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27]
; AVX512DQ-NEXT: vmovdqa64 %ymm17, %ymm0
; AVX512DQ-NEXT: vpshufb %ymm14, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm0[5,6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,7,5,6,5]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm10 = xmm15[1,1,1,1,4,5,6,7]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,7,7]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0,1,2,3],xmm2[4],xmm10[5,6],xmm2[7]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19]
-; AVX512DQ-NEXT: vpshufb %ymm10, %ymm13, %ymm13
-; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm13[0,1,2],ymm2[3,4,5,6,7],ymm13[8,9,10],ymm2[11,12,13,14,15]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,7,4,5]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm27
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm0[5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,7,5,6,5]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm11 = xmm12[1,1,1,1,4,5,6,7]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,7,7]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm11[0,1,2,3],xmm1[4],xmm11[5,6],xmm1[7]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm11 = [22,23,18,19,30,31,0,0,22,23,18,19,30,31,0,0,22,23,18,19,30,31,0,0,22,23,18,19,30,31,0,0]
+; AVX512DQ-NEXT: vpshufb %ymm11, %ymm8, %ymm12
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0,1,2],ymm1[3,4,5,6,7],ymm12[8,9,10],ymm1[11,12,13,14,15]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm12 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11]
+; AVX512DQ-NEXT: vpshufb %xmm12, %xmm8, %xmm8
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm27
; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm20 ^ (zmm21 & (zmm0 ^ zmm20))
; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm27 = zmm27 ^ (zmm18 & (zmm27 ^ zmm0))
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[3,1,2,1,4,5,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[0,1,3,3,4,5,6,7]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm8[0,1,2,3,7,5,6,5]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm9[1,1,1,1,4,5,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm13[3,1,2,1,4,5,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[0,1,3,3,4,5,6,7]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,7,7,7]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5,6,7]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm9[0,1,2,3,7,5,6,5]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm10[1,1,1,1,4,5,6,7]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,7,7]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm8[0,1,2,3],xmm2[4],xmm8[5,6],xmm2[7]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm0
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[3,1,2,1,4,5,6,7]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0,1,2,3],xmm1[4],xmm8[5,6],xmm1[7]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[3,1,2,1,4,5,6,7]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[0,1,3,3,4,5,6,7]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm6[1,2],xmm2[3],xmm6[4,5,6,7]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm6[1,2],xmm1[3],xmm6[4,5,6,7]
; AVX512DQ-NEXT: vpshufb %ymm14, %ymm5, %ymm5
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm5[5,6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
-; AVX512DQ-NEXT: vpshufb %ymm10, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,5]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,1,1,1,4,5,6,7]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7]
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4],xmm4[5,6],xmm3[7]
+; AVX512DQ-NEXT: vpshufb %ymm11, %ymm2, %ymm4
; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7],ymm1[8,9,10],ymm3[11,12,13,14,15]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,4,5]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm0 ^ (zmm21 & (zmm2 ^ zmm0))
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm18 & (zmm20 ^ zmm2))
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15]
+; AVX512DQ-NEXT: vpshufb %xmm12, %xmm2, %xmm2
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm20
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm21 & (zmm1 ^ zmm0))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm18 & (zmm20 ^ zmm1))
; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX512DQ-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm3 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,2,3,0,1,12,13,8,9,12,13,12,13,14,15]
; AVX512DQ-NEXT: vpshufb %xmm1, %xmm3, %xmm0
; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm13
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[2,2,2,2,4,5,6,7]
@@ -12047,7 +12031,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,1,0,2,4,5,6,7]
; AVX512DQ-NEXT: vmovdqa64 %xmm4, %xmm19
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,6,6,6]
-; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} xmm11 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,4,5,6,7,8,9,12,13,0,1,12,13]
; AVX512DQ-NEXT: vpshufb %xmm11, %xmm5, %xmm4
; AVX512DQ-NEXT: vmovdqa64 %xmm5, %xmm18
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4],xmm2[5],xmm4[6,7]
@@ -12141,12 +12125,12 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4],ymm1[5,6,7]
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm21
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [10,11,2,3,2,3,14,15,14,15,10,11,12,13,14,15]
; AVX512DQ-NEXT: vpshufb %xmm1, %xmm3, %xmm3
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm11 = xmm13[1,1,2,3]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5]
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm3[0],xmm11[1],xmm3[2,3],xmm11[4],xmm3[5,6,7]
-; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} xmm3 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,10,11,10,11,2,3,14,15]
; AVX512DQ-NEXT: vmovdqa64 %xmm18, %xmm0
; AVX512DQ-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512DQ-NEXT: vmovdqa64 %xmm19, %xmm13
@@ -12230,29 +12214,29 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
;
; AVX512DQ-FCP-LABEL: load_i16_stride6_vf64:
; AVX512DQ-FCP: # %bb.0:
-; AVX512DQ-FCP-NEXT: subq $872, %rsp # imm = 0x368
+; AVX512DQ-FCP-NEXT: subq $1320, %rsp # imm = 0x528
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,12,13,0,1,4,5,8,9,12,13,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm17 = [6,5,4,7]
; AVX512DQ-FCP-NEXT: vmovdqa 608(%rdi), %ymm0
; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa 576(%rdi), %ymm1
; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm0
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm25
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,1,0,3]
+; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm17, %ymm1
+; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm19
; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm24
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm20
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5]
; AVX512DQ-FCP-NEXT: vmovdqa 544(%rdi), %ymm1
; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa 512(%rdi), %ymm2
; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,4,5,4,5,4,5,0,1,12,13,8,9,4,5]
; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm1
; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm23
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm21
; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm2
; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm22
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7]
@@ -12263,13 +12247,13 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa 384(%rdi), %ymm3
; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm15, %xmm2
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm15, %xmm3
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,1,0,3]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7]
+; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm17, %ymm3
+; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm2
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm23
; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm4, %xmm3
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm21
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3],xmm3[4,5],xmm2[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm24
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3],xmm2[4,5],xmm3[6,7]
; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],mem[2,3]
; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vinserti128 $1, 480(%rdi), %ymm1, %ymm1
@@ -12277,17 +12261,17 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm3[1],ymm1[2,3,4,5],ymm3[6],ymm1[7]
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21]
; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm20
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm25
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2],ymm1[3,4,5,6,7]
; AVX512DQ-FCP-NEXT: vmovdqa 736(%rdi), %ymm1
; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa 704(%rdi), %ymm2
; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm12, %xmm1
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm4
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm15, %xmm1
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm15, %xmm4
; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm2
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm19
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm26
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7]
; AVX512DQ-FCP-NEXT: vmovdqa 640(%rdi), %ymm2
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
@@ -12295,13 +12279,14 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vinserti128 $1, 672(%rdi), %ymm2, %ymm2
; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0],ymm4[1],ymm2[2,3,4,5],ymm4[6],ymm2[7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm26
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm18
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm2[0],ymm4[1],ymm2[2,3,4,5],ymm4[6],ymm2[7]
+; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm4 = [16,17,28,29,24,25,0,0,16,17,28,29,24,25,0,0,16,17,28,29,24,25,0,0,16,17,28,29,24,25,0,0]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm14, %ymm2
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm27
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,6]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,12,13,8,9,12,13,8,9,12,13,8,9,4,5]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm14, %xmm2
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm28
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm0 ^ (zmm16 & (zmm3 ^ zmm0))
@@ -12313,504 +12298,498 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm1
; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm0
-; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm13, %xmm1
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[2,1,0,3]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm14, %xmm0
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
+; AVX512DQ-FCP-NEXT: vpermd %ymm11, %ymm17, %ymm8
+; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm8, %xmm0
+; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm11, %xmm1
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7]
; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm1
; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm1
; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm7
; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm7, %xmm2
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5],xmm1[6],xmm2[7]
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm17
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm18
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm0
-; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm2
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[2,1,0,3]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm11, %xmm0
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm2
-; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],mem[2,3]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm0
+; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm17, %ymm13
+; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm13, %xmm1
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm1
+; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3],mem[2,3]
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vinserti128 $1, 96(%rdi), %ymm1, %ymm1
; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vinserti128 $1, 96(%rdi), %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0],ymm1[1],ymm2[2,3,4,5],ymm1[6],ymm2[7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm31
-; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm9, %ymm2
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2],ymm2[3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0],ymm2[1],ymm1[2,3,4,5],ymm2[6],ymm1[7]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm12, %ymm1
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX512DQ-FCP-NEXT: vmovdqa 352(%rdi), %ymm0
; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa 320(%rdi), %ymm1
; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm2
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm8
-; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm8, %xmm10
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm2[3],xmm10[4,5],xmm2[6],xmm10[7]
-; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %ymm2
-; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],mem[2,3]
-; AVX512DQ-FCP-NEXT: vinserti128 $1, 288(%rdi), %ymm2, %ymm0
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm29
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm0
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4
+; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm10
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm0[3],xmm10[4,5],xmm0[6],xmm10[7]
+; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %ymm0
+; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3],mem[2,3]
+; AVX512DQ-FCP-NEXT: vinserti128 $1, 288(%rdi), %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7]
; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm30
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm9, %ymm0
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm17 ^ (zmm16 & (zmm6 ^ zmm17))
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm28, %xmm1
+; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm9, %xmm10
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm18 ^ (zmm16 & (zmm6 ^ zmm18))
; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm6 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [2,3,14,15,2,3,6,7,10,11,14,15,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm13, %xmm0
-; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm14, %xmm13
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm13[2],xmm0[3],xmm13[4,5],xmm0[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm7, %xmm7
+; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm8, %xmm0
+; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm11, %xmm8
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm0[2],xmm8[3],xmm0[4,5],xmm8[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,2,3,4,5,6,7,2,3,14,15,12,13,6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm7, %xmm7
; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5]
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3],xmm7[4,5],xmm5[6],xmm7[7]
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm5, %zmm0
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm5, %zmm5
; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm11, %xmm5
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2],xmm3[3],xmm5[4,5],xmm3[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm9, %ymm6
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2],ymm6[3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm8, %xmm3
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,5,5,5,5]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm0 ^ (zmm16 & (zmm6 ^ zmm0))
-; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm6 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm0, %xmm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm24, %xmm1
-; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm1
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm1
-; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm1, %xmm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm2
+; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm13, %xmm6
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm6[2],xmm3[3],xmm6[4,5],xmm3[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm12, %ymm7
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1,2],ymm7[3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm3
; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5],xmm2[6],xmm1[7]
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm1
-; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm15, %xmm2
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm21, %xmm4
-; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm4
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3],xmm4[4,5],xmm2[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2],ymm1[3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm1
-; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm2
-; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,5,5,5,5]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5],xmm3[6],xmm2[7]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3],xmm3[4,5],xmm2[6],xmm3[7]
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,7]
+; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm3 = [18,19,30,31,26,27,0,0,18,19,30,31,26,27,0,0,18,19,30,31,26,27,0,0,18,19,30,31,26,27,0,0]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm9, %ymm4
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7],ymm4[8,9,10],ymm2[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,3,14,15,10,11,10,11,14,15,10,11,10,11,6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm9, %xmm1
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm0 ^ (zmm16 & (zmm4 ^ zmm0))
-; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm4 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm5 ^ (zmm16 & (zmm7 ^ zmm5))
+; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm7 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm1
+; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm2
+; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3],xmm1[4,5],xmm2[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm2
+; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm5
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm5[3],xmm2[4,5],xmm5[6],xmm2[7]
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm2
+; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm5
+; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm5
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm6
+; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm6, %xmm6
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2],xmm6[3],xmm5[4,5],xmm6[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm2[3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm26, %xmm2
+; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm0
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm15[0,1,2,3,5,5,5,5]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5],xmm2[6],xmm0[7]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm14, %ymm2
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm14, %xmm2
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm1 ^ (zmm16 & (zmm5 ^ zmm1))
+; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm5 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm23 = [4,7,6,5]
; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQ-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[2,1,2,3]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm15 = xmm1[0,3,2,1]
+; AVX512DQ-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
+; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm23, %ymm15
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm25 = [10,1,10,3]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,0,1,4,5,6,7,12,13,12,13,12,13,12,13]
; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm15, %xmm1
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[2,1,2,0,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpermd 192(%rdi), %zmm25, %zmm2
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,0,4,5,6,7]
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512DQ-FCP-NEXT: vpblendd $36, (%rsp), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm2 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,1,0,3]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm2
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm19
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,1,2,1]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,6,5,6,4]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm21
+; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm20
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm18 = [2,9,8,11]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm19 = [12,5,14,5]
+; AVX512DQ-FCP-NEXT: vpermd %zmm20, %zmm18, %zmm2
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,0,1,0,1,0,1,8,9,8,9,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT: vpermd %zmm20, %zmm19, %zmm3
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,4]
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5,6],xmm3[7]
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm3
; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX512DQ-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vpblendd $219, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX512DQ-FCP-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,1,2,3]
-; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm2 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,1]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm23
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[2,1,2,0,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm5, %xmm22
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm1[1,2],xmm4[3],xmm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm1
-; AVX512DQ-FCP-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm6 = ymm1[0,1],mem[2],ymm1[3],mem[4],ymm1[5,6],mem[7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm6, %ymm5
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm24
+; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm23, %ymm4
+; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm1 = [12,13,0,1,4,5,0,0,12,13,0,1,4,5,0,0]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm4, %xmm2
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm27
+; AVX512DQ-FCP-NEXT: vpermd (%rdi), %zmm25, %zmm4
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,1,2,0,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm2[1,2],xmm4[3],xmm2[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # ymm6 = ymm2[0,1],mem[2],ymm2[3],mem[4],ymm2[5,6],mem[7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9,20,21,16,17,28,29,30,31,24,25,16,17,28,29,24,25]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm6, %ymm5
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm17
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX512DQ-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm5 = mem[0,1],ymm5[2],mem[3,4],ymm5[5],mem[6,7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[2,1,0,3]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm7, %xmm5
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm7, %xmm25
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,1,2,1]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,6,5,6,4]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm7, %xmm18
+; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm21
+; AVX512DQ-FCP-NEXT: vpermd %zmm21, %zmm18, %zmm5
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm5, %xmm5
+; AVX512DQ-FCP-NEXT: vpermd %zmm21, %zmm19, %zmm14
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm14[0,1,2,3,6,5,6,4]
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5,6],xmm6[7]
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm6
; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm7
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1],ymm7[2],ymm6[3],ymm7[4],ymm6[5,6],ymm7[7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm6
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm8, %ymm26
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm7, %ymm17
+; AVX512DQ-FCP-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # ymm8 = mem[0,1],ymm7[2],mem[3],ymm7[4],mem[5,6],ymm7[7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm8, %ymm6
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm7, %ymm31
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3,4,5,6,7],ymm6[8,9,10],ymm5[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,5,4]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,0,1,12,13,14,15,8,9,0,1,12,13,8,9]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm8, %xmm6
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm7, %xmm29
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5
-; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm3 ^ (zmm20 & (zmm4 ^ zmm3))
+; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm3 ^ (zmm24 & (zmm4 ^ zmm3))
; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} zmm28 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0]
; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm28 & (zmm5 ^ zmm4))
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX512DQ-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
; AVX512DQ-FCP-NEXT: # ymm3 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,1]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm16
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm3[2,1,2,3]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[2,1,2,0,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm23, %ymm3
+; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm16
+; AVX512DQ-FCP-NEXT: vpermd 576(%rdi), %zmm25, %zmm11
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[2,1,2,0,4,5,6,7]
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2],xmm3[3],xmm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX512DQ-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm3 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm3[2,1,0,3]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm12, %xmm3
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm4[0,1,2,1]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm11[0,1,2,3,6,5,6,4]
+; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm22
+; AVX512DQ-FCP-NEXT: vpermd %zmm22, %zmm18, %zmm10
+; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm10, %xmm3
+; AVX512DQ-FCP-NEXT: vpermd %zmm22, %zmm19, %zmm9
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm9[0,1,2,3,6,5,6,4]
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7]
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm9
-; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQ-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[0,3,2,1]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm8, %xmm2
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[2,1,2,3]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[2,1,2,0,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512DQ-FCP-NEXT: vpblendd $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm6 = mem[0,1],ymm2[2],mem[3],ymm2[4],mem[5,6],ymm2[7]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm6, %ymm1
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm26
; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX512DQ-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX512DQ-FCP-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,1,0,3]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm0
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,1,2,1]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm10 = xmm4[0,1,2,3,6,5,6,4]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0,1,2,3],xmm10[4],xmm0[5,6],xmm10[7]
+; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm23, %ymm7
+; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm7, %xmm0
+; AVX512DQ-FCP-NEXT: vpermd 384(%rdi), %zmm25, %zmm6
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[2,1,2,0,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # ymm5 = ymm1[0,1],mem[2],ymm1[3],mem[4],ymm1[5,6],mem[7]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm5, %ymm1
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm23
+; AVX512DQ-FCP-NEXT: vpermd %zmm23, %zmm18, %zmm4
+; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm4, %xmm0
+; AVX512DQ-FCP-NEXT: vpermd %zmm23, %zmm19, %zmm3
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm13 = xmm3[0,1,2,3,6,5,6,4]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm0[0,1,2,3],xmm13[4],xmm0[5,6],xmm13[7]
; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX512DQ-FCP-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
; AVX512DQ-FCP-NEXT: # ymm2 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm0
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2],ymm10[3,4,5,6,7],ymm0[8,9,10],ymm10[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,5,4]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm27
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm9 ^ (zmm20 & (zmm1 ^ zmm9))
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm27 = zmm27 ^ (zmm28 & (zmm27 ^ zmm1))
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm15, %xmm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm26
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[3,1,2,1,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm1
-; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm9
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm21, %xmm1
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm10 = xmm1[0,1,2,3,7,5,6,5]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm10[4],xmm9[5,6],xmm10[7]
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm9, %zmm21
-; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm10 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm23, %xmm0
-; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm0, %xmm9
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm0
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[3,1,2,1,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0],xmm9[1,2],xmm15[3],xmm9[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm12
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1,2],ymm13[3,4,5,6,7],ymm12[8,9,10],ymm13[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm29, %xmm0
+; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm13
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm31
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm26 ^ (zmm24 & (zmm1 ^ zmm26))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm31 = zmm31 ^ (zmm28 & (zmm31 ^ zmm1))
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm15, %xmm1
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm18
+; AVX512DQ-FCP-NEXT: vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # xmm12 = mem[3,1,2,1,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm1[1,2],xmm12[3],xmm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,2,3,2,3,2,3,8,9,10,11,14,15,14,15]
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm13
+; AVX512DQ-FCP-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # xmm15 = mem[0,1,2,3,7,5,6,5]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3],xmm15[4],xmm13[5,6],xmm15[7]
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm12, %zmm13, %zmm25
+; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm13 = [14,15,2,3,6,7,0,0,14,15,2,3,6,7,0,0]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm12
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm13, %xmm26
+; AVX512DQ-FCP-NEXT: vpshuflw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # xmm15 = mem[3,1,2,1,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0],xmm12[1,2],xmm15[3],xmm12[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11,22,23,18,19,30,31,30,31,30,31,18,19,30,31,26,27]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm0
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1,2,3,4],xmm0[5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm25, %xmm0
-; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm15
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm18, %xmm0
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm14 = xmm0[0,1,2,3,7,5,6,5]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm13, %xmm15
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,5,6,5]
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2,3],xmm14[4],xmm15[5,6],xmm14[7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0,1,2],ymm14[3,4,5,6,7],ymm0[8,9,10],ymm14[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,4,5]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm19
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm21 ^ (zmm20 & (zmm1 ^ zmm21))
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm28 & (zmm19 ^ zmm1))
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm16, %xmm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm26, %xmm1
-; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm13[3,1,2,1,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm12, %xmm1
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,5,6,5]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm11[4],xmm1[5,6],xmm11[7]
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm15
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm8, %ymm13
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3,4,5,6,7],ymm13[8,9,10],ymm15[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [6,7,2,3,14,15,14,15,14,15,2,3,14,15,10,11]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm8, %xmm8
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm19
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm25 ^ (zmm24 & (zmm0 ^ zmm25))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm28 & (zmm19 ^ zmm0))
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm18, %xmm8
+; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm11[3,1,2,1,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0],xmm0[1,2],xmm8[3],xmm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm8
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,5]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm9[4],xmm8[5,6],xmm9[7]
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm8, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm26, %xmm8
+; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm7
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,1,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1,2],xmm6[3],xmm7[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm5, %ymm5
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm5[5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,5]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5,6],xmm3[7]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm3
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
-; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm8, %xmm1
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0],xmm1[1,2],xmm7[3],xmm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm6, %ymm6
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm6[5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm3
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,5]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5,6],xmm4[7]
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,4,5]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm21
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm20 & (zmm1 ^ zmm0))
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm28 & (zmm21 ^ zmm1))
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm25
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm0 ^ (zmm24 & (zmm5 ^ zmm0))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm25 = zmm25 ^ (zmm28 & (zmm25 ^ zmm5))
; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQ-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm9 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm9, %xmm0
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm9, %xmm11
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm11[2,2,2,2,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # ymm13 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,2,3,0,1,12,13,8,9,12,13,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm13, %xmm0
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm3
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,2,2,2,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm28
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7]
; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512DQ-FCP-NEXT: vpblendd $109, (%rsp), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX512DQ-FCP-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6],ymm2[7]
; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,3,2,1]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm3, %xmm2
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm16
-; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm4, %xmm3
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm17
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm26 = [0,3,2,9]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,0,1,4,5,8,9,12,13,0,1,12,13]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm2
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm18
+; AVX512DQ-FCP-NEXT: vpermd %zmm20, %zmm26, %zmm3
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm3
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5],xmm2[6,7]
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm22
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm0
-; AVX512DQ-FCP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm14 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7]
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm2, %zmm24
+; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # ymm4 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7]
; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQ-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm10 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm0
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm15
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[2,2,2,2,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendd $36, (%rsp), %ymm0, %ymm12 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # ymm12 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm12, %xmm0
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm7
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,2,2,2,4,5,6,7]
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6,7]
; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm14, %ymm3
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = (ymm0 & ymm20) | ymm3
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,4,5,0,1,12,13,24,25,20,21,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm4, %ymm2
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm17
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = (ymm0 & ymm20) | ymm2
; AVX512DQ-FCP-NEXT: movw $31, %ax
; AVX512DQ-FCP-NEXT: kmovw %eax, %k1
-; AVX512DQ-FCP-NEXT: vinserti32x8 $0, %ymm0, %zmm0, %zmm22 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm3
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0],ymm0[1],ymm3[2,3,4,5],ymm0[6],ymm3[7]
+; AVX512DQ-FCP-NEXT: vinserti32x8 $0, %ymm0, %zmm0, %zmm24 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm2
+; AVX512DQ-FCP-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # ymm4 = ymm2[0],mem[1],ymm2[2,3,4,5],mem[6],ymm2[7]
; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQ-FCP-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6],mem[7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,3,2,1]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm3, %xmm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm18
-; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm5, %xmm3
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm5, %xmm29
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4],xmm3[5],xmm0[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm27
+; AVX512DQ-FCP-NEXT: vpermd %zmm21, %zmm26, %zmm2
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4],xmm2[5],xmm0[6,7]
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm3
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm26
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm4, %ymm2
; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm30
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm23
-; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQ-FCP-NEXT: vpblendd $66, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm6 = ymm0[0],mem[1],ymm0[2,3,4,5],mem[6],ymm0[7]
-; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQ-FCP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm5 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm3
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm0
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[2,2,2,2,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm25
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm6, %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm31
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm20) | ymm2
-; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQ-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm1
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # ymm6 = mem[0],ymm2[1],mem[2,3,4,5],ymm2[6],mem[7]
+; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm4
; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm8
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm8[2,2,2,2,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3],xmm4[4],xmm1[5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQ-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm4 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm7
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,3,2,1]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm7, %xmm4
-; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm6, %xmm12
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm12[0,1,2,3],xmm4[4],xmm12[5],xmm4[6,7]
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm4, %zmm24
-; AVX512DQ-FCP-NEXT: vinserti32x8 $0, %ymm3, %zmm0, %zmm24 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQ-FCP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm4 = mem[0],ymm0[1],mem[2,3,4,5],ymm0[6],mem[7]
-; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQ-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm1 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm8[2,2,2,2,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm8, %xmm29
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5,6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm6, %ymm3
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm16
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm20) | ymm3
+; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm9 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # ymm9 = mem[0,1],ymm3[2],mem[3,4],ymm3[5],mem[6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm9, %xmm1
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm9, %xmm8
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm8[2,2,2,2,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vpblendd $109, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3],ymm3[4],mem[5,6],ymm3[7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm6
+; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm6, %xmm3
+; AVX512DQ-FCP-NEXT: vpermd %zmm22, %zmm26, %zmm5
+; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm5, %xmm15
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0,1,2,3],xmm3[4],xmm15[5],xmm3[6,7]
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm22
+; AVX512DQ-FCP-NEXT: vinserti32x8 $0, %ymm10, %zmm0, %zmm22 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # ymm4 = mem[0],ymm1[1],mem[2,3,4,5],ymm1[6],mem[7]
+; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vpblendd $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5,6],mem[7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm21
; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm3, %xmm12
-; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm1, %xmm13
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1,2,3],xmm12[4],xmm13[5],xmm12[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm13
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4],ymm12[5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm13
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm15, %xmm15
-; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm10, %xmm10
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm15[1],xmm10[2,3],xmm15[4],xmm10[5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm14, %ymm14
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm26
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm20) | ymm14
-; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm11, %xmm14
-; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm9, %xmm9
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm9[0],xmm14[1],xmm9[2,3],xmm14[4],xmm9[5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm16, %xmm0
-; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm17, %xmm15
-; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm15, %xmm15
+; AVX512DQ-FCP-NEXT: vpermd %zmm23, %zmm26, %zmm1
+; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm10
+; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm1, %xmm11
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3],xmm10[4],xmm11[5],xmm10[6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm4, %ymm11
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4],ymm10[5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm11
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [10,11,6,7,2,3,14,15,10,11,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm7, %xmm7
+; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm12, %xmm12
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm7[1],xmm12[2,3],xmm7[4],xmm12[5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm0, %ymm7
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = (ymm12 & ymm20) | ymm7
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm28, %xmm0
+; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm0, %xmm7
+; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm13, %xmm13
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm7[1],xmm13[2,3],xmm7[4],xmm13[5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,2,3,6,7,10,11,14,15,2,3,14,15]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm18, %xmm0
+; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm15, %xmm15
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1,2,3],xmm0[4],xmm15[5],xmm0[6,7]
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm14, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT: vinserti32x8 $0, %ymm10, %zmm0, %zmm0 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm18, %xmm10
-; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm10, %xmm10
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm29, %xmm14
-; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm14, %xmm14
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm14[0,1,2,3],xmm10[4],xmm14[5],xmm10[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31]
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm13, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT: vinserti32x8 $0, %ymm12, %zmm0, %zmm0 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm12
+; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm12, %xmm12
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm13, %xmm13
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1,2,3],xmm12[4],xmm13[5],xmm12[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31]
; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm15
+; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm15, %ymm15
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3,4],ymm12[5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm12
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm15
; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm15, %ymm15
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3,4],ymm10[5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm10
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm15
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm11
-; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm15, %ymm15
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm25, %xmm11
-; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm11, %xmm11
-; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm5, %xmm5
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm11[1],xmm5[2,3],xmm11[4],xmm5[5,6,7]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = (ymm5 & ymm20) | ymm15
-; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm8, %xmm8
-; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3],xmm8[4],xmm2[5,6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm7, %xmm7
-; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm6, %xmm6
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm7[4],xmm6[5],xmm7[6,7]
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm6, %zmm2
-; AVX512DQ-FCP-NEXT: vinserti32x8 $0, %ymm5, %zmm0, %zmm2 {%k1}
-; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm4, %ymm4
-; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm29, %xmm14
+; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm14, %xmm14
+; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm14[1],xmm2[2,3],xmm14[4],xmm2[5,6,7]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm20) | ymm15
+; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm8, %xmm8
+; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm9, %xmm9
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3],xmm8[4],xmm9[5,6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm6
+; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm5, %xmm5
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5],xmm6[6,7]
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm5, %zmm5
+; AVX512DQ-FCP-NEXT: vinserti32x8 $0, %ymm2, %zmm0, %zmm5 {%k1}
+; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm4, %ymm2
+; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm3
+; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm1, %xmm1
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4],xmm1[5],xmm3[6,7]
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1
-; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovaps %zmm3, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovaps %zmm3, 64(%rsi)
-; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovaps %zmm3, 64(%rdx)
-; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovaps %zmm3, (%rdx)
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm23 = zmm23 ^ (zmm28 & (zmm23 ^ zmm22))
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (zmm28 & (zmm13 ^ zmm24))
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm28 & (zmm10 ^ zmm0))
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm28 & (zmm1 ^ zmm2))
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, 64(%rcx)
+; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovaps %zmm2, (%rsi)
+; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 64(%rsi)
+; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 64(%rdx)
+; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovaps %zmm2, (%rdx)
+; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} zmm2 = [18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm2 & (zmm21 ^ zmm24))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm2 & (zmm11 ^ zmm22))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (zmm2 & (zmm12 ^ zmm0))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm2 & (zmm1 ^ zmm5))
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, 64(%rcx)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm0, (%rcx)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 64(%r8)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, 64(%r8)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, (%r8)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 64(%r9)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, (%r9)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 64(%r9)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, (%r9)
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, (%rax)
-; AVX512DQ-FCP-NEXT: addq $872, %rsp # imm = 0x368
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, (%rax)
+; AVX512DQ-FCP-NEXT: addq $1320, %rsp # imm = 0x528
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
index 038c73bd9fed2..56bdda1d8e76a 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll
@@ -553,49 +553,49 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-NEXT: vmovdqa 32(%rdi), %ymm2
+; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX2-NEXT: vmovdqa (%rdi), %xmm3
; AVX2-NEXT: vmovdqa 16(%rdi), %xmm4
; AVX2-NEXT: vmovdqa 32(%rdi), %xmm5
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm5[2],xmm3[3]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm4[6],xmm1[7]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u]
; AVX2-NEXT: vmovdqa (%rdi), %xmm6
-; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],mem[2,3]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0],xmm6[1,2,3,4,5,6],xmm4[7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,0,3,2,4,5,6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3,4,5,6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7]
-; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7]
-; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm7
+; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1],xmm5[2],xmm6[3]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm4[6],xmm2[7]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],mem[2,3]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3,4,5,6],xmm4[7]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,3,2,3]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5,6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
+; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[12,13,10,11,4,5,2,3,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
+; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7
; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7]
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7]
+; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm8
; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,1,2,3]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7]
; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,6,4,6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,6,4,6,7]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7]
-; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; AVX2-NEXT: vmovq %xmm1, (%rsi)
-; AVX2-NEXT: vmovq %xmm6, (%rdx)
-; AVX2-NEXT: vmovq %xmm3, (%rcx)
-; AVX2-NEXT: vmovq %xmm4, (%r8)
-; AVX2-NEXT: vmovq %xmm5, (%r9)
+; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX2-NEXT: vmovq %xmm2, (%rsi)
+; AVX2-NEXT: vmovq %xmm3, (%rdx)
+; AVX2-NEXT: vmovq %xmm4, (%rcx)
+; AVX2-NEXT: vmovq %xmm5, (%r8)
+; AVX2-NEXT: vmovq %xmm6, (%r9)
; AVX2-NEXT: vmovq %xmm7, (%r10)
; AVX2-NEXT: vmovq %xmm0, (%rax)
; AVX2-NEXT: vzeroupper
@@ -703,20 +703,20 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512: # %bb.0:
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512-NEXT: vmovdqa (%rdi), %xmm2
-; AVX512-NEXT: vmovdqa 16(%rdi), %xmm3
-; AVX512-NEXT: vmovdqa 32(%rdi), %xmm4
-; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm4[2],xmm2[3]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm3[6],xmm0[7]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vmovdqa (%rdi), %xmm1
+; AVX512-NEXT: vmovdqa 16(%rdi), %xmm2
+; AVX512-NEXT: vmovdqa 32(%rdi), %xmm3
+; AVX512-NEXT: vmovdqa (%rdi), %xmm4
+; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm3[2],xmm4[3]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm2[6],xmm0[7]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3,4,5,6],xmm3[7]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5,6],xmm2[7]
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3,4,5,6,7]
; AVX512-NEXT: vmovdqa (%rdi), %ymm3
; AVX512-NEXT: vmovdqa 32(%rdi), %ymm4
; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7]
@@ -805,20 +805,20 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm2
-; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm3
-; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm4
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm4[2],xmm2[3]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm3[6],xmm0[7]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1
+; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm2
+; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm3
+; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm4
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm3[2],xmm4[3]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm2[6],xmm0[7]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,12,13,10,11,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3,4,5,6],xmm3[7]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5,6],xmm2[7]
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,3]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,u,u,0,1,14,15,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3,4,5,6,7]
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm3
; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm4
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7]
@@ -1283,8 +1283,7 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,5,4,7,7]
; AVX-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,2,2,3,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,0,2]
-; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3],xmm13[4,5,6,7]
+; AVX-NEXT: vshufps {{.*#+}} xmm12 = xmm12[0,1],xmm13[0,2]
; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,1,0,3]
@@ -1303,7 +1302,7 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX-NEXT: vmovdqa %xmm9, (%rcx)
; AVX-NEXT: vmovdqa %xmm10, (%r8)
; AVX-NEXT: vmovdqa %xmm11, (%r9)
-; AVX-NEXT: vmovdqa %xmm12, (%r10)
+; AVX-NEXT: vmovaps %xmm12, (%r10)
; AVX-NEXT: vmovdqa %xmm0, (%rax)
; AVX-NEXT: retq
;
@@ -1413,20 +1412,24 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FP-NEXT: vpshufb %xmm6, %xmm7, %xmm6
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4],xmm5[5,6,7]
; AVX2-FP-NEXT: vmovdqa 96(%rdi), %xmm7
-; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm9
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm8 = xmm7[0,1],xmm9[2,3]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2],xmm2[3],xmm8[4,5,6,7]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7]
-; AVX2-FP-NEXT: vextracti128 $1, %ymm10, %xmm11
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3,4,5],xmm10[6],xmm11[7]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1],xmm6[2,3,4],xmm10[5,6,7]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,3,0,1,14,15,12,13,10,11,8,9,6,7,4,5]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm2[4],xmm8[5,6,7]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7]
-; AVX2-FP-NEXT: vextracti128 $1, %ymm10, %xmm11
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3,4,5],xmm11[6],xmm10[7]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2],xmm8[3,4,5],xmm10[6,7]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,5,2,3,0,1,14,15,12,13,10,11,8,9,6,7]
+; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm8
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm9 = xmm7[0,1],xmm8[2,3]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1,2],xmm2[3],xmm9[4,5,6,7]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm10 = [2,3,0,1,14,15,12,13,10,11,8,9,6,7,4,5]
+; AVX2-FP-NEXT: vpshufb %xmm10, %xmm6, %xmm6
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm11, %xmm12
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2,3,4,5],xmm11[6],xmm12[7]
+; AVX2-FP-NEXT: vpshufb %xmm10, %xmm11, %xmm10
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1,2,3,4],xmm6[5,6,7]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm2[4],xmm9[5,6,7]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm10 = [4,5,2,3,0,1,14,15,12,13,10,11,8,9,6,7]
+; AVX2-FP-NEXT: vpshufb %xmm10, %xmm9, %xmm9
+; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7]
+; AVX2-FP-NEXT: vextracti128 $1, %ymm11, %xmm12
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3,4,5],xmm12[6],xmm11[7]
+; AVX2-FP-NEXT: vpshufb %xmm10, %xmm11, %xmm10
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3,4],xmm9[5,6,7]
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm11 = xmm1[0,1],xmm0[2],xmm1[3]
; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm12 = [6,7,4,5,2,3,0,1,14,15,8,9,6,7,0,1]
@@ -1448,15 +1451,15 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm12, %xmm14
; AVX2-FP-NEXT: vpshufb %xmm13, %xmm14, %xmm13
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
-; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm9[0,1],xmm7[2,3]
+; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm7[1,2,3,4,5,6],xmm2[7]
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13]
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm12[0,1],xmm7[2,3]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7]
; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,4,6,7]
; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
@@ -1465,7 +1468,7 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
; AVX2-FP-NEXT: vmovdqa %xmm5, (%rsi)
; AVX2-FP-NEXT: vmovdqa %xmm6, (%rdx)
-; AVX2-FP-NEXT: vmovdqa %xmm8, (%rcx)
+; AVX2-FP-NEXT: vmovdqa %xmm9, (%rcx)
; AVX2-FP-NEXT: vmovdqa %xmm10, (%r8)
; AVX2-FP-NEXT: vmovdqa %xmm11, (%r9)
; AVX2-FP-NEXT: vmovdqa %xmm7, (%r10)
@@ -1492,20 +1495,24 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm6
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4],xmm5[5,6,7]
; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm7
-; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm9
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm7[0,1],xmm9[2,3]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2],xmm2[3],xmm8[4,5,6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7]
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm10, %xmm11
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3,4,5],xmm10[6],xmm11[7]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1],xmm6[2,3,4],xmm10[5,6,7]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,3,0,1,14,15,12,13,10,11,8,9,6,7,4,5]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm2[4],xmm8[5,6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7]
-; AVX2-FCP-NEXT: vextracti128 $1, %ymm10, %xmm11
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3,4,5],xmm11[6],xmm10[7]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2],xmm8[3,4,5],xmm10[6,7]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,5,2,3,0,1,14,15,12,13,10,11,8,9,6,7]
+; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm8
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm7[0,1],xmm8[2,3]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1,2],xmm2[3],xmm9[4,5,6,7]
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [2,3,0,1,14,15,12,13,10,11,8,9,6,7,4,5]
+; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm6, %xmm6
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7]
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2,3,4,5],xmm11[6],xmm12[7]
+; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm11, %xmm10
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1,2,3,4],xmm6[5,6,7]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm2[4],xmm9[5,6,7]
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [4,5,2,3,0,1,14,15,12,13,10,11,8,9,6,7]
+; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm9, %xmm9
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7]
+; AVX2-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3,4,5],xmm12[6],xmm11[7]
+; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm11, %xmm10
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3,4],xmm9[5,6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm1[0,1],xmm0[2],xmm1[3]
; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7]
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [6,7,4,5,2,3,0,1,14,15,8,9,6,7,0,1]
@@ -1527,15 +1534,15 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7]
; AVX2-FCP-NEXT: vextracti128 $1, %ymm12, %xmm14
; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm14, %xmm13
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm9[0,1],xmm7[2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm7[1,2,3,4,5,6],xmm2[7]
; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm12[0,1],xmm7[2,3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7]
; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7]
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,4,6,7]
; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
@@ -1544,7 +1551,7 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
; AVX2-FCP-NEXT: vmovdqa %xmm5, (%rsi)
; AVX2-FCP-NEXT: vmovdqa %xmm6, (%rdx)
-; AVX2-FCP-NEXT: vmovdqa %xmm8, (%rcx)
+; AVX2-FCP-NEXT: vmovdqa %xmm9, (%rcx)
; AVX2-FCP-NEXT: vmovdqa %xmm10, (%r8)
; AVX2-FCP-NEXT: vmovdqa %xmm11, (%r9)
; AVX2-FCP-NEXT: vmovdqa %xmm7, (%r10)
@@ -1641,75 +1648,79 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm0
-; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm1
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1,2,3]
-; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm2
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,14,15,12,13,10,11,8,9,12,13,10,11,4,5]
-; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3
-; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm4
-; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm5
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7]
+; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [8,1,2,3]
+; AVX512-FCP-NEXT: vpermd %zmm0, %zmm1, %zmm2
+; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm1
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,14,15,12,13,10,11,8,9,12,13,10,11,4,5]
+; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm2
+; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm3
+; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm4
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm7[4],xmm6[5],xmm7[6],xmm6[7]
+; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm6, %xmm5
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3,4],xmm2[5,6,7]
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm5 = [4,1]
+; AVX512-FCP-NEXT: vpermq %zmm0, %zmm5, %zmm6
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm1[3],xmm6[4,5,6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7]
; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm8[4],xmm7[5],xmm8[6],xmm7[7]
-; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm6
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3,4],xmm3[5,6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm0[0,1],xmm1[2,3]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm2[3],xmm7[4,5,6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3,4,5],xmm8[6],xmm9[7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3,4],xmm8[5,6,7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,3,0,1,14,15,12,13,10,11,8,9,6,7,4,5]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm2[4],xmm7[5,6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3,4,5],xmm9[6],xmm8[7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4,5],xmm8[6,7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[4,5,2,3,0,1,14,15,12,13,10,11,8,9,6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm1[0,1],xmm0[2],xmm1[3]
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [6,7,4,5,2,3,0,1,14,15,8,9,6,7,0,1]
-; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm8, %xmm8
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2],xmm11[3],xmm12[4,5,6,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3,4,5],xmm7[6],xmm8[7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3,4],xmm7[5,6,7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,0,1,14,15,12,13,10,11,8,9,6,7,4,5]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm1[4],xmm6[5,6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2,3,4,5],xmm8[6],xmm7[7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3,4,5],xmm7[6,7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,5,2,3,0,1,14,15,12,13,10,11,8,9,6,7]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,10,3,4,5,6,7]
+; AVX512-FCP-NEXT: vpermd %zmm0, %zmm7, %zmm8
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [6,7,4,5,2,3,0,1,14,15,8,9,6,7,0,1]
+; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm7, %xmm7
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm11
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3],xmm11[4,5,6,7]
+; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm10, %xmm9
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1,2,3,4],xmm7[5,6,7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5],xmm1[6],xmm8[7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm9, %xmm11
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm9
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[3,1,2,3,4,5,6,7]
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm11
; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm11, %xmm10
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2,3,4],xmm8[5,6,7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5],xmm2[6],xmm9[7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
-; AVX512-FCP-NEXT: vmovd {{.*#+}} xmm11 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm10, %xmm12
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm10
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm12
-; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm12, %xmm11
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm1[0,1],xmm0[2,3]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm2[0],xmm11[1,2,3,4,5,6],xmm2[7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,4,6,7]
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm10 = [0,5]
+; AVX512-FCP-NEXT: vpermq %zmm0, %zmm10, %zmm10
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm1[0],xmm10[1,2,3,4,5,6],xmm1[7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1],xmm10[2,3]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,4,6,7]
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [8,1,10,11]
+; AVX512-FCP-NEXT: vpermd %zmm0, %zmm4, %zmm0
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3]
-; AVX512-FCP-NEXT: vmovdqa %xmm3, (%rsi)
-; AVX512-FCP-NEXT: vmovdqa %xmm6, (%rdx)
-; AVX512-FCP-NEXT: vmovdqa %xmm7, (%rcx)
-; AVX512-FCP-NEXT: vmovdqa %xmm8, (%r8)
-; AVX512-FCP-NEXT: vmovdqa %xmm9, (%r9)
-; AVX512-FCP-NEXT: vmovdqa %xmm10, (%r10)
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
+; AVX512-FCP-NEXT: vmovdqa %xmm2, (%rsi)
+; AVX512-FCP-NEXT: vmovdqa %xmm5, (%rdx)
+; AVX512-FCP-NEXT: vmovdqa %xmm6, (%rcx)
+; AVX512-FCP-NEXT: vmovdqa %xmm7, (%r8)
+; AVX512-FCP-NEXT: vmovdqa %xmm8, (%r9)
+; AVX512-FCP-NEXT: vmovdqa %xmm9, (%r10)
; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rax)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
@@ -1803,75 +1814,79 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm0
-; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm1
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm1[1,2,3]
-; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm2
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,14,15,12,13,10,11,8,9,12,13,10,11,4,5]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm4
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm5
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [8,1,2,3]
+; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm1, %zmm2
+; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm1
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,14,15,12,13,10,11,8,9,12,13,10,11,4,5]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm3
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm4
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm7[4],xmm6[5],xmm7[6],xmm6[7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm6, %xmm5
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3,4],xmm2[5,6,7]
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm5 = [4,1]
+; AVX512DQ-FCP-NEXT: vpermq %zmm0, %zmm5, %zmm6
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm1[3],xmm6[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7]
; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm8[4],xmm7[5],xmm8[6],xmm7[7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm6
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3,4],xmm3[5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm0[0,1],xmm1[2,3]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm2[3],xmm7[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3,4,5],xmm8[6],xmm9[7]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3,4],xmm8[5,6,7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,3,0,1,14,15,12,13,10,11,8,9,6,7,4,5]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm2[4],xmm7[5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3,4,5],xmm9[6],xmm8[7]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3,4,5],xmm8[6,7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[4,5,2,3,0,1,14,15,12,13,10,11,8,9,6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm1[0,1],xmm0[2],xmm1[3]
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [6,7,4,5,2,3,0,1,14,15,8,9,6,7,0,1]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm8, %xmm8
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2],xmm11[3],xmm12[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3,4,5],xmm7[6],xmm8[7]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3,4],xmm7[5,6,7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,0,1,14,15,12,13,10,11,8,9,6,7,4,5]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm1[4],xmm6[5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2,3,4,5],xmm8[6],xmm7[7]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3,4,5],xmm7[6,7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,5,2,3,0,1,14,15,12,13,10,11,8,9,6,7]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,10,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm7, %zmm8
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [6,7,4,5,2,3,0,1,14,15,8,9,6,7,0,1]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm7, %xmm7
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm11
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3],xmm11[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm10, %xmm9
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1,2,3,4],xmm7[5,6,7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5],xmm1[6],xmm8[7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm9, %xmm11
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm9, %xmm9
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[3,1,2,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm9, %xmm11
; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm11, %xmm10
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2,3,4],xmm8[5,6,7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5],xmm2[6],xmm9[7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
-; AVX512DQ-FCP-NEXT: vmovd {{.*#+}} xmm11 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm10, %xmm12
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm10
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm12
-; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm12, %xmm11
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm1[0,1],xmm0[2,3]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm2[0],xmm11[1,2,3,4,5,6],xmm2[7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,4,6,7]
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm10 = [0,5]
+; AVX512DQ-FCP-NEXT: vpermq %zmm0, %zmm10, %zmm10
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm1[0],xmm10[1,2,3,4,5,6],xmm1[7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1],xmm10[2,3]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,4,6,7]
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [8,1,10,11]
+; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm4, %zmm0
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3]
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm7, (%rcx)
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm8, (%r8)
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm9, (%r9)
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm10, (%r10)
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, (%rsi)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, (%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, (%rcx)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm7, (%r8)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm8, (%r9)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm9, (%r10)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rax)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
@@ -2820,9 +2835,9 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1],xmm11[2],xmm12[3],xmm11[4],xmm12[5,6,7]
; AVX2-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,8,9,6,7,4,5,2,3,u,u,u,u,u,u]
; AVX2-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX2-NEXT: vmovdqa 208(%rdi), %xmm12
-; AVX2-NEXT: vmovdqa 192(%rdi), %xmm13
-; AVX2-NEXT: vpblendw {{.*#+}} xmm14 = xmm13[0,1,2,3,4,5],xmm12[6],xmm13[7]
+; AVX2-NEXT: vmovdqa 192(%rdi), %xmm12
+; AVX2-NEXT: vmovdqa 208(%rdi), %xmm13
+; AVX2-NEXT: vpblendw {{.*#+}} xmm14 = xmm12[0,1,2,3,4,5],xmm13[6],xmm12[7]
; AVX2-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,0,3]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,4,7,6]
; AVX2-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
@@ -2844,7 +2859,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7]
; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3],xmm5[4],xmm4[5],xmm5[6,7]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0],xmm13[1],xmm12[2,3,4,5,6,7]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm13[0],xmm12[1],xmm13[2,3,4,5,6,7]
; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,10,11,8,9,6,7,4,5,u,u,u,u,u,u]
; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,3]
@@ -2965,7 +2980,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1,2,3,4,5,6,7],ymm11[8],ymm10[9,10,11,12,13,14,15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
-; AVX2-FP-NEXT: vmovd {{.*#+}} xmm11 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm11 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-FP-NEXT: vpshufb %xmm11, %xmm13, %xmm14
; AVX2-FP-NEXT: vextracti128 $1, %ymm13, %xmm13
; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[3,1,2,3,4,5,6,7]
@@ -2989,9 +3004,9 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm12[0,1],xmm4[2],xmm12[3],xmm4[4],xmm12[5,6,7]
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,8,9,6,7,4,5,2,3,u,u,u,u,u,u]
; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX2-FP-NEXT: vmovdqa 208(%rdi), %xmm12
-; AVX2-FP-NEXT: vmovdqa 192(%rdi), %xmm13
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm14 = xmm13[0,1,2,3,4,5],xmm12[6],xmm13[7]
+; AVX2-FP-NEXT: vmovdqa 192(%rdi), %xmm12
+; AVX2-FP-NEXT: vmovdqa 208(%rdi), %xmm13
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm14 = xmm12[0,1,2,3,4,5],xmm13[6],xmm12[7]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29]
; AVX2-FP-NEXT: vpshufb %xmm15, %xmm14, %xmm14
; AVX2-FP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
@@ -3003,7 +3018,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm14, %xmm15
; AVX2-FP-NEXT: vpshufb %xmm11, %xmm15, %xmm11
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3]
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0],ymm4[1,2,3,4,5,6,7],ymm7[8],ymm4[9,10,11,12,13,14,15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm11[0,1],xmm7[2,3]
@@ -3013,7 +3028,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3],xmm5[4],xmm4[5],xmm5[6,7]
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,10,11,8,9,6,7,4,5,u,u,u,u,u,u]
; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0],xmm13[1],xmm12[2,3,4,5,6,7]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm13[0],xmm12[1],xmm13[2,3,4,5,6,7]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31]
; AVX2-FP-NEXT: vpshufb %xmm6, %xmm5, %xmm5
; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
@@ -3024,7 +3039,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: vpshufb %ymm6, %ymm2, %ymm2
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,10,11,6,7,4,5,6,7]
; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7]
; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm4[1,2,3,4,5,6,7],ymm2[8],ymm4[9,10,11,12,13,14,15]
@@ -3132,7 +3147,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm10[0],ymm12[1,2,3,4,5,6,7],ymm10[8],ymm12[9,10,11,12,13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
-; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm13 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm12, %xmm14
; AVX2-FCP-NEXT: vextracti128 $1, %ymm12, %xmm12
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,1,2,3,4,5,6,7]
@@ -3168,7 +3183,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
; AVX2-FCP-NEXT: vextracti128 $1, %ymm15, %xmm8
; AVX2-FCP-NEXT: vpshufb %xmm13, %xmm8, %xmm8
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm15[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm15[10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3]
; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm14[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29]
; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm13[0],ymm4[1,2,3,4,5,6,7],ymm13[8],ymm4[9,10,11,12,13,14,15]
@@ -3191,7 +3206,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm2
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7]
; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,10,11,6,7,u,u,u,u]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,10,11,6,7,4,5,6,7]
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7]
; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm4[1,2,3,4,5,6,7],ymm2[8],ymm4[9,10,11,12,13,14,15]
@@ -3384,150 +3399,149 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
;
; AVX512-FCP-LABEL: load_i16_stride7_vf16:
; AVX512-FCP: # %bb.0:
+; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [2,6,9,13,2,6,9,13]
-; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [2,5,9,12,2,5,9,12]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [10,3,6,15,12,13,6,15]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,6,9,0,13,0,0,0]
-; AVX512-FCP-NEXT: vpermd %zmm0, %zmm3, %zmm9
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,5,9,0,12,0,0,0]
-; AVX512-FCP-NEXT: vpermd %zmm0, %zmm3, %zmm7
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [8,1,12,5,12,5,14,15]
-; AVX512-FCP-NEXT: vpermd %zmm0, %zmm3, %zmm3
+; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1
+; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm2
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm2[0,1,0,2]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,4,5,6,7,8,9,10,11,4,5,10,11,16,17,18,19,20,21,22,23,24,25,26,27,20,21,26,27]
+; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm6, %ymm3
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [3,6,10,13,3,6,10,13]
-; AVX512-FCP-NEXT: vpermd %zmm2, %zmm4, %zmm4
-; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm5
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm5[0,1,0,2]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
-; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm14, %ymm6
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1,2,3,4,5,6],ymm6[7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpermd %zmm1, %zmm4, %zmm4
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [8,1,12,5,12,5,14,15]
+; AVX512-FCP-NEXT: vpermd %zmm0, %zmm4, %zmm4
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[6,7,12,13,2,3,16,17,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm4
-; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm6
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1],ymm6[2],ymm4[3,4,5],ymm6[6],ymm4[7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm15
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3],xmm15[4],xmm13[5],xmm15[6],xmm13[7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %ymm3, %ymm13, %ymm3
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm12[5,6,7]
-; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm12
-; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm13
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0,1],ymm12[2],ymm13[3,4,5],ymm12[6],ymm13[7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm15, %xmm1
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1,2,3],xmm1[4],xmm15[5],xmm1[6],xmm15[7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u]
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm14[7]
+; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm5
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm10[4],xmm9[5],xmm10[6],xmm9[7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %ymm7, %ymm9, %ymm7
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4],ymm3[5,6,7]
+; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm10
+; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm11
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1],ymm10[2],ymm11[3,4,5],ymm10[6],ymm11[7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm9
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm9[4],xmm7[5],xmm9[6],xmm7[7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u]
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5,6],ymm6[7]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [2,5,9,0,12,0,0,0]
+; AVX512-FCP-NEXT: vpermd %zmm0, %zmm7, %zmm7
; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm4[0,1,2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3,4,5],xmm14[6],xmm15[7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %ymm7, %ymm14, %ymm7
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm1[1,2,3,4,5,6,7],ymm7[8],ymm1[9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm14
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0],xmm1[1],xmm14[2,3,4,5],xmm1[6],xmm14[7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u]
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm14 = [2,5,2,5,2,5,2,5]
-; AVX512-FCP-NEXT: vpermd %ymm5, %ymm14, %ymm14
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm14[7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm12
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm12[0],xmm9[1],xmm12[2,3,4,5],xmm9[6],xmm12[7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %ymm7, %ymm9, %ymm7
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6,7],ymm7[8],ymm6[9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm9
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3,4,5],xmm7[6],xmm9[7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u]
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm2[0,1,1,2]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,30,31]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm9[7]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [2,6,9,0,13,0,0,0]
+; AVX512-FCP-NEXT: vpermd %zmm0, %zmm9, %zmm9
; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3,4,5],xmm13[6],xmm12[7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %ymm9, %ymm12, %ymm9
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0],ymm7[1,2,3,4,5,6,7],ymm9[8],ymm7[9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5,6,7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm12
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm12[1],xmm9[2,3,4,5],xmm12[6],xmm9[7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u]
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm2[0,1,1,3]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm13[7]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [10,3,6,15,12,13,6,15]
+; AVX512-FCP-NEXT: vpermd %zmm0, %zmm13, %zmm13
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7]
; AVX512-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1],xmm14[2,3,4,5],xmm15[6],xmm14[7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %ymm9, %ymm14, %ymm9
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0],ymm1[1,2,3,4,5,6,7],ymm9[8],ymm1[9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5,6,7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm14
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm14[1],xmm1[2,3,4,5],xmm14[6],xmm1[7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u]
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm5[0,1,1,3]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm14[7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm14, %xmm8
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm14[1],xmm8[2],xmm14[3],xmm8[4,5,6,7]
-; AVX512-FCP-NEXT: vpermd %zmm0, %zmm10, %zmm10
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %ymm10, %ymm8, %ymm8
-; AVX512-FCP-NEXT: vpermd %zmm2, %zmm17, %zmm14
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,3,7,10,14,0,0,0]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1,2,3,4,5,6,7],ymm8[8],ymm1[9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT: vpermd %zmm0, %zmm10, %zmm1
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0],ymm13[1],ymm12[2,3,4],ymm13[5],ymm12[6,7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm12
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0],xmm10[1],xmm12[2],xmm10[3],xmm12[4,5,6,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2],xmm14[3],xmm15[4,5,6,7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %ymm13, %ymm14, %ymm13
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm13[0],ymm9[1,2,3,4,5,6,7],ymm13[8],ymm9[9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm11
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3],xmm11[4,5,6,7]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u]
; AVX512-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm15, %ymm11
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm11, %xmm13
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm11, %xmm11
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
-; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm1, %ymm1
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm1[2,3,4,5,6,7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm10[1,2,3,4,5,6,7],ymm1[8],ymm10[9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm10[4,5,6,7]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,3,3,0,0,3,7,0]
-; AVX512-FCP-NEXT: vpermd %ymm5, %ymm1, %ymm1
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,0,1,6,7,8,9,14,15,0,1,6,7,8,9,16,17,16,17,22,23,24,25,30,31,16,17,22,23,24,25]
-; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm1, %ymm1
-; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm14, %ymm13
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0,1,2,3,4],ymm1[5,6,7],ymm13[8,9,10,11,12],ymm1[13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm14
-; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm14, %xmm12
-; AVX512-FCP-NEXT: vpermd %zmm2, %zmm16, %zmm2
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [0,4,7,11,14,0,0,0]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
-; AVX512-FCP-NEXT: vpermd %zmm0, %zmm14, %zmm13
+; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm12, %ymm8
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3,4,5,6],ymm8[7]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,3,7,10,14,0,0,0]
+; AVX512-FCP-NEXT: vpermd %zmm0, %zmm10, %zmm11
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
+; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm11, %ymm11
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm12, %xmm14
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm12
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,1,2,3,4,5,6,7]
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3,4,5,6,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0],ymm8[1,2,3,4,5,6,7],ymm11[8],ymm8[9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,3,3,0,0,3,7,0]
+; AVX512-FCP-NEXT: vpermd %ymm2, %ymm11, %ymm11
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,0,1,6,7,8,9,14,15,0,1,6,7,8,9,16,17,16,17,22,23,24,25,30,31,16,17,22,23,24,25]
+; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm11, %ymm11
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [2,5,9,12,2,5,9,12]
+; AVX512-FCP-NEXT: vpermd %zmm1, %zmm14, %zmm14
+; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm14, %ymm12
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5,6,7],ymm12[8,9,10,11,12],ymm11[13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm14
+; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm14, %xmm13
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,4,7,11,14,0,0,0]
+; AVX512-FCP-NEXT: vpermd %zmm0, %zmm13, %zmm13
; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3,4,5,6,7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0],ymm1[1,2,3,4,5,6,7],ymm12[8],ymm1[9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1,2,3,4,5,6,7],ymm12[8],ymm11[9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7]
; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [0,4,7,0,0,4,7,0]
; AVX512-FCP-NEXT: # ymm12 = mem[0,1,0,1]
-; AVX512-FCP-NEXT: vpermd %ymm5, %ymm12, %ymm5
+; AVX512-FCP-NEXT: vpermd %ymm2, %ymm12, %ymm2
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [4,5,2,3,4,5,10,11,12,13,2,3,4,5,10,11,20,21,18,19,20,21,26,27,28,29,18,19,20,21,26,27]
-; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm5, %ymm5
; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm2
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [1,4,8,11,15,0,0,0]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7],ymm2[8,9,10,11,12],ymm5[13,14,15]
-; AVX512-FCP-NEXT: vpermd %zmm0, %zmm12, %zmm0
-; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,4,6,7]
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [2,6,9,13,2,6,9,13]
+; AVX512-FCP-NEXT: vpermd %zmm1, %zmm13, %zmm1
+; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm1, %ymm1
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,4,8,11,15,0,0,0]
+; AVX512-FCP-NEXT: vpermd %zmm0, %zmm2, %zmm0
+; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7]
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512-FCP-NEXT: vmovdqa %ymm3, (%rsi)
-; AVX512-FCP-NEXT: vmovdqa %ymm7, (%rdx)
-; AVX512-FCP-NEXT: vmovdqa %ymm9, (%rcx)
-; AVX512-FCP-NEXT: vmovdqa %ymm8, (%r8)
-; AVX512-FCP-NEXT: vmovdqa %ymm10, (%r9)
-; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT: vmovdqa %ymm1, (%rax)
-; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-FCP-NEXT: vmovdqa %ymm6, (%rdx)
+; AVX512-FCP-NEXT: vmovdqa %ymm7, (%rcx)
+; AVX512-FCP-NEXT: vmovdqa %ymm9, (%r8)
+; AVX512-FCP-NEXT: vmovdqa %ymm8, (%r9)
+; AVX512-FCP-NEXT: vmovdqa %ymm11, (%r10)
; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rax)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
@@ -3705,150 +3719,149 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
;
; AVX512DQ-FCP-LABEL: load_i16_stride7_vf16:
; AVX512DQ-FCP: # %bb.0:
+; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [2,6,9,13,2,6,9,13]
-; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [2,5,9,12,2,5,9,12]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [10,3,6,15,12,13,6,15]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,6,9,0,13,0,0,0]
-; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm3, %zmm9
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,5,9,0,12,0,0,0]
-; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm3, %zmm7
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [8,1,12,5,12,5,14,15]
-; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm3, %zmm3
+; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1
+; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm2
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm2[0,1,0,2]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,4,5,6,7,8,9,10,11,4,5,10,11,16,17,18,19,20,21,22,23,24,25,26,27,20,21,26,27]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm6, %ymm3
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [3,6,10,13,3,6,10,13]
-; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm4, %zmm4
-; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm5
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm5[0,1,0,2]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm14, %ymm6
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1,2,3,4,5,6],ymm6[7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm4, %zmm4
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [8,1,12,5,12,5,14,15]
+; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm4, %zmm4
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[6,7,12,13,2,3,16,17,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm4
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm6
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1],ymm6[2],ymm4[3,4,5],ymm6[6],ymm4[7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm15
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3],xmm15[4],xmm13[5],xmm15[6],xmm13[7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %ymm3, %ymm13, %ymm3
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm12[5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm12
-; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm13
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0,1],ymm12[2],ymm13[3,4,5],ymm12[6],ymm13[7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm15, %xmm1
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1,2,3],xmm1[4],xmm15[5],xmm1[6],xmm15[7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm14[7]
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm5
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm10[4],xmm9[5],xmm10[6],xmm9[7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %ymm7, %ymm9, %ymm7
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4],ymm3[5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm10
+; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm11
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1],ymm10[2],ymm11[3,4,5],ymm10[6],ymm11[7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm9
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm9[4],xmm7[5],xmm9[6],xmm7[7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5,6],ymm6[7]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [2,5,9,0,12,0,0,0]
+; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm7, %zmm7
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm4[0,1,2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3,4,5],xmm14[6],xmm15[7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %ymm7, %ymm14, %ymm7
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm1[1,2,3,4,5,6,7],ymm7[8],ymm1[9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm14
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0],xmm1[1],xmm14[2,3,4,5],xmm1[6],xmm14[7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm14 = [2,5,2,5,2,5,2,5]
-; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm14, %ymm14
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm14[7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm9, %xmm12
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm12[0],xmm9[1],xmm12[2,3,4,5],xmm9[6],xmm12[7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %ymm7, %ymm9, %ymm7
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6,7],ymm7[8],ymm6[9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm9
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0],xmm7[1],xmm9[2,3,4,5],xmm7[6],xmm9[7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm2[0,1,1,2]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,30,31]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm9[7]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [2,6,9,0,13,0,0,0]
+; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm9, %zmm9
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3,4,5],xmm13[6],xmm12[7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %ymm9, %ymm12, %ymm9
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0],ymm7[1,2,3,4,5,6,7],ymm9[8],ymm7[9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5,6,7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm9, %xmm12
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm12[1],xmm9[2,3,4,5],xmm12[6],xmm9[7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm2[0,1,1,3]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm13[7]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [10,3,6,15,12,13,6,15]
+; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm13, %zmm13
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7]
; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1],xmm14[2,3,4,5],xmm15[6],xmm14[7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %ymm9, %ymm14, %ymm9
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0],ymm1[1,2,3,4,5,6,7],ymm9[8],ymm1[9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5,6,7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm14
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm14[1],xmm1[2,3,4,5],xmm14[6],xmm1[7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm5[0,1,1,3]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm14[7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm14, %xmm8
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm14[1],xmm8[2],xmm14[3],xmm8[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm10, %zmm10
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %ymm10, %ymm8, %ymm8
-; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm17, %zmm14
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,3,7,10,14,0,0,0]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm1[1,2,3,4,5,6,7],ymm8[8],ymm1[9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm10, %zmm1
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0],ymm13[1],ymm12[2,3,4],ymm13[5],ymm12[6,7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm12
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0],xmm10[1],xmm12[2],xmm10[3],xmm12[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2],xmm14[3],xmm15[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %ymm13, %ymm14, %ymm13
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm13[0],ymm9[1,2,3,4,5,6,7],ymm13[8],ymm9[9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2,3,4],ymm11[5],ymm10[6,7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm11
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3],xmm11[4,5,6,7]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u]
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm15, %ymm11
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm11, %xmm13
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm11
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm1[2,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm10[1,2,3,4,5,6,7],ymm1[8],ymm10[9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm10[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,3,3,0,0,3,7,0]
-; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,0,1,6,7,8,9,14,15,0,1,6,7,8,9,16,17,16,17,22,23,24,25,30,31,16,17,22,23,24,25]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm14, %ymm13
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0,1,2,3,4],ymm1[5,6,7],ymm13[8,9,10,11,12],ymm1[13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm6[0,1],ymm4[2,3],ymm6[4,5],ymm4[6,7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm14
-; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm14, %xmm12
-; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm16, %zmm2
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [0,4,7,11,14,0,0,0]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
-; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm14, %zmm13
+; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm12, %ymm8
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3,4,5,6],ymm8[7]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,3,7,10,14,0,0,0]
+; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm10, %zmm11
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm11, %ymm11
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm12, %xmm14
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm12
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,1,2,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0],ymm8[1,2,3,4,5,6,7],ymm11[8],ymm8[9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,3,3,0,0,3,7,0]
+; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm11, %ymm11
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,0,1,6,7,8,9,14,15,0,1,6,7,8,9,16,17,16,17,22,23,24,25,30,31,16,17,22,23,24,25]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm11, %ymm11
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [2,5,9,12,2,5,9,12]
+; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm14, %zmm14
+; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm14, %ymm12
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5,6,7],ymm12[8,9,10,11,12],ymm11[13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm14
+; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm14, %xmm13
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,4,7,11,14,0,0,0]
+; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm13, %zmm13
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0],ymm1[1,2,3,4,5,6,7],ymm12[8],ymm1[9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1,2,3,4,5,6,7],ymm12[8],ymm11[9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7]
; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [0,4,7,0,0,4,7,0]
; AVX512DQ-FCP-NEXT: # ymm12 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm12, %ymm5
+; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm12, %ymm2
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [4,5,2,3,4,5,10,11,12,13,2,3,4,5,10,11,20,21,18,19,20,21,26,27,28,29,18,19,20,21,26,27]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm5, %ymm5
; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [1,4,8,11,15,0,0,0]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7],ymm2[8,9,10,11,12],ymm5[13,14,15]
-; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm12, %zmm0
-; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,4,6,7]
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [2,6,9,13,2,6,9,13]
+; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm13, %zmm1
+; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,4,8,11,15,0,0,0]
+; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm2, %zmm0
+; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7]
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, (%rcx)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, (%r8)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, (%r9)
-; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, (%rax)
-; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, (%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, (%rcx)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, (%r8)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, (%r9)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm11, (%r10)
; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rax)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
@@ -5839,9 +5852,9 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3]
; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2
-; AVX2-NEXT: vmovdqa 432(%rdi), %xmm3
-; AVX2-NEXT: vmovdqa 416(%rdi), %xmm0
-; AVX2-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,5],xmm3[6],xmm0[7]
+; AVX2-NEXT: vmovdqa 416(%rdi), %xmm3
+; AVX2-NEXT: vmovdqa 432(%rdi), %xmm0
+; AVX2-NEXT: vpblendw {{.*#+}} xmm7 = xmm3[0,1,2,3,4,5],xmm0[6],xmm3[7]
; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,7,6]
; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
@@ -5864,9 +5877,9 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1],xmm1[2],xmm7[3],xmm1[4],xmm7[5,6,7]
; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm1
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5
-; AVX2-NEXT: vmovdqa 208(%rdi), %xmm11
-; AVX2-NEXT: vmovdqa 192(%rdi), %xmm1
-; AVX2-NEXT: vpblendw {{.*#+}} xmm12 = xmm1[0,1,2,3,4,5],xmm11[6],xmm1[7]
+; AVX2-NEXT: vmovdqa 192(%rdi), %xmm11
+; AVX2-NEXT: vmovdqa 208(%rdi), %xmm1
+; AVX2-NEXT: vpblendw {{.*#+}} xmm12 = xmm11[0,1,2,3,4,5],xmm1[6],xmm11[7]
; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,0,3]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,7,6]
; AVX2-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
@@ -5896,7 +5909,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3],ymm4[4,5],ymm6[6,7]
; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm6
; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3],xmm6[4],xmm4[5],xmm6[6,7]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5,6,7]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3,4,5,6,7]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5]
; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm4
; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
@@ -5922,7 +5935,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm8
; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2],xmm4[3],xmm8[4],xmm4[5],xmm8[6,7]
; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm3
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm11[0],xmm1[1],xmm11[2,3,4,5,6,7]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm11[1],xmm1[2,3,4,5,6,7]
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
@@ -6183,7 +6196,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm2 = ymm15[0,1],mem[2],ymm15[3,4],mem[5],ymm15[6,7]
-; AVX2-FP-NEXT: vmovd {{.*#+}} xmm14 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm14 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-FP-NEXT: vpshufb %xmm14, %xmm2, %xmm3
; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm2
; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
@@ -6243,18 +6256,18 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3]
; AVX2-FP-NEXT: vpshufb %xmm7, %xmm0, %xmm0
; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm8
-; AVX2-FP-NEXT: vmovdqa 432(%rdi), %xmm3
-; AVX2-FP-NEXT: vmovdqa 416(%rdi), %xmm0
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0,1,2,3,4,5],xmm3[6],xmm0[7]
+; AVX2-FP-NEXT: vmovdqa 416(%rdi), %xmm3
+; AVX2-FP-NEXT: vmovdqa 432(%rdi), %xmm0
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm10 = xmm3[0,1,2,3,4,5],xmm0[6],xmm3[7]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,12,13]
; AVX2-FP-NEXT: vpshufb %xmm13, %xmm10, %xmm10
; AVX2-FP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm10[5,6,7],ymm8[8,9,10,11,12],ymm10[13,14,15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1],ymm12[2,3],ymm15[4,5],ymm12[6,7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm10, %xmm14
-; AVX2-FP-NEXT: vmovd {{.*#+}} xmm1 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-FP-NEXT: vpshufb %xmm1, %xmm14, %xmm14
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29]
; AVX2-FP-NEXT: vpshufb %ymm14, %ymm2, %ymm2
@@ -6266,9 +6279,9 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm10
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1],xmm8[2],xmm10[3],xmm8[4],xmm10[5,6,7]
; AVX2-FP-NEXT: vpshufb %xmm7, %xmm8, %xmm10
-; AVX2-FP-NEXT: vmovdqa 208(%rdi), %xmm15
-; AVX2-FP-NEXT: vmovdqa 192(%rdi), %xmm12
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0,1,2,3,4,5],xmm15[6],xmm12[7]
+; AVX2-FP-NEXT: vmovdqa 192(%rdi), %xmm15
+; AVX2-FP-NEXT: vmovdqa 208(%rdi), %xmm12
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0,1,2,3,4,5],xmm12[6],xmm15[7]
; AVX2-FP-NEXT: vpshufb %xmm13, %xmm5, %xmm5
; AVX2-FP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
@@ -6284,7 +6297,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm2[0,1],ymm8[2,3],ymm2[4,5],ymm8[6,7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm13, %xmm14
; AVX2-FP-NEXT: vpshufb %xmm1, %xmm14, %xmm14
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3]
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0],ymm5[1,2,3,4,5,6,7],ymm10[8],ymm5[9,10,11,12,13,14,15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm10[2,3]
@@ -6297,7 +6310,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm9[2,3],ymm4[4,5],ymm9[6,7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm6
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3],xmm6[4],xmm4[5],xmm6[6,7]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5,6,7]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3,4,5,6,7]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5]
; AVX2-FP-NEXT: vpshufb %xmm3, %xmm4, %xmm4
; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
@@ -6309,7 +6322,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm4 = mem[0,1,2],ymm4[3],mem[4,5],ymm4[6],mem[7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm10
-; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,6,7,10,11,6,7,4,5,6,7]
; AVX2-FP-NEXT: vpshufb %xmm9, %xmm10, %xmm10
; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,4,6,7]
; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7]
@@ -6324,7 +6337,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm5
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3],xmm5[4],xmm4[5],xmm5[6,7]
; AVX2-FP-NEXT: vpshufb %xmm3, %xmm4, %xmm3
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0],xmm12[1],xmm15[2,3,4,5,6,7]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm12[0],xmm15[1],xmm12[2,3,4,5,6,7]
; AVX2-FP-NEXT: vpshufb %xmm6, %xmm4, %xmm4
; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
@@ -6582,7 +6595,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm11[2],ymm15[3,4],ymm11[5],ymm15[6,7]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[8,9,4,5,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
@@ -6609,7 +6622,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[8,9,4,5,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
@@ -6640,9 +6653,9 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2,3,4],ymm1[5,6,7],ymm6[8,9,10,11,12],ymm1[13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7]
; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm15
-; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm2 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm15, %xmm15
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm0[0,1,2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7]
; AVX2-FCP-NEXT: vmovdqa %ymm0, %ymm9
@@ -6672,8 +6685,8 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm14[2,3],ymm15[4,5],ymm14[6,7]
; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm8
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[8,9,4,5,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3]
; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
@@ -6698,7 +6711,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FCP-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm8 = mem[0,1,2],ymm7[3],mem[4,5],ymm7[6],mem[7]
; AVX2-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9
-; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7]
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,4,5,6,7,10,11,6,7,4,5,6,7]
; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm9, %xmm9
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,4,6,7]
; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7]
@@ -6764,8 +6777,8 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vmovdqa (%rdi), %ymm8
; AVX512-NEXT: vmovdqa 32(%rdi), %ymm4
; AVX512-NEXT: vmovdqa 64(%rdi), %ymm12
-; AVX512-NEXT: vmovdqa 96(%rdi), %ymm11
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm12[1],ymm11[2,3,4],ymm12[5],ymm11[6,7]
+; AVX512-NEXT: vmovdqa 96(%rdi), %ymm10
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm12[1],ymm10[2,3,4],ymm12[5],ymm10[6,7]
; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[6,7,12,13,2,3,16,17,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm4[2],ymm8[3,4,5],ymm4[6],ymm8[7]
@@ -6774,8 +6787,8 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vporq %ymm0, %ymm1, %ymm21
; AVX512-NEXT: vmovdqa 128(%rdi), %ymm9
-; AVX512-NEXT: vmovdqa 160(%rdi), %ymm10
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7]
+; AVX512-NEXT: vmovdqa 160(%rdi), %ymm11
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm9[3],ymm11[4,5],ymm9[6],ymm11[7]
; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,6,4,7]
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,1]
@@ -6799,7 +6812,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vmovdqa 288(%rdi), %ymm6
; AVX512-NEXT: vmovdqa 240(%rdi), %xmm14
; AVX512-NEXT: vmovdqa 80(%rdi), %xmm2
-; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7]
; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6,7,8,9,10],ymm2[11],ymm3[12,13,14,15]
; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[8,9,6,7,4,5,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2],ymm4[3],ymm8[4,5],ymm4[6],ymm8[7]
@@ -6808,7 +6821,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpor %ymm2, %ymm3, %ymm2
; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm10[2],ymm9[3,4,5],ymm10[6],ymm9[7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm11[2],ymm9[3,4,5],ymm11[6],ymm9[7]
; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u]
@@ -6820,7 +6833,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7]
; AVX512-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm22
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm12[2,3],ymm11[4,5],ymm12[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm12[2,3],ymm10[4,5],ymm12[6,7]
; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7,8,9,10,11],ymm2[12],ymm1[13,14,15]
; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[10,11,8,9,6,7,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
@@ -6829,7 +6842,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7]
; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vporq %ymm1, %ymm2, %ymm19
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2],ymm11[3],ymm9[4,5],ymm11[6],ymm9[7]
; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7]
; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u]
@@ -6845,7 +6858,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vmovdqa64 %xmm3, %xmm27
; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
; AVX512-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm23
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2],ymm12[3],ymm10[4,5],ymm12[6],ymm10[7]
; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3]
; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm8[1],ymm4[2,3,4],ymm8[5],ymm4[6,7]
@@ -6856,7 +6869,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vporq %ymm1, %ymm2, %ymm20
; AVX512-NEXT: vpermq {{.*#+}} ymm24 = ymm0[0,1,1,3]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm9[1],ymm10[2,3,4],ymm9[5],ymm10[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm9[1],ymm11[2,3,4],ymm9[5],ymm11[6,7]
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5,6,7]
; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u]
@@ -6868,7 +6881,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vpsrlq $48, %xmm14, %xmm2
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX512-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm18
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm9[2],ymm11[3,4],ymm9[5],ymm11[6,7]
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4],xmm1[5,6,7]
; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,6,7,4,5,2,3,u,u,u,u,u,u]
@@ -6903,8 +6916,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm8[2],ymm13[3,4],ymm8[5],ymm13[6,7]
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm7
; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11,8,9,4,5,8,9,10,11,12,13,14,15]
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
@@ -6921,10 +6933,9 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1],ymm8[2,3],ymm13[4,5],ymm8[6,7]
; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm7
+; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,10,11,10,11,6,7,8,9,10,11,12,13,14,15]
; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3]
; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,0,2,4,5,6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7]
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3]
; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
@@ -6948,7 +6959,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
; AVX512-NEXT: vmovdqa64 %ymm0, %ymm28
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm9[1],ymm11[2,3],ymm9[4],ymm11[5,6,7]
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3,4,5],xmm3[6],xmm0[7]
; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7]
@@ -6978,7 +6989,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0,1,2],ymm3[3,4,5,6,7],ymm14[8,9,10],ymm3[11,12,13,14,15]
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
; AVX512-NEXT: vmovdqa64 %ymm0, %ymm24
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm9[2,3],ymm10[4,5],ymm9[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm9[2,3],ymm11[4,5],ymm9[6,7]
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm9
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3],xmm9[4],xmm0[5],xmm9[6,7]
; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3,4,5,6,7]
@@ -6994,7 +7005,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
; AVX512-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm16
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm11[2],ymm12[3,4,5],ymm11[6],ymm12[7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm10[2],ymm12[3,4,5],ymm10[6],ymm12[7]
; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27]
; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7,8,9,10,11,12,13],ymm1[14],ymm0[15]
@@ -7011,18 +7022,18 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm4[2,3],ymm15[4,5],ymm4[6,7]
-; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm15[0,1,0,1]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm10[3],ymm2[4,5,6,7,8,9,10],ymm10[11],ymm2[12,13,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7]
-; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm14
-; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm14[1],xmm10[2,3,4,5],xmm14[6],xmm10[7]
+; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm15[0,1,0,1]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm11[3],ymm2[4,5,6,7,8,9,10],ymm11[11],ymm2[12,13,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7]
+; AVX512-NEXT: vextracti128 $1, %ymm11, %xmm14
+; AVX512-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm14[1],xmm11[2,3,4,5],xmm14[6],xmm11[7]
; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,2,3,0,1,14,15,12,13,10,11]
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[10,11,24,25,22,23,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero
-; AVX512-NEXT: vpor %ymm2, %ymm10, %ymm2
-; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero
+; AVX512-NEXT: vpor %ymm2, %ymm11, %ymm2
+; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm8[1],ymm13[2,3],ymm8[4],ymm13[5,6,7]
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7]
@@ -7039,7 +7050,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero
; AVX512-NEXT: vpor %ymm2, %ymm9, %ymm2
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7]
; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm2[2,3,0,1]
; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0],ymm2[1,2,3,4,5,6],ymm9[7,8],ymm2[9,10,11,12,13,14],ymm9[15]
; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1],ymm3[2,3],ymm7[4,5],ymm3[6,7]
@@ -7051,14 +7062,14 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3]
; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29]
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm2[2,3,4,5,6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5,6,7]
-; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm9[2,3,0,1]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm11[1],ymm9[2,3,4,5,6,7,8],ymm11[9],ymm9[10,11,12,13,14,15]
-; AVX512-NEXT: vinserti64x4 $1, %ymm29, %zmm0, %zmm11
+; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0],ymm12[1],ymm10[2,3],ymm12[4],ymm10[5,6,7]
+; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3,4,5,6,7,8],ymm10[9],ymm9[10,11,12,13,14,15]
+; AVX512-NEXT: vinserti64x4 $1, %ymm29, %zmm0, %zmm10
; AVX512-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm12
; AVX512-NEXT: vinserti64x4 $1, %ymm28, %zmm0, %zmm14
; AVX512-NEXT: vinserti64x4 $1, %ymm24, %zmm0, %zmm25
-; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm10
+; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm11
; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm26
; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2],ymm3[3],ymm7[4,5],ymm3[6],ymm7[7]
; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,6,4,6,7]
@@ -7087,7 +7098,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm4 & (zmm11 ^ zmm21))
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm4 & (zmm10 ^ zmm21))
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
; AVX512-NEXT: vpternlogq {{.*#+}} zmm22 = zmm5 ^ (mem & (zmm22 ^ zmm5))
; AVX512-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (zmm4 & (zmm12 ^ zmm22))
@@ -7099,8 +7110,8 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vpternlogq {{.*#+}} zmm27 = zmm27 ^ (zmm4 & (zmm27 ^ zmm20))
; AVX512-NEXT: vmovdqa32 %zmm25, %zmm27 {%k1}
; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm18 ^ (zmm4 & (zmm1 ^ zmm18))
-; AVX512-NEXT: vmovdqa32 %zmm10, %zmm1 {%k1}
-; AVX512-NEXT: vmovdqa64 %zmm11, (%rsi)
+; AVX512-NEXT: vmovdqa32 %zmm11, %zmm1 {%k1}
+; AVX512-NEXT: vmovdqa64 %zmm10, (%rsi)
; AVX512-NEXT: vmovdqa64 %zmm12, (%rdx)
; AVX512-NEXT: vmovdqa64 %zmm23, (%rcx)
; AVX512-NEXT: vmovdqa64 %zmm27, (%r8)
@@ -7118,312 +7129,317 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
;
; AVX512-FCP-LABEL: load_i16_stride7_vf32:
; AVX512-FCP: # %bb.0:
-; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm29
-; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm31
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [2,6,9,13,2,6,9,13]
-; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm22
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [2,5,9,12,2,5,9,12]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [3,6,10,13,3,6,10,13]
-; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm9
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [2,6,9,0,13,0,0,0]
-; AVX512-FCP-NEXT: vpermd %zmm31, %zmm0, %zmm13
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [1,0,0,0,4,8,11,15]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [2,5,9,0,12,0,0,0]
-; AVX512-FCP-NEXT: vpermd %zmm31, %zmm0, %zmm4
-; AVX512-FCP-NEXT: vpermd %zmm9, %zmm16, %zmm0
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,4,7,11,14]
-; AVX512-FCP-NEXT: vpermd %zmm29, %zmm2, %zmm2
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [8,1,12,5,12,5,14,15]
-; AVX512-FCP-NEXT: vpermd %zmm31, %zmm5, %zmm5
-; AVX512-FCP-NEXT: vpermd %zmm22, %zmm19, %zmm6
-; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm28
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm28[0,1,0,2]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
-; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm15, %ymm7
-; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm25
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3,4,5,6],ymm7[7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[6,7,12,13,2,3,16,17,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm5
-; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm6
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm6[2],ymm5[3,4,5],ymm6[6],ymm5[7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm12[4],xmm11[5],xmm12[6],xmm11[7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vporq %ymm8, %ymm11, %ymm23
-; AVX512-FCP-NEXT: vpbroadcastw 252(%rdi), %xmm8
-; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %xmm12
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,0,1,14,15,12,13,14,15]
-; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm11[2],xmm8[2],xmm11[3],xmm8[3]
+; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm1
+; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm31
+; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2
+; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm26
+; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm30
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm30[0,1,0,2]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,8,9,10,11,4,5,10,11,16,17,18,19,20,21,22,23,24,25,26,27,20,21,26,27]
+; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm12, %ymm3
+; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm29
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm21 = [3,6,10,13,3,6,10,13]
+; AVX512-FCP-NEXT: vpermd %zmm26, %zmm21, %zmm4
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [8,1,12,5,12,5,14,15]
+; AVX512-FCP-NEXT: vpermd %zmm2, %zmm4, %zmm4
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[6,7,12,13,2,3,16,17,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm4
+; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm5
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm8[4],xmm7[5],xmm8[6],xmm7[7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vporq %ymm6, %ymm7, %ymm24
+; AVX512-FCP-NEXT: vpbroadcastw 252(%rdi), %xmm6
+; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %xmm11
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[u,u,u,u,u,u,u,u,0,1,14,15,12,13,14,15]
+; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; AVX512-FCP-NEXT: movw $992, %ax # imm = 0x3E0
; AVX512-FCP-NEXT: kmovw %eax, %k1
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm7, %zmm23 {%k1}
-; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %ymm7
-; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %ymm8
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm11, %xmm14
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm14[0,1,2],xmm11[3],xmm14[4],xmm11[5],xmm14[6,7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,10,11,8,9,6,7,4,5,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm2[0,1,2],xmm11[3,4,5,6],xmm2[7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm1 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31]
-; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm26
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm3, %zmm24 {%k1}
+; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %ymm6
+; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %ymm7
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm8
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1,2],xmm3[3],xmm8[4],xmm3[5],xmm8[6,7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,10,11,8,9,6,7,4,5,u,u]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,4,7,11,14]
+; AVX512-FCP-NEXT: vpermd %zmm31, %zmm8, %zmm8
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1,2],xmm3[3,4,5,6],xmm8[7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm25
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [2,5,9,12,2,5,9,12]
+; AVX512-FCP-NEXT: vpermd %zmm1, %zmm18, %zmm3
+; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm9 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31]
+; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm3
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm20 = [4,5,6,11]
+; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm20, %zmm25
; AVX512-FCP-NEXT: vmovdqa 240(%rdi), %xmm14
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm11
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3,4,5],xmm0[6],xmm11[7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT: vporq %ymm4, %ymm0, %ymm20
-; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm4
-; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm11
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm4[2],ymm11[3,4,5],ymm4[6],ymm11[7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5],xmm2[6],xmm0[7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u]
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0],xmm12[1],xmm14[2,3,4,5,6,7]
-; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm24
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vporq %ymm0, %ymm2, %ymm21
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm4[3],ymm11[4,5],ymm4[6],ymm11[7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm8
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0],xmm3[1],xmm8[2,3,4,5],xmm3[6],xmm8[7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm3, %ymm3
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [2,5,9,0,12,0,0,0]
+; AVX512-FCP-NEXT: vpermd %zmm2, %zmm8, %zmm8
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT: vpor %ymm3, %ymm8, %ymm0
+; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm3
+; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm8
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1],ymm3[2],ymm8[3,4,5],ymm3[6],ymm8[7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm15, %xmm10
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm15[0,1,2,3],xmm10[4],xmm15[5],xmm10[6],xmm15[7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u]
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm12[7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0],xmm11[1],xmm14[2,3,4,5,6,7]
+; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm12, %xmm12
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm10, %zmm27
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm12
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2,3],xmm10[4],xmm12[5],xmm10[6],xmm12[7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [1,0,0,0,4,8,11,15]
+; AVX512-FCP-NEXT: vpermd %zmm31, %zmm12, %zmm12
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3,4,5,6],xmm12[7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm28
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [2,6,9,13,2,6,9,13]
+; AVX512-FCP-NEXT: vpermd %zmm1, %zmm0, %zmm10
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29]
+; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm20, %zmm28
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [2,6,9,0,13,0,0,0]
+; AVX512-FCP-NEXT: vpermd %zmm2, %zmm10, %zmm10
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3,4,5],xmm13[6],xmm12[7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vporq %ymm10, %ymm12, %ymm23
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm8[0,1,2],ymm3[3],ymm8[4,5],ymm3[6],ymm8[7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm12
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0],xmm10[1],xmm12[2,3,4,5],xmm10[6],xmm12[7]
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm0
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,5,2,5,2,5,2,5]
-; AVX512-FCP-NEXT: vpermd %ymm28, %ymm2, %ymm2
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5,6],ymm2[7]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm15
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5],xmm2[6],xmm3[7]
-; AVX512-FCP-NEXT: vpermd %zmm29, %zmm18, %zmm3
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4,5,6],xmm3[7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512-FCP-NEXT: vpermd %zmm9, %zmm10, %zmm3
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm27
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm8[2],ymm7[3,4,5],ymm8[6],ymm7[7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [10,3,6,15,12,13,6,15]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [1,0,0,0,5,8,12,15]
-; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm2, %ymm2
-; AVX512-FCP-NEXT: vpermd %zmm29, %zmm18, %zmm13
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[2,3,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %ymm2, %ymm13, %ymm2
-; AVX512-FCP-NEXT: vpermd %zmm9, %zmm19, %zmm9
-; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm9, %ymm9
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm9[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm9
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm9[0],xmm2[1],xmm9[2],xmm2[3],xmm9[4,5,6,7]
-; AVX512-FCP-NEXT: vpermd %zmm31, %zmm3, %zmm3
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vporq %ymm3, %ymm2, %ymm18
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm11[1],ymm4[2,3],ymm11[4],ymm4[5,6,7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u]
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm3
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm28[0,1,1,3]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm9[7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm19
-; AVX512-FCP-NEXT: vmovdqa 416(%rdi), %ymm13
-; AVX512-FCP-NEXT: vmovdqa 384(%rdi), %ymm3
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm13[2],ymm3[3,4,5],ymm13[6],ymm3[7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm9
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm9[4],xmm0[5],xmm9[6],xmm0[7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3,4,5],xmm9[6],xmm10[7]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [2,11,2,11,12,5,8,9]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,0,1,14,15,12,13,10,11,8,9]
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vpermd %zmm29, %zmm10, %zmm10
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[0,1,22,23,28,29,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7],ymm10[8,9,10],ymm0[11,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm10, %xmm10
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm30[0,1,1,2]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,30,31]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm12[7]
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm10, %zmm16
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm6[0,1],ymm7[2],ymm6[3,4,5],ymm7[6],ymm6[7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm15
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm15[4],xmm10[5],xmm15[6],xmm10[7]
+; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm10, %ymm10
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [1,0,0,0,5,8,12,15]
+; AVX512-FCP-NEXT: vpermd %zmm31, %zmm13, %zmm13
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[2,3,16,17,22,23,24,25,30,31],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT: vpor %ymm13, %ymm10, %ymm10
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm17
+; AVX512-FCP-NEXT: vpermd %zmm1, %zmm21, %zmm1
+; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm1
+; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm20, %zmm17
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [10,3,6,15,12,13,6,15]
+; AVX512-FCP-NEXT: vpermd %zmm2, %zmm1, %zmm1
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm13
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm13[0],xmm10[1],xmm13[2],xmm10[3],xmm13[4,5,6,7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vporq %ymm1, %ymm10, %ymm20
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm8[1],ymm3[2,3],ymm8[4],ymm3[5,6,7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm10
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm10[1],xmm1[2,3,4,5],xmm10[6],xmm1[7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u]
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm30[0,1,1,3]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm10[7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm1, %zmm21
+; AVX512-FCP-NEXT: vmovdqa 416(%rdi), %ymm15
+; AVX512-FCP-NEXT: vmovdqa 384(%rdi), %ymm1
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1],ymm15[2],ymm1[3,4,5],ymm15[6],ymm1[7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm12
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm12[4],xmm10[5],xmm12[6],xmm10[7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,0,1,14,15,12,13,10,11,8,9]
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm6[0,1,2],ymm7[3],ymm6[4,5],ymm7[6],ymm6[7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm9
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm12[1],xmm9[2,3,4,5],xmm12[6],xmm9[7]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [2,11,2,11,12,5,8,9]
+; AVX512-FCP-NEXT: vpermd %zmm31, %zmm12, %zmm12
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[0,1,22,23,28,29,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0,1,2],ymm10[3,4,5,6,7],ymm12[8,9,10],ymm10[11,12,13,14,15]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero
-; AVX512-FCP-NEXT: vpor %ymm10, %ymm9, %ymm9
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm17
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm11[1],ymm4[2,3,4],ymm11[5],ymm4[6,7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2],xmm0[3],xmm4[4,5,6,7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u]
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm4
-; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
-; AVX512-FCP-NEXT: vpbroadcastw 232(%rdi), %xmm2
-; AVX512-FCP-NEXT: vpsrlq $48, %xmm14, %xmm4
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX512-FCP-NEXT: vpermd %zmm22, %zmm16, %zmm4
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,3,7,10,14,0,0,0]
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm16
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,3,3,0,0,3,7,0]
-; AVX512-FCP-NEXT: vpermd %ymm28, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,6,7,8,9,14,15,0,1,6,7,8,9,16,17,16,17,22,23,24,25,30,31,16,17,22,23,24,25]
-; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm2
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7],ymm2[8,9,10,11,12],ymm0[13,14,15]
-; AVX512-FCP-NEXT: vpsrld $16, %xmm12, %xmm2
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7]
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm25
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm4, %xmm0
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm2
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512-FCP-NEXT: vpermd %zmm31, %zmm10, %zmm4
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
-; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm4
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3,4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm30 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm30 & (zmm16 ^ zmm0))
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm13[3],ymm3[4,5],ymm13[6],ymm3[7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0],xmm0[1],xmm4[2,3,4,5],xmm0[6],xmm4[7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6,7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm10
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm10[1],xmm4[2,3,4,5],xmm10[6],xmm4[7]
+; AVX512-FCP-NEXT: vpor %ymm12, %ymm9, %ymm9
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm10[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm19
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm9, %xmm0
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm9
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[3,1,2,3,4,5,6,7]
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,3,7,10,14,0,0,0]
+; AVX512-FCP-NEXT: vpermd %zmm2, %zmm9, %zmm9
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
+; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm9
+; AVX512-FCP-NEXT: vmovdqa64 %ymm10, %ymm22
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm9[2,3,4,5,6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm8[1],ymm3[2,3,4],ymm8[5],ymm3[6,7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm8
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0],xmm3[1],xmm8[2],xmm3[3],xmm8[4,5,6,7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u]
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512-FCP-NEXT: vmovdqa64 %ymm29, %ymm8
+; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm13, %ymm8
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm8[7]
+; AVX512-FCP-NEXT: vpbroadcastw 232(%rdi), %xmm8
+; AVX512-FCP-NEXT: vpsrlq $48, %xmm14, %xmm9
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm3, %zmm8
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm29 & (zmm8 ^ zmm0))
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm15[3],ymm1[4,5],ymm15[6],ymm1[7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5],xmm0[6],xmm3[7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6,7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm9
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm9[1],xmm3[2,3,4,5],xmm9[6],xmm3[7]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,2,3,0,1,14,15,12,13,10,11]
; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [2,0,0,0,6,9,13,0]
-; AVX512-FCP-NEXT: vpermd %zmm29, %zmm10, %zmm10
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,3,16,17,22,23,24,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7],ymm10[8,9,10],ymm0[11,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero
-; AVX512-FCP-NEXT: vpor %ymm4, %ymm10, %ymm4
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm10
-; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm10, %xmm10
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,4,7,11,14,0,0,0]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm26, %zmm0, %zmm26
-; AVX512-FCP-NEXT: vpermd %zmm31, %zmm10, %zmm10
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1],ymm10[2,3,4,5,6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0],ymm3[1],ymm13[2,3],ymm3[4],ymm13[5,6,7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm10
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm10[1],xmm4[2,3,4,5],xmm10[6],xmm4[7]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm27, %zmm0, %zmm27
-; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [0,4,7,0,0,4,7,0]
-; AVX512-FCP-NEXT: # ymm10 = mem[0,1,0,1]
-; AVX512-FCP-NEXT: vpermd %ymm28, %ymm10, %ymm10
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm28
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,2,3,4,5,10,11,12,13,2,3,4,5,10,11,20,21,18,19,20,21,26,27,28,29,18,19,20,21,26,27]
-; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm10, %ymm10
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [2,6,9,13,2,6,9,13]
-; AVX512-FCP-NEXT: vpermd %zmm22, %zmm9, %zmm9
-; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm9, %ymm1
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm10[5,6,7],ymm1[8,9,10,11,12],ymm10[13,14,15]
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [2,0,0,0,6,9,13,0]
+; AVX512-FCP-NEXT: vpermd %zmm31, %zmm9, %zmm9
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[2,3,16,17,22,23,24,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7],ymm9[8,9,10],ymm0[11,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero
+; AVX512-FCP-NEXT: vpor %ymm3, %ymm9, %ymm3
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,3,3,0,0,3,7,0]
+; AVX512-FCP-NEXT: vpermd %ymm30, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,6,7,8,9,14,15,0,1,6,7,8,9,16,17,16,17,22,23,24,25,30,31,16,17,22,23,24,25]
+; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vpermd %zmm26, %zmm18, %zmm9
+; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm9, %ymm3
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7],ymm3[8,9,10,11,12],ymm0[13,14,15]
+; AVX512-FCP-NEXT: vpsrld $16, %xmm11, %xmm3
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7]
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm18
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm3
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,4,7,11,14,0,0,0]
+; AVX512-FCP-NEXT: vpermd %zmm2, %zmm3, %zmm3
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm0[0,1],ymm3[2,3,4,5,6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm1[1],ymm15[2,3],ymm1[4],ymm15[5,6,7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2,3,4,5],xmm3[6],xmm0[7]
+; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,4,7,0,0,4,7,0]
+; AVX512-FCP-NEXT: # ymm0 = mem[0,1,0,1]
+; AVX512-FCP-NEXT: vpermd %ymm30, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [4,5,2,3,4,5,10,11,12,13,2,3,4,5,10,11,20,21,18,19,20,21,26,27,28,29,18,19,20,21,26,27]
+; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [2,6,9,13,2,6,9,13]
+; AVX512-FCP-NEXT: vpermd %zmm26, %zmm10, %zmm10
+; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm10, %ymm9
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2,3,4],ymm0[5,6,7],ymm9[8,9,10,11,12],ymm0[13,14,15]
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm11[4],xmm14[4],xmm11[5],xmm14[5],xmm11[6],xmm14[6],xmm11[7],xmm14[7]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm10
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm11
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm10
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm9
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0],xmm1[1],xmm9[2],xmm1[3],xmm9[4,5,6,7]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [3,0,0,0,6,10,13,0]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm17, %zmm0, %zmm12
-; AVX512-FCP-NEXT: vpermd %zmm29, %zmm9, %zmm9
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,4,5,2,3,0,1,14,15,12,13]
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[0,1,18,19,20,21,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm9[0,1,2],ymm4[3,4,5,6,7],ymm9[8,9,10],ymm4[11,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero
-; AVX512-FCP-NEXT: vpor %ymm1, %ymm9, %ymm1
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,4,8,11,15,0,0,0]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm9
-; AVX512-FCP-NEXT: vpermd %zmm31, %zmm4, %zmm0
-; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,4,5,2,3,0,1,14,15,12,13]
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm9
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0],ymm6[1],ymm7[2,3,4],ymm6[5],ymm7[6,7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm13
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0],xmm0[1],xmm13[2],xmm0[3],xmm13[4,5,6,7]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [3,0,0,0,6,10,13,0]
+; AVX512-FCP-NEXT: vpermd %zmm31, %zmm13, %zmm13
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[0,1,18,19,20,21,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm13[0,1,2],ymm3[3,4,5,6,7],ymm13[8,9,10],ymm3[11,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero
+; AVX512-FCP-NEXT: vpor %ymm0, %ymm13, %ymm0
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,4,8,11,15,0,0,0]
+; AVX512-FCP-NEXT: vpermd %zmm2, %zmm3, %zmm2
+; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm3
+; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7]
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0],ymm3[1],ymm13[2,3,4],ymm3[5],ymm13[6,7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,4,6,7]
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0],ymm1[1],ymm15[2,3,4],ymm1[5],ymm15[6,7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2],xmm1[3],xmm3[4,5,6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7]
; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4
; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,1,3,4,5,6,7]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,8,9,4,5,6,7,u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,10,3,14,7,10,3]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1
-; AVX512-FCP-NEXT: vpermd %zmm29, %zmm4, %zmm4
+; AVX512-FCP-NEXT: vpermd %zmm31, %zmm4, %zmm4
; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[4,5,10,11,u,u,u,u,u,u,u,u,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3,4,5,6,7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,6,7,4,5,2,3,0,1,14,15]
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm26 = zmm26 ^ (zmm3 & (zmm26 ^ zmm23))
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm24 = zmm20 ^ (mem & (zmm24 ^ zmm20))
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm27 = zmm27 ^ (zmm3 & (zmm27 ^ zmm24))
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (zmm30 & (zmm15 ^ zmm21))
-; AVX512-FCP-NEXT: movw $-512, %ax # imm = 0xFE00
-; AVX512-FCP-NEXT: kmovw %eax, %k1
-; AVX512-FCP-NEXT: vmovdqa32 %zmm28, %zmm15 {%k1}
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm30 & (zmm19 ^ zmm18))
-; AVX512-FCP-NEXT: vmovdqa32 %zmm12, %zmm19 {%k1}
-; AVX512-FCP-NEXT: vmovdqa32 %zmm9, %zmm16 {%k1}
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm25 ^ (zmm30 & (zmm11 ^ zmm25))
-; AVX512-FCP-NEXT: vmovdqa32 %zmm1, %zmm11 {%k1}
-; AVX512-FCP-NEXT: vmovdqa64 %zmm26, (%rsi)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm27, (%rdx)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm15, (%rcx)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm19, (%r8)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm16, (%r9)
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,6,7,4,5,2,3,0,1,14,15]
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT: vmovdqa64 %zmm11, (%rax)
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm10 ^ (zmm30 & (zmm0 ^ zmm10))
-; AVX512-FCP-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1}
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm25, %zmm0, %zmm3
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm4 & (zmm3 ^ zmm24))
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm27 = zmm5 ^ (mem & (zmm27 ^ zmm5))
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm28, %zmm0, %zmm5
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm4 & (zmm5 ^ zmm27))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm29 & (zmm16 ^ zmm23))
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm17, %zmm0, %zmm4
+; AVX512-FCP-NEXT: movw $-512, %di # imm = 0xFE00
+; AVX512-FCP-NEXT: kmovw %edi, %k1
+; AVX512-FCP-NEXT: vmovdqa32 %zmm4, %zmm16 {%k1}
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm29 & (zmm21 ^ zmm20))
+; AVX512-FCP-NEXT: vmovdqa32 %zmm10, %zmm21 {%k1}
+; AVX512-FCP-NEXT: vmovdqa32 %zmm11, %zmm8 {%k1}
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm18 ^ (zmm29 & (zmm12 ^ zmm18))
+; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm12 {%k1}
+; AVX512-FCP-NEXT: vmovdqa64 %zmm3, (%rsi)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rdx)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm16, (%rcx)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm21, (%r8)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm8, (%r9)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm12, (%rax)
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm9 ^ (zmm29 & (zmm2 ^ zmm9))
+; AVX512-FCP-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1}
+; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rax)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: load_i16_stride7_vf32:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm11
-; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm15
-; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm4
+; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm12
+; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm7
; AVX512DQ-NEXT: vmovdqa 96(%rdi), %ymm10
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm4[1],ymm10[2,3,4],ymm4[5],ymm10[6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm7[1],ymm10[2,3,4],ymm7[5],ymm10[6,7]
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[6,7,12,13,2,3,16,17,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm15[2],ymm11[3,4,5],ymm15[6],ymm11[7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm12[2],ymm11[3,4,5],ymm12[6],ymm11[7]
; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7]
; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[u,u,u,u,u,u,u,u,u,u,u,u]
@@ -7444,22 +7460,22 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
; AVX512DQ-NEXT: vpbroadcastw 252(%rdi), %xmm3
; AVX512DQ-NEXT: vmovdqa 224(%rdi), %xmm13
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm13[0,1,0,3]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7]
-; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm5[2],xmm3[2],xmm5[3],xmm3[3]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[0,1,0,3]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7]
+; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX512DQ-NEXT: movw $992, %ax # imm = 0x3E0
; AVX512DQ-NEXT: kmovw %eax, %k1
; AVX512DQ-NEXT: vinserti32x4 $2, %xmm3, %zmm2, %zmm18 {%k1}
-; AVX512DQ-NEXT: vmovdqa 256(%rdi), %ymm6
-; AVX512DQ-NEXT: vmovdqa 288(%rdi), %ymm5
+; AVX512DQ-NEXT: vmovdqa 256(%rdi), %ymm5
+; AVX512DQ-NEXT: vmovdqa 288(%rdi), %ymm15
; AVX512DQ-NEXT: vmovdqa 240(%rdi), %xmm14
; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm2
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm4[2],ymm10[3,4],ymm4[5],ymm10[6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm7[2],ymm10[3,4],ymm7[5],ymm10[6,7]
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6,7,8,9,10],ymm2[11],ymm3[12,13,14,15]
; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[8,9,6,7,4,5,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2],ymm15[3],ymm11[4,5],ymm15[6],ymm11[7]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm7
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0],xmm3[1],xmm7[2,3,4,5],xmm3[6],xmm7[7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3,4,5],xmm3[6],xmm4[7]
; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[2,3,0,1,14,15,12,13,10,11],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpor %ymm2, %ymm3, %ymm2
; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -7475,11 +7491,11 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,0,3,3,4,5,6,7]
; AVX512DQ-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm21
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm4[2,3],ymm10[4,5],ymm4[6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm7[2,3],ymm10[4,5],ymm7[6,7]
; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7,8,9,10,11],ymm2[12],ymm1[13,14,15]
; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[10,11,8,9,6,7,20,21],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0],ymm11[1],ymm15[2,3],ymm11[4],ymm15[5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5,6,7]
; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7]
; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
@@ -7495,19 +7511,19 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,0,3,4,5,4,7]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,1,2,3]
-; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm26
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[2,1,2,3]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
; AVX512DQ-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm22
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2],ymm4[3],ymm10[4,5],ymm4[6],ymm10[7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2],ymm7[3],ymm10[4,5],ymm7[6],ymm10[7]
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,2,3]
; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0],ymm11[1],ymm15[2,3,4],ymm11[5],ymm15[6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0],ymm11[1],ymm12[2,3,4],ymm11[5],ymm12[6,7]
+; AVX512DQ-NEXT: vmovdqa64 %ymm11, %ymm26
; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7]
; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vporq %ymm1, %ymm2, %ymm17
+; AVX512DQ-NEXT: vporq %ymm1, %ymm2, %ymm27
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm23 = ymm0[0,1,1,3]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7]
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
@@ -7527,138 +7543,138 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,6,7,4,5,2,3,u,u,u,u,u,u]
; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa 208(%rdi), %xmm2
-; AVX512DQ-NEXT: vmovdqa 192(%rdi), %xmm7
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0,1,2,3,4,5],xmm2[6],xmm7[7]
-; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm29
+; AVX512DQ-NEXT: vmovdqa 192(%rdi), %xmm3
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5],xmm2[6],xmm3[7]
+; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm17
+; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm30
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,3]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,7,6]
; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
; AVX512DQ-NEXT: vpsrld $16, %xmm13, %xmm1
-; AVX512DQ-NEXT: vmovdqa64 %xmm13, %xmm27
+; AVX512DQ-NEXT: vmovdqa64 %xmm13, %xmm28
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
-; AVX512DQ-NEXT: vmovdqa64 %xmm14, %xmm28
+; AVX512DQ-NEXT: vmovdqa64 %xmm14, %xmm29
; AVX512DQ-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm24
-; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm12
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm6[2,3],ymm5[4,5],ymm6[6,7]
+; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm6
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm5[2,3],ymm15[4,5],ymm5[6,7]
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4],xmm0[5],xmm1[6,7]
-; AVX512DQ-NEXT: vmovdqa 352(%rdi), %ymm1
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3],xmm1[4],xmm0[5],xmm1[6,7]
+; AVX512DQ-NEXT: vmovdqa 352(%rdi), %ymm11
; AVX512DQ-NEXT: vmovdqa 320(%rdi), %ymm14
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2],ymm1[3],ymm14[4,5],ymm1[6],ymm14[7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2],ymm11[3],ymm14[4,5],ymm11[6],ymm14[7]
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6],ymm2[7,8,9,10,11,12,13],ymm3[14],ymm2[15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,1]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,1]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,14,15,12,13,10,11,4,5,6,7,8,9,2,3,16,17,30,31,28,29,26,27,20,21,22,23,24,25,18,19]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4,5,6],xmm2[7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5,6],xmm2[7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX512DQ-NEXT: vmovdqa 384(%rdi), %ymm5
; AVX512DQ-NEXT: vmovdqa 416(%rdi), %ymm13
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm5[2],ymm13[3,4],ymm5[5],ymm13[6,7]
; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,10,11,8,9,4,5,8,9,10,11,12,13,14,15]
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm30
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm6[3],ymm12[4,5],ymm6[6],ymm12[7]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm14[1],ymm1[2,3],ymm14[4],ymm1[5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm2[6,7]
+; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm31
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2],ymm6[3],ymm15[4,5],ymm6[6],ymm15[7]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm14[1],ymm11[2,3],ymm14[4],ymm11[5,6,7]
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6],ymm3[7,8],ymm2[9,10,11,12,13,14],ymm3[15]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u]
; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[2,3,0,1,14,15,12,13,4,5,4,5,4,5,4,5,18,19,16,17,30,31,28,29,20,21,20,21,20,21,20,21]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4,5,6],xmm2[7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5,6],xmm2[7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1],ymm5[2,3],ymm13[4,5],ymm5[6,7]
; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,10,11,10,11,6,7,8,9,10,11,12,13,14,15]
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm31
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,7,20,21,u,u,16,17,30,31,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6,7,8],ymm2[9],ymm0[10,11,12,13,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm12[2],ymm6[3,4,5],ymm12[6],ymm6[7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm2[6,7]
+; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm16
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0],ymm14[1],ymm11[2,3,4],ymm14[5],ymm11[6,7]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,7,20,21,u,u,16,17,30,31,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7,8],ymm2[9],ymm1[10,11,12,13,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm15[2],ymm6[3,4,5],ymm15[6],ymm6[7]
; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
-; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm3
-; AVX512DQ-NEXT: vpshufb %ymm3, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm0 & ~mem)
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm5[3],ymm13[4,5],ymm5[6],ymm13[7]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4]
+; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm0
+; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm1 & ~mem)
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2],ymm5[3],ymm13[4,5],ymm5[6],ymm13[7]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,4]
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,7]
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm25 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm25 & (zmm22 ^ zmm19))
; AVX512DQ-NEXT: movw $-512, %ax # imm = 0xFE00
; AVX512DQ-NEXT: kmovw %eax, %k1
-; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm22 {%k1}
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6,7]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm22 {%k1}
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6,7]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5],xmm2[6],xmm1[7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,1,0,4,5,6,7]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,6,7]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm23[0,1,2,0,4,5,6,4]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
-; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm2
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,6,5,6,7]
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,2,2]
-; AVX512DQ-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm19
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm12[3],ymm6[4,5],ymm12[6],ymm6[7]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7]
+; AVX512DQ-NEXT: vinserti32x4 $2, %xmm2, %zmm1, %zmm19
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2],ymm15[3],ymm6[4,5],ymm15[6],ymm6[7]
+; AVX512DQ-NEXT: vmovdqa64 %ymm6, %ymm23
+; AVX512DQ-NEXT: vmovdqa %ymm15, %ymm4
+; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm13[2],ymm5[3,4,5],ymm13[6],ymm5[7]
; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1],ymm14[2],ymm1[3,4],ymm14[5],ymm1[6,7]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm14[2],ymm11[3,4],ymm14[5],ymm11[6,7]
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,1,2,0]
; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[0,1,22,23,28,29,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpor %ymm3, %ymm1, %ymm1
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7],ymm3[8,9,10],ymm2[11,12,13,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm25 & (zmm19 ^ zmm17))
-; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm19 {%k1}
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm10[2],ymm4[3,4,5],ymm10[6],ymm4[7]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6],ymm0[7,8,9,10,11,12,13],ymm2[14],ymm0[15]
-; AVX512DQ-NEXT: vmovdqa64 %ymm15, %ymm23
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm11[2],ymm15[3,4],ymm11[5],ymm15[6,7]
-; AVX512DQ-NEXT: vmovdqa64 %ymm11, %ymm26
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm25 & (zmm19 ^ zmm27))
+; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm19 {%k1}
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm10[2],ymm7[3,4,5],ymm10[6],ymm7[7]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7,8,9,10,11,12,13],ymm2[14],ymm1[15]
+; AVX512DQ-NEXT: vmovdqa64 %ymm12, %ymm27
+; AVX512DQ-NEXT: vmovdqa64 %ymm26, %ymm6
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm6[2],ymm12[3,4],ymm6[5],ymm12[6,7]
; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm0[2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm13[3],ymm5[4,5],ymm13[6],ymm5[7]
; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3,4,5],xmm2[6],xmm3[7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1],ymm14[2,3],ymm1[4,5],ymm14[6,7]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm1[0,1,0,1]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm14[2,3],ymm11[4,5],ymm14[6,7]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm11[0,1,0,1]
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm15[3],ymm3[4,5,6,7,8,9,10],ymm15[11],ymm3[12,13,14,15]
-; AVX512DQ-NEXT: vmovdqa %ymm12, %ymm0
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm15 = ymm12[0],ymm6[1],ymm12[2,3],ymm6[4],ymm12[5,6,7]
+; AVX512DQ-NEXT: vmovdqa64 %ymm23, %ymm0
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm15 = ymm4[0],ymm0[1],ymm4[2,3],ymm0[4],ymm4[5,6,7]
; AVX512DQ-NEXT: vextracti128 $1, %ymm15, %xmm12
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0],xmm12[1],xmm15[2,3,4,5],xmm12[6],xmm15[7]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,2,3,0,1,14,15,12,13,10,11]
@@ -7668,16 +7684,16 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero
; AVX512DQ-NEXT: vpor %ymm3, %ymm12, %ymm3
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm11 = zmm20 ^ (zmm25 & (zmm11 ^ zmm20))
-; AVX512DQ-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm11 {%k1}
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm20 ^ (zmm25 & (zmm1 ^ zmm20))
+; AVX512DQ-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm1 {%k1}
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0],ymm5[1],ymm13[2,3],ymm5[4],ymm13[5,6,7]
; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2],ymm14[3],ymm1[4,5],ymm14[6],ymm1[7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2],ymm14[3],ymm11[4,5],ymm14[6],ymm11[7]
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm3[2,3,0,1]
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4],ymm3[5,6,7,8,9,10,11],ymm12[12],ymm3[13,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm0[0],ymm6[1],ymm0[2,3,4],ymm6[5],ymm0[6,7]
-; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm20
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0],ymm0[1],ymm4[2,3,4],ymm0[5],ymm4[6,7]
+; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm20
; AVX512DQ-NEXT: vextracti128 $1, %ymm12, %xmm15
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0],xmm12[1],xmm15[2],xmm12[3],xmm15[4,5,6,7]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,4,5,2,3,0,1,14,15,12,13]
@@ -7687,33 +7703,32 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero
; AVX512DQ-NEXT: vpor %ymm3, %ymm12, %ymm3
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm17
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm10[3],ymm4[4,5],ymm10[6],ymm4[7]
-; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm3
+; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm26
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2],ymm10[3],ymm7[4,5],ymm10[6],ymm7[7]
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm2[2,3,0,1]
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0],ymm2[1,2,3,4,5,6],ymm12[7,8],ymm2[9,10,11,12,13,14],ymm12[15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7]
; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm9
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm9[0,1,2],xmm8[3],xmm9[4],xmm8[5],xmm9[6,7]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm8
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm31, %zmm0, %zmm9
-; AVX512DQ-NEXT: vmovdqa64 %xmm29, %xmm0
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0],xmm7[1],xmm0[2,3,4,5,6,7]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm31, %zmm0, %zmm8
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm9
+; AVX512DQ-NEXT: vmovdqa64 %xmm30, %xmm15
+; AVX512DQ-NEXT: vmovdqa64 %xmm17, %xmm0
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0],xmm0[1],xmm15[2,3,4,5,6,7]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[8,9,10,11,8,9,6,7,4,5,u,u,u,u,u,u]
; AVX512DQ-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm12[0,1,2,3,4],ymm7[5,6,7],ymm12[8,9,10,11,12],ymm7[13,14,15]
-; AVX512DQ-NEXT: vmovdqa64 %xmm27, %xmm0
-; AVX512DQ-NEXT: vmovdqa64 %xmm28, %xmm4
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm12[0,1,2,3,4],ymm4[5,6,7],ymm12[8,9,10,11,12],ymm4[13,14,15]
+; AVX512DQ-NEXT: vmovdqa64 %xmm28, %xmm12
+; AVX512DQ-NEXT: vmovdqa64 %xmm29, %xmm15
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7]
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,3,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vinserti32x4 $2, %xmm12, %zmm7, %zmm27
-; AVX512DQ-NEXT: vmovdqa64 %ymm23, %ymm7
-; AVX512DQ-NEXT: vmovdqa64 %ymm26, %ymm0
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm7[0,1],ymm0[2,3],ymm7[4,5],ymm0[6,7]
+; AVX512DQ-NEXT: vinserti32x4 $2, %xmm12, %zmm4, %zmm17
+; AVX512DQ-NEXT: vmovdqa64 %ymm27, %ymm4
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1],ymm6[2,3],ymm4[4,5],ymm6[6,7]
; AVX512DQ-NEXT: vextracti128 $1, %ymm12, %xmm15
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,1,2,3]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,2,2,3,4,5,6,7]
@@ -7723,48 +7738,49 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm2[2,3,4,5,6,7]
; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm24 ^ (zmm25 & (zmm2 ^ zmm24))
-; AVX512DQ-NEXT: vinserti32x8 $1, %ymm17, %zmm0, %zmm2 {%k1}
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0],ymm3[1],ymm10[2,3],ymm3[4],ymm10[5,6,7]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4,5,6,7,8],ymm4[9],ymm3[10,11,12,13,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2],ymm0[3],ymm7[4,5],ymm0[6],ymm7[7]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm10 = xmm4[0,1,2,3,6,4,6,7]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm4
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7]
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7]
+; AVX512DQ-NEXT: vinserti32x8 $1, %ymm26, %zmm0, %zmm2 {%k1}
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0],ymm7[1],ymm10[2,3],ymm7[4],ymm10[5,6,7]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm7[1],ymm3[2,3,4,5,6,7,8],ymm7[9],ymm3[10,11,12,13,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1,2],ymm6[3],ymm4[4,5],ymm6[6],ymm4[7]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm10 = xmm7[0,1,2,3,6,4,6,7]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm7
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,5,7,6,7]
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7]
; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5,6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0],ymm5[1],ymm13[2,3,4],ymm5[5],ymm13[6,7]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm10
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0],xmm4[1],xmm10[2],xmm4[3],xmm10[4,5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm3[2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm13[0],ymm5[1],ymm13[2,3,4],ymm5[5],ymm13[6,7]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm10
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm10[0],xmm7[1],xmm10[2],xmm7[3],xmm10[4,5,6,7]
; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm0
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1],ymm6[2],ymm0[3,4],ymm6[5],ymm0[6,7]
+; AVX512DQ-NEXT: vmovdqa64 %ymm23, %ymm4
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7]
; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm6
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,1,3,4,5,6,7]
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,1,2,3]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,0,2,3,4,5,6,7]
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1],ymm1[2],ymm14[3,4,5],ymm1[6],ymm14[7]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,3,1]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[4,5,10,11,u,u,u,u,u,u,u,u,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm5[1,2],ymm1[3,4,5,6,7]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,6,7,4,5,2,3,0,1,14,15]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0,1,2],ymm4[3,4,5,6,7],ymm1[8,9,10],ymm4[11,12,13,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm27 ^ (zmm25 & (zmm3 ^ zmm27))
-; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm3 {%k1}
-; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm1 & (zmm8 ^ zmm18))
-; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm21 = zmm0 ^ (mem & (zmm21 ^ zmm0))
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (zmm1 & (zmm9 ^ zmm21))
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm11[2],ymm14[3,4,5],ymm11[6],ymm14[7]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,3,1]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,10,11,u,u,u,u,u,u,u,u,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1,2],ymm0[3,4,5,6,7]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm7[u,u,u,u,u,u,6,7,4,5,2,3,0,1,14,15]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm0[0,1,2],ymm5[3,4,5,6,7],ymm0[8,9,10],ymm5[11,12,13,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm17 ^ (zmm25 & (zmm3 ^ zmm17))
+; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm3 {%k1}
+; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm0 & (zmm8 ^ zmm18))
+; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm21 = zmm4 ^ (mem & (zmm21 ^ zmm4))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (zmm0 & (zmm9 ^ zmm21))
; AVX512DQ-NEXT: vmovdqa64 %zmm8, (%rsi)
; AVX512DQ-NEXT: vmovdqa64 %zmm9, (%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm22, (%rcx)
; AVX512DQ-NEXT: vmovdqa64 %zmm19, (%r8)
-; AVX512DQ-NEXT: vmovdqa64 %zmm11, (%r9)
+; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%r9)
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rax)
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
@@ -7774,295 +7790,290 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
;
; AVX512DQ-FCP-LABEL: load_i16_stride7_vf32:
; AVX512DQ-FCP: # %bb.0:
-; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm26
-; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm30
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [2,6,9,13,2,6,9,13]
-; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm22
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [2,5,9,12,2,5,9,12]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [3,6,10,13,3,6,10,13]
-; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm3
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [2,6,9,0,13,0,0,0]
-; AVX512DQ-FCP-NEXT: vpermd %zmm30, %zmm0, %zmm4
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [1,0,0,0,4,8,11,15]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [2,5,9,0,12,0,0,0]
-; AVX512DQ-FCP-NEXT: vpermd %zmm30, %zmm0, %zmm10
-; AVX512DQ-FCP-NEXT: vpermd %zmm3, %zmm18, %zmm0
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,4,7,11,14]
-; AVX512DQ-FCP-NEXT: vpermd %zmm26, %zmm2, %zmm2
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [8,1,12,5,12,5,14,15]
-; AVX512DQ-FCP-NEXT: vpermd %zmm30, %zmm5, %zmm5
-; AVX512DQ-FCP-NEXT: vpermd %zmm22, %zmm19, %zmm6
-; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm25
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm25[0,1,0,2]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm12, %ymm7
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm8, %ymm29
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3,4,5,6],ymm7[7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[6,7,12,13,2,3,16,17,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm5
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm6
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm6[2],ymm5[3,4,5],ymm6[6],ymm5[7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm13
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm13[4],xmm11[5],xmm13[6],xmm11[7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vporq %ymm8, %ymm11, %ymm23
-; AVX512DQ-FCP-NEXT: vpbroadcastw 252(%rdi), %xmm8
-; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %xmm13
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,u,u,u,u,0,1,14,15,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm11[2],xmm8[2],xmm11[3],xmm8[3]
+; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm1
+; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm31
+; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2
+; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm26
+; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm30
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm30[0,1,0,2]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm12[0,1,2,3,4,5,6,7,8,9,10,11,4,5,10,11,16,17,18,19,20,21,22,23,24,25,26,27,20,21,26,27]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm29 = [3,6,10,13,3,6,10,13]
+; AVX512DQ-FCP-NEXT: vpermd %zmm26, %zmm29, %zmm4
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [8,1,12,5,12,5,14,15]
+; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm4, %zmm4
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[6,7,12,13,2,3,16,17,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm4
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm5
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1],ymm5[2],ymm4[3,4,5],ymm5[6],ymm4[7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm8[4],xmm7[5],xmm8[6],xmm7[7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,1,14,15,12,13,10,11,8,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vporq %ymm6, %ymm7, %ymm24
+; AVX512DQ-FCP-NEXT: vpbroadcastw 252(%rdi), %xmm6
+; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %xmm11
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[u,u,u,u,u,u,u,u,0,1,14,15,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; AVX512DQ-FCP-NEXT: movw $992, %ax # imm = 0x3E0
; AVX512DQ-FCP-NEXT: kmovw %eax, %k1
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm7, %zmm23 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %ymm7
-; AVX512DQ-FCP-NEXT: vmovdqa 288(%rdi), %ymm8
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm14
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm14[0,1,2],xmm11[3],xmm14[4],xmm11[5],xmm14[6,7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,10,11,8,9,6,7,4,5,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm2[0,1,2],xmm11[3,4,5,6],xmm2[7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm27
-; AVX512DQ-FCP-NEXT: vmovdqa 240(%rdi), %xmm15
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm11
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3,4,5],xmm0[6],xmm11[7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vporq %ymm10, %ymm0, %ymm21
-; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm11
-; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm10
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm11[2],ymm10[3,4,5],ymm11[6],ymm10[7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm9
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm9[4],xmm0[5],xmm9[6],xmm0[7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm9[7]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm15[0],xmm13[1],xmm15[2,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm9, %xmm9
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm24
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm9
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm9[1],xmm4[2,3,4,5],xmm9[6],xmm4[7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm4, %ymm9
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm0[1],xmm4[2,3,4,5],xmm0[6],xmm4[7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm4
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm12 = [2,5,2,5,2,5,2,5]
-; AVX512DQ-FCP-NEXT: vpermd %ymm25, %ymm12, %ymm12
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm4[0,1,2,3,4,5,6],ymm12[7]
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm4, %xmm14
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm31
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm12, %zmm16
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm14
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0,1,2,3],xmm12[4],xmm14[5],xmm12[6],xmm14[7]
-; AVX512DQ-FCP-NEXT: vpermd %zmm26, %zmm20, %zmm14
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3,4,5,6],xmm14[7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm14[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpermd %zmm3, %zmm17, %zmm14
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm14[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm12, %ymm28
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm7[0,1],ymm8[2],ymm7[3,4,5],ymm8[6],ymm7[7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm14, %xmm12
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0,1,2,3],xmm12[4],xmm14[5],xmm12[6],xmm14[7]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [10,3,6,15,12,13,6,15]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [1,0,0,0,5,8,12,15]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm12, %ymm0
-; AVX512DQ-FCP-NEXT: vpermd %zmm26, %zmm20, %zmm12
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[2,3,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm12, %ymm0
-; AVX512DQ-FCP-NEXT: vpermd %zmm3, %zmm19, %zmm3
-; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm2
-; AVX512DQ-FCP-NEXT: vpermd %zmm30, %zmm14, %zmm3
-; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm19 & (zmm16 ^ zmm9))
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm3, %zmm24 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %ymm6
+; AVX512DQ-FCP-NEXT: vmovdqa 288(%rdi), %ymm7
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm8
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1,2],xmm3[3],xmm8[4],xmm3[5],xmm8[6,7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,10,11,8,9,6,7,4,5,u,u]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,4,7,11,14]
+; AVX512DQ-FCP-NEXT: vpermd %zmm31, %zmm8, %zmm8
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1,2],xmm3[3,4,5,6],xmm8[7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm25
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [2,5,9,12,2,5,9,12]
+; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm19, %zmm3
+; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm0 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm3
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm21 = [4,5,6,11]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm25
+; AVX512DQ-FCP-NEXT: vmovdqa 240(%rdi), %xmm14
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm8
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0],xmm3[1],xmm8[2,3,4,5],xmm3[6],xmm8[7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm3, %ymm3
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [2,5,9,0,12,0,0,0]
+; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm8, %zmm8
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vporq %ymm8, %ymm3, %ymm22
+; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm3
+; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm8
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1],ymm3[2],ymm8[3,4,5],ymm3[6],ymm8[7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm15, %xmm10
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm15[0,1,2,3],xmm10[4],xmm15[5],xmm10[6],xmm15[7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,0,1,14,15,12,13,10,11,8,9,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,28,29]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm12[7]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0],xmm11[1],xmm14[2,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm12, %xmm12
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm10, %zmm27
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm12
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2,3],xmm10[4],xmm12[5],xmm10[6],xmm12[7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,12,13,10,11,8,9,6,7,u,u]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [1,0,0,0,4,8,11,15]
+; AVX512DQ-FCP-NEXT: vpermd %zmm31, %zmm12, %zmm12
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3,4,5,6],xmm12[7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm28
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [2,6,9,13,2,6,9,13]
+; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm20, %zmm10
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm21, %zmm28
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [2,6,9,0,13,0,0,0]
+; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm10, %zmm10
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,3,4,5,10,11,16,17],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3,4,5],xmm13[6],xmm12[7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[4,5,2,3,0,1,14,15,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vporq %ymm10, %ymm12, %ymm23
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm8[0,1,2],ymm3[3],ymm8[4,5],ymm3[6],ymm8[7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm12
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0],xmm10[1],xmm12[2,3,4,5],xmm10[6],xmm12[7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm10, %xmm10
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm30[0,1,1,2]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,30,31]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm12[7]
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm10, %zmm16
+; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm6[0,1],ymm7[2],ymm6[3,4,5],ymm7[6],ymm6[7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm15
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm15[4],xmm10[5],xmm15[6],xmm10[7]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm10, %ymm10
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [1,0,0,0,5,8,12,15]
+; AVX512DQ-FCP-NEXT: vpermd %zmm31, %zmm13, %zmm13
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[2,3,16,17,22,23,24,25,30,31],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vpor %ymm13, %ymm10, %ymm10
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm18
+; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm29, %zmm1
+; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm21, %zmm18
; AVX512DQ-FCP-NEXT: movw $-512, %ax # imm = 0xFE00
; AVX512DQ-FCP-NEXT: kmovw %eax, %k1
-; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm16 {%k1}
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5,6,7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3,4,5],xmm3[6],xmm0[7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm25[0,1,1,3]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm9[7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm20
-; AVX512DQ-FCP-NEXT: vmovdqa 416(%rdi), %ymm3
-; AVX512DQ-FCP-NEXT: vmovdqa 384(%rdi), %ymm4
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1],ymm3[2],ymm4[3,4,5],ymm3[6],ymm4[7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm9, %xmm12
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm12[4],xmm9[5],xmm12[6],xmm9[7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm14
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0],xmm12[1],xmm14[2,3,4,5],xmm12[6],xmm14[7]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [2,11,2,11,12,5,8,9]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,0,1,14,15,12,13,10,11,8,9]
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512DQ-FCP-NEXT: vpermd %zmm26, %zmm14, %zmm14
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[0,1,22,23,28,29,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm14[0,1,2],ymm9[3,4,5,6,7],ymm14[8,9,10],ymm9[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero
-; AVX512DQ-FCP-NEXT: vpor %ymm14, %ymm12, %ymm12
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm19 & (zmm20 ^ zmm2))
-; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm9, %zmm0, %zmm20 {%k1}
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm9, %xmm12
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [10,3,6,15,12,13,6,15]
+; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm1, %zmm1
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[4,5,10,11,0,1,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm13
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm13[0],xmm10[1],xmm13[2],xmm10[3],xmm13[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %ymm1, %ymm10, %ymm1
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0],ymm8[1],ymm3[2,3],ymm8[4],ymm3[5,6,7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm13
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm13[1],xmm10[2,3,4,5],xmm13[6],xmm10[7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,4,5,2,3,0,1,14,15,12,13,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm30[0,1,1,3]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,24,25]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm15[7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[12,13,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm10, %zmm21
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm17 & (zmm21 ^ zmm1))
+; AVX512DQ-FCP-NEXT: vmovdqa 416(%rdi), %ymm15
+; AVX512DQ-FCP-NEXT: vmovdqa 384(%rdi), %ymm1
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1],ymm15[2],ymm1[3,4,5],ymm15[6],ymm1[7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm12
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm12[4],xmm10[5],xmm12[6],xmm10[7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,0,1,14,15,12,13,10,11,8,9]
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm6[0,1,2],ymm7[3],ymm6[4,5],ymm7[6],ymm6[7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm9
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm12[1],xmm9[2,3,4,5],xmm12[6],xmm9[7]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [2,11,2,11,12,5,8,9]
+; AVX512DQ-FCP-NEXT: vpermd %zmm31, %zmm12, %zmm12
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[0,1,22,23,28,29,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0,1,2],ymm10[3,4,5,6,7],ymm12[8,9,10],ymm10[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero
+; AVX512DQ-FCP-NEXT: vpor %ymm12, %ymm9, %ymm9
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm9, %zmm0, %zmm21 {%k1}
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm9, %xmm10
; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm9, %xmm9
; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[3,1,2,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,3,7,10,14,0,0,0]
-; AVX512DQ-FCP-NEXT: vpermd %zmm30, %zmm9, %zmm14
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm14, %ymm14
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm14[2,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2,3,4],ymm10[5],ymm11[6,7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm11
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3],xmm11[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm9
-; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-FCP-NEXT: vpbroadcastw 232(%rdi), %xmm10
-; AVX512DQ-FCP-NEXT: vpsrlq $48, %xmm15, %xmm11
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm0, %zmm10
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm19 & (zmm10 ^ zmm12))
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm11
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3,4,5],xmm0[6],xmm11[7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6,7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3,4,5],xmm12[6],xmm11[7]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [2,0,0,0,6,9,13,0]
-; AVX512DQ-FCP-NEXT: vpermd %zmm26, %zmm12, %zmm12
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,3,7,10,14,0,0,0]
+; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm10, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm0[2,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm8[1],ymm3[2,3,4],ymm8[5],ymm3[6,7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm8
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0],xmm3[1],xmm8[2],xmm3[3],xmm8[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm13[0,1,2,3,4,5,6,7,8,9,10,11,4,5,10,11,16,17,18,19,20,21,22,23,24,25,26,27,20,21,26,27]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm8[7]
+; AVX512DQ-FCP-NEXT: vpbroadcastw 232(%rdi), %xmm8
+; AVX512DQ-FCP-NEXT: vpsrlq $48, %xmm14, %xmm9
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm3, %zmm3
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm17 & (zmm3 ^ zmm0))
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm15[3],ymm1[4,5],ymm15[6],ymm1[7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm8
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0],xmm0[1],xmm8[2,3,4,5],xmm0[6],xmm8[7]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,2,3,0,1,14,15,12,13,10,11]
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[2,3,16,17,22,23,24,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3,4,5,6,7],ymm12[8,9,10],ymm0[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero
-; AVX512DQ-FCP-NEXT: vpor %ymm12, %ymm11, %ymm11
-; AVX512DQ-FCP-NEXT: vpermd %zmm22, %zmm18, %zmm12
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm10 {%k1}
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6,7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3,4,5],xmm9[6],xmm8[7]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [2,0,0,0,6,9,13,0]
+; AVX512DQ-FCP-NEXT: vpermd %zmm31, %zmm9, %zmm9
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[2,3,16,17,22,23,24,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3,4,5,6,7],ymm9[8,9,10],ymm0[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,2,3,0,1,14,15,12,13],zero,zero
+; AVX512DQ-FCP-NEXT: vpor %ymm9, %ymm8, %ymm8
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm3 {%k1}
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,3,3,0,0,3,7,0]
-; AVX512DQ-FCP-NEXT: vpermd %ymm25, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,0,1,6,7,8,9,14,15,0,1,6,7,8,9,16,17,16,17,22,23,24,25,30,31,16,17,22,23,24,25]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm12, %ymm11
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm0[5,6,7],ymm11[8,9,10,11,12],ymm0[13,14,15]
-; AVX512DQ-FCP-NEXT: vpsrld $16, %xmm13, %xmm11
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm11[4],xmm15[4],xmm11[5],xmm15[5],xmm11[6],xmm15[6],xmm11[7],xmm15[7]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm11, %zmm0, %zmm12
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm11
-; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm11, %xmm2
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,4,7,11,14,0,0,0]
-; AVX512DQ-FCP-NEXT: vpermd %zmm30, %zmm11, %zmm11
+; AVX512DQ-FCP-NEXT: vpermd %ymm30, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,0,1,6,7,8,9,14,15,0,1,6,7,8,9,16,17,16,17,22,23,24,25,30,31,16,17,22,23,24,25]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vpermd %zmm26, %zmm19, %zmm9
+; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm9, %ymm8
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm0[5,6,7],ymm8[8,9,10,11,12],ymm0[13,14,15]
+; AVX512DQ-FCP-NEXT: vpsrld $16, %xmm11, %xmm8
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7]
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm19
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm8
+; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm8, %xmm8
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm11[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm0[0,1],ymm2[2,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6,7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7]
-; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,4,7,0,0,4,7,0]
-; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT: vpermd %ymm25, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [4,5,2,3,4,5,10,11,12,13,2,3,4,5,10,11,20,21,18,19,20,21,26,27,28,29,18,19,20,21,26,27]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT: vpermd %zmm22, %zmm17, %zmm9
-; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm9, %ymm9
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0,1,2,3,4],ymm2[5,6,7],ymm9[8,9,10,11,12],ymm2[13,14,15]
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm13
-; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm9, %xmm9
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm2, %zmm2
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm9, %xmm13
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm13[0],xmm9[1],xmm13[2],xmm9[3],xmm13[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [3,0,0,0,6,10,13,0]
-; AVX512DQ-FCP-NEXT: vpermd %zmm26, %zmm13, %zmm13
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,4,7,11,14,0,0,0]
+; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm8, %zmm8
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1],ymm8[2,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm1[1],ymm15[2,3],ymm1[4],ymm15[5,6,7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm9
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm9[1],xmm0[2,3,4,5],xmm9[6],xmm0[7]
+; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,4,7,0,0,4,7,0]
+; AVX512DQ-FCP-NEXT: # ymm9 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT: vpermd %ymm30, %ymm9, %ymm9
+; AVX512DQ-FCP-NEXT: vpermd %zmm26, %zmm20, %zmm12
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [4,5,2,3,4,5,10,11,12,13,2,3,4,5,10,11,20,21,18,19,20,21,26,27,28,29,18,19,20,21,26,27]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm9, %ymm9
+; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm12, %ymm12
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm12[0,1,2,3,4],ymm9[5,6,7],ymm12[8,9,10,11,12],ymm9[13,14,15]
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm11[4],xmm14[4],xmm11[5],xmm14[5],xmm11[6],xmm14[6],xmm11[7],xmm14[7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,4,5,2,3,0,1,14,15,12,13]
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[0,1,18,19,20,21,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3,4,5,6,7],ymm13[8,9,10],ymm0[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero
-; AVX512DQ-FCP-NEXT: vpor %ymm13, %ymm9, %ymm9
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm12 ^ (zmm19 & (zmm11 ^ zmm12))
-; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm11 {%k1}
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [3,0,0,0,6,10,13,0]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm9, %zmm9
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0],ymm6[1],ymm7[2,3,4],ymm6[5],ymm7[6,7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm13
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0],xmm11[1],xmm13[2],xmm11[3],xmm13[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpermd %zmm31, %zmm12, %zmm12
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[0,1,18,19,20,21,26,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3,4,5,6,7],ymm12[8,9,10],ymm0[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero
+; AVX512DQ-FCP-NEXT: vpor %ymm12, %ymm11, %ymm11
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm19 ^ (zmm17 & (zmm8 ^ zmm19))
+; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm8 {%k1}
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,4,8,11,15,0,0,0]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm27, %zmm0, %zmm9
-; AVX512DQ-FCP-NEXT: vpermd %zmm30, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm5
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7]
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2],xmm1[3],xmm3[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,10,11,6,7,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,4,6,7]
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm0[2,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm1[1],ymm15[2,3,4],ymm1[5],ymm15[6,7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4
; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,1,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,8,9,4,5,6,7,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,4,5,6,7,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,10,3,14,7,10,3]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm28, %zmm0, %zmm5
-; AVX512DQ-FCP-NEXT: vpermd %zmm26, %zmm4, %zmm4
+; AVX512DQ-FCP-NEXT: vpermd %zmm31, %zmm4, %zmm4
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[4,5,10,11,u,u,u,u,u,u,u,u,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,6,7,4,5,2,3,0,1,14,15]
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm2 ^ (zmm19 & (zmm0 ^ zmm2))
-; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 {%k1}
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm1[1,2],ymm4[3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,6,7,4,5,2,3,0,1,14,15]
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm9 ^ (zmm17 & (zmm2 ^ zmm9))
+; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm2 {%k1}
+; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm25, %zmm0, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (zmm1 & (zmm9 ^ zmm23))
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm24 = zmm21 ^ (mem & (zmm24 ^ zmm21))
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm1 & (zmm5 ^ zmm24))
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%rdx)
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm1 & (zmm0 ^ zmm24))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm27 = zmm22 ^ (mem & (zmm27 ^ zmm22))
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm28, %zmm0, %zmm4
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm1 & (zmm4 ^ zmm27))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm17 & (zmm16 ^ zmm23))
+; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm18, %zmm0, %zmm16 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rsi)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, (%rcx)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, (%r8)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, (%r9)
-; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, (%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, (%r8)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%r9)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, (%rax)
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rax)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
@@ -12052,9 +12063,9 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3]
; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-NEXT: vmovdqa 656(%rdi), %xmm13
-; AVX2-NEXT: vmovdqa 640(%rdi), %xmm14
-; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0,1,2,3,4,5],xmm13[6],xmm14[7]
+; AVX2-NEXT: vmovdqa 640(%rdi), %xmm13
+; AVX2-NEXT: vmovdqa 656(%rdi), %xmm14
+; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm13[0,1,2,3,4,5],xmm14[6],xmm13[7]
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,3]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,7,6]
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
@@ -12082,11 +12093,11 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2],xmm4[3],xmm1[4],xmm4[5,6,7]
; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-NEXT: vmovdqa 432(%rdi), %xmm4
+; AVX2-NEXT: vmovdqa 416(%rdi), %xmm4
; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vmovdqa 416(%rdi), %xmm5
+; AVX2-NEXT: vmovdqa 432(%rdi), %xmm5
; AVX2-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm4[6],xmm5[7]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm4[0,1,2,3,4,5],xmm5[6],xmm4[7]
; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,3]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,7,6]
; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
@@ -12114,9 +12125,9 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4],xmm1[5,6,7]
; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6
-; AVX2-NEXT: vmovdqa 880(%rdi), %xmm1
-; AVX2-NEXT: vmovdqa 864(%rdi), %xmm0
-; AVX2-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0,1,2,3,4,5],xmm1[6],xmm0[7]
+; AVX2-NEXT: vmovdqa 864(%rdi), %xmm1
+; AVX2-NEXT: vmovdqa 880(%rdi), %xmm0
+; AVX2-NEXT: vpblendw {{.*#+}} xmm7 = xmm1[0,1,2,3,4,5],xmm0[6],xmm1[7]
; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,0,3]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,7,6]
; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
@@ -12142,9 +12153,9 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2],xmm6[3],xmm5[4],xmm6[5,6,7]
; AVX2-NEXT: vpshufb %xmm3, %xmm5, %xmm3
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm5
-; AVX2-NEXT: vmovdqa 208(%rdi), %xmm15
-; AVX2-NEXT: vmovdqa 192(%rdi), %xmm3
-; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,5],xmm15[6],xmm3[7]
+; AVX2-NEXT: vmovdqa 192(%rdi), %xmm15
+; AVX2-NEXT: vmovdqa 208(%rdi), %xmm3
+; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm15[0,1,2,3,4,5],xmm3[6],xmm15[7]
; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,3]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,7,6]
; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
@@ -12178,7 +12189,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7]
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm5
; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3],xmm5[4],xmm2[5],xmm5[6,7]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm7 = xmm13[0],xmm14[1],xmm13[2,3,4,5,6,7]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm7 = xmm14[0],xmm13[1],xmm14[2,3,4,5,6,7]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5]
; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
@@ -12207,7 +12218,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: # ymm9 = ymm8[0,1],mem[2,3],ymm8[4,5],mem[6,7]
; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm10
; AVX2-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3],xmm10[4],xmm9[5],xmm10[6,7]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
; AVX2-NEXT: vpshufb %xmm5, %xmm9, %xmm1
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
@@ -12236,8 +12247,8 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vextracti128 $1, %ymm7, %xmm9
; AVX2-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1,2],xmm7[3],xmm9[4],xmm7[5],xmm9[6,7]
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX2-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
-; AVX2-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3,4,5,6,7]
+; AVX2-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX2-NEXT: # xmm4 = mem[0],xmm4[1],mem[2,3,4,5,6,7]
; AVX2-NEXT: vpshufb %xmm5, %xmm7, %xmm7
; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3]
@@ -12262,7 +12273,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm7
; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3],xmm7[4],xmm4[5],xmm7[6,7]
; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4
-; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0],xmm3[1],xmm15[2,3,4,5,6,7]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm15[1],xmm3[2,3,4,5,6,7]
; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,3]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,7]
@@ -12797,7 +12808,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7]
-; AVX2-FP-NEXT: vmovd {{.*#+}} xmm12 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm12 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-FP-NEXT: vpshufb %xmm12, %xmm2, %xmm4
; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm2
; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
@@ -12913,11 +12924,11 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3]
; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm1
; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FP-NEXT: vmovdqa 656(%rdi), %xmm3
+; AVX2-FP-NEXT: vmovdqa 640(%rdi), %xmm3
; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FP-NEXT: vmovdqa 640(%rdi), %xmm2
+; AVX2-FP-NEXT: vmovdqa 656(%rdi), %xmm2
; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6],xmm2[7]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,5],xmm2[6],xmm3[7]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,12,13]
; AVX2-FP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
@@ -12926,9 +12937,9 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm4
-; AVX2-FP-NEXT: vmovd {{.*#+}} xmm13 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-FP-NEXT: vpshufb %xmm13, %xmm4, %xmm8
-; AVX2-FP-NEXT: vmovd {{.*#+}} xmm4 = [10,11,6,7,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm4 = [10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-FP-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29]
@@ -12945,11 +12956,11 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0,1],xmm1[2],xmm8[3],xmm1[4],xmm8[5,6,7]
; AVX2-FP-NEXT: vpshufb %xmm5, %xmm1, %xmm1
; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FP-NEXT: vmovdqa 432(%rdi), %xmm9
+; AVX2-FP-NEXT: vmovdqa 416(%rdi), %xmm9
; AVX2-FP-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FP-NEXT: vmovdqa 416(%rdi), %xmm8
+; AVX2-FP-NEXT: vmovdqa 432(%rdi), %xmm8
; AVX2-FP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5],xmm9[6],xmm8[7]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2,3,4,5],xmm8[6],xmm9[7]
; AVX2-FP-NEXT: vpshufb %xmm3, %xmm8, %xmm8
; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5,6,7],ymm1[8,9,10,11,12],ymm8[13,14,15]
@@ -12974,9 +12985,9 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4],xmm1[5,6,7]
; AVX2-FP-NEXT: vpshufb %xmm5, %xmm0, %xmm0
; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm11
-; AVX2-FP-NEXT: vmovdqa 880(%rdi), %xmm1
-; AVX2-FP-NEXT: vmovdqa 864(%rdi), %xmm0
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm15 = xmm0[0,1,2,3,4,5],xmm1[6],xmm0[7]
+; AVX2-FP-NEXT: vmovdqa 864(%rdi), %xmm1
+; AVX2-FP-NEXT: vmovdqa 880(%rdi), %xmm0
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm15 = xmm1[0,1,2,3,4,5],xmm0[6],xmm1[7]
; AVX2-FP-NEXT: vpshufb %xmm3, %xmm15, %xmm15
; AVX2-FP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm15[5,6,7],ymm11[8,9,10,11,12],ymm15[13,14,15]
@@ -12998,9 +13009,9 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm11
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm11[0,1],xmm8[2],xmm11[3],xmm8[4],xmm11[5,6,7]
; AVX2-FP-NEXT: vpshufb %xmm5, %xmm8, %xmm8
-; AVX2-FP-NEXT: vmovdqa 208(%rdi), %xmm14
-; AVX2-FP-NEXT: vmovdqa 192(%rdi), %xmm5
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm13 = xmm5[0,1,2,3,4,5],xmm14[6],xmm5[7]
+; AVX2-FP-NEXT: vmovdqa 192(%rdi), %xmm14
+; AVX2-FP-NEXT: vmovdqa 208(%rdi), %xmm5
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0,1,2,3,4,5],xmm5[6],xmm14[7]
; AVX2-FP-NEXT: vpshufb %xmm3, %xmm13, %xmm3
; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
@@ -13032,8 +13043,8 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm4
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3],xmm4[4],xmm3[5],xmm4[6,7]
; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX2-FP-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm7 # 16-byte Folded Reload
-; AVX2-FP-NEXT: # xmm7 = xmm4[0],mem[1],xmm4[2,3,4,5,6,7]
+; AVX2-FP-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm7 # 16-byte Folded Reload
+; AVX2-FP-NEXT: # xmm7 = mem[0],xmm4[1],mem[2,3,4,5,6,7]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm4 = [8,9,10,11,8,9,6,7,4,5,4,5,4,5,4,5]
; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
@@ -13045,7 +13056,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm7 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm7 = mem[0,1,2],ymm3[3],mem[4,5],ymm3[6],mem[7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm9
-; AVX2-FP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,10,11,6,7,4,5,6,7]
; AVX2-FP-NEXT: vpshufb %xmm3, %xmm9, %xmm9
; AVX2-FP-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,4,6,7]
; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7]
@@ -13063,7 +13074,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: # ymm9 = ymm9[0,1],mem[2,3],ymm9[4,5],mem[6,7]
; AVX2-FP-NEXT: vextracti128 $1, %ymm9, %xmm12
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm9 = xmm12[0,1,2],xmm9[3],xmm12[4],xmm9[5],xmm12[6,7]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
; AVX2-FP-NEXT: vpshufb %xmm4, %xmm9, %xmm1
; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FP-NEXT: vpshufb %xmm6, %xmm0, %xmm0
@@ -13091,8 +13102,8 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm9
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4],xmm8[5],xmm9[6,7]
; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX2-FP-NEXT: vpblendw $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
-; AVX2-FP-NEXT: # xmm9 = xmm9[0],mem[1],xmm9[2,3,4,5,6,7]
+; AVX2-FP-NEXT: vpblendw $253, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
+; AVX2-FP-NEXT: # xmm9 = mem[0],xmm9[1],mem[2,3,4,5,6,7]
; AVX2-FP-NEXT: vpshufb %xmm4, %xmm8, %xmm8
; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
; AVX2-FP-NEXT: vpshufb %xmm6, %xmm9, %xmm9
@@ -13115,7 +13126,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm9
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4],xmm8[5],xmm9[6,7]
; AVX2-FP-NEXT: vpshufb %xmm4, %xmm8, %xmm4
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm14[0],xmm5[1],xmm14[2,3,4,5,6,7]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm14[1],xmm5[2,3,4,5,6,7]
; AVX2-FP-NEXT: vpshufb %xmm6, %xmm5, %xmm5
; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
@@ -13638,7 +13649,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
-; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm10 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm3
; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
@@ -13693,7 +13704,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7]
-; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm1 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm3
; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
@@ -13755,9 +13766,9 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm2[0,1,2,3,4],ymm1[5,6,7],ymm2[8,9,10,11,12],ymm1[13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm7[2,3],ymm13[4,5],ymm7[6,7]
; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm4 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7]
; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,4,7,3,6,0,0,0]
@@ -13783,7 +13794,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13
; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm13, %xmm13
; AVX2-FCP-NEXT: vmovdqa %xmm4, %xmm6
-; AVX2-FCP-NEXT: vmovd {{.*#+}} xmm2 = [10,11,6,7,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-FCP-NEXT: vpshufb %xmm2, %xmm12, %xmm12
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
@@ -13838,8 +13849,8 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm13[2,3],ymm11[4,5],ymm13[6,7]
; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,4,5,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
@@ -13862,7 +13873,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX2-FCP-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm7 = ymm5[0,1,2],mem[3],ymm5[4,5],mem[6],ymm5[7]
; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm10
-; AVX2-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7]
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,6,7,10,11,6,7,4,5,6,7]
; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm10, %xmm10
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,4,6,7]
; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
@@ -13997,48 +14008,47 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
;
; AVX512-LABEL: load_i16_stride7_vf64:
; AVX512: # %bb.0:
-; AVX512-NEXT: subq $1864, %rsp # imm = 0x748
-; AVX512-NEXT: vmovdqa 480(%rdi), %ymm1
+; AVX512-NEXT: subq $1800, %rsp # imm = 0x708
+; AVX512-NEXT: vmovdqa 480(%rdi), %ymm11
; AVX512-NEXT: vmovdqa 448(%rdi), %ymm2
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2],ymm2[3,4,5],ymm1[6],ymm2[7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm11[2],ymm2[3,4,5],ymm11[6],ymm2[7]
; AVX512-NEXT: vmovdqa64 %ymm2, %ymm16
-; AVX512-NEXT: vmovdqa64 %ymm1, %ymm18
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm2
-; AVX512-NEXT: vmovdqa 512(%rdi), %ymm13
+; AVX512-NEXT: vmovdqa 512(%rdi), %ymm12
; AVX512-NEXT: vmovdqa 544(%rdi), %ymm10
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0],ymm13[1],ymm10[2,3,4],ymm13[5],ymm10[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0],ymm12[1],ymm10[2,3,4],ymm12[5],ymm10[6,7]
; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,2,3]
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,6,7,12,13,2,3,16,17,30,31,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX512-NEXT: vpshufb %ymm1, %ymm3, %ymm3
; AVX512-NEXT: vpor %ymm3, %ymm2, %ymm2
; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpbroadcastw 700(%rdi), %xmm2
-; AVX512-NEXT: vmovdqa64 672(%rdi), %xmm22
-; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm22[0,1,0,3]
+; AVX512-NEXT: vmovdqa64 672(%rdi), %xmm21
+; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm21[0,1,0,3]
; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7]
; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill
; AVX512-NEXT: vmovdqa (%rdi), %ymm8
-; AVX512-NEXT: vmovdqa 32(%rdi), %ymm5
-; AVX512-NEXT: vmovdqa 64(%rdi), %ymm14
-; AVX512-NEXT: vmovdqa 96(%rdi), %ymm15
-; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0],ymm14[1],ymm15[2,3,4],ymm14[5],ymm15[6,7]
+; AVX512-NEXT: vmovdqa 32(%rdi), %ymm6
+; AVX512-NEXT: vmovdqa 64(%rdi), %ymm9
+; AVX512-NEXT: vmovdqa 96(%rdi), %ymm7
+; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm9[1],ymm7[2,3,4],ymm9[5],ymm7[6,7]
; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
; AVX512-NEXT: vpshufb %ymm1, %ymm3, %ymm1
-; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm5[2],ymm8[3,4,5],ymm5[6],ymm8[7]
-; AVX512-NEXT: vmovdqa %ymm5, %ymm6
+; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm6[2],ymm8[3,4,5],ymm6[6],ymm8[7]
; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm4
; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7]
; AVX512-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa 128(%rdi), %ymm9
-; AVX512-NEXT: vmovdqa 160(%rdi), %ymm11
-; AVX512-NEXT: vmovdqa64 192(%rdi), %ymm21
-; AVX512-NEXT: vpermq {{.*#+}} ymm24 = ymm21[0,1,0,2]
+; AVX512-NEXT: vmovdqa 128(%rdi), %ymm2
+; AVX512-NEXT: vmovdqa 160(%rdi), %ymm14
+; AVX512-NEXT: vmovdqa64 192(%rdi), %ymm19
+; AVX512-NEXT: vpermq {{.*#+}} ymm15 = ymm19[0,1,0,2]
+; AVX512-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT: vpbroadcastw 252(%rdi), %xmm0
; AVX512-NEXT: vmovdqa 224(%rdi), %xmm4
; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,1,0,3]
@@ -14046,81 +14056,85 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm3[2],xmm0[2],xmm3[3],xmm0[3]
; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vmovdqa 80(%rdi), %xmm0
-; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm9[2],ymm7[3,4],ymm9[5],ymm7[6,7]
+; AVX512-NEXT: vmovdqa %ymm7, %ymm1
; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6,7,8,9,10],ymm0[11],ymm3[12,13,14,15]
; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,8,9,6,7,4,5,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX512-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2],ymm5[3],ymm8[4,5],ymm5[6],ymm8[7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2],ymm6[3],ymm8[4,5],ymm6[6],ymm8[7]
; AVX512-NEXT: vmovdqa64 %ymm8, %ymm17
-; AVX512-NEXT: vmovdqa64 %ymm6, %ymm19
+; AVX512-NEXT: vmovdqa64 %ymm6, %ymm18
; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm6
; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3,4,5],xmm5[6],xmm6[7]
; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpshufb %ymm6, %ymm5, %ymm5
; AVX512-NEXT: vpor %ymm0, %ymm5, %ymm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm11[2],ymm9[3,4,5],ymm11[6],ymm9[7]
-; AVX512-NEXT: vmovdqa64 %ymm11, %ymm20
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm14[2],ymm2[3,4,5],ymm14[6],ymm2[7]
+; AVX512-NEXT: vmovdqa64 %ymm14, %ymm20
+; AVX512-NEXT: vmovdqa64 %ymm2, %ymm23
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm5
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4],xmm0[5],xmm5[6],xmm0[7]
; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15]
; AVX512-NEXT: vpshufb %xmm5, %xmm0, %xmm0
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm24[0,1,1,3,4,5,5,7]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm15[0,1,1,3,4,5,5,7]
; AVX512-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14]
; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1,2,3,4,5,6],ymm7[7]
; AVX512-NEXT: vmovdqa 240(%rdi), %xmm0
; AVX512-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0],xmm4[1],xmm0[2,3,4,5,6,7]
-; AVX512-NEXT: vmovdqa64 %xmm0, %xmm23
+; AVX512-NEXT: vmovdqa64 %xmm0, %xmm24
; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,3,2,3]
; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,0,3,3,4,5,6,7]
; AVX512-NEXT: vinserti32x4 $2, %xmm8, %zmm7, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa 528(%rdi), %xmm7
-; AVX512-NEXT: vmovdqa %ymm10, %ymm12
-; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1],ymm13[2],ymm10[3,4],ymm13[5],ymm10[6,7]
+; AVX512-NEXT: vmovdqa %ymm10, %ymm13
+; AVX512-NEXT: vmovdqa %ymm12, %ymm14
+; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7]
; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3],ymm8[4,5,6,7,8,9,10],ymm7[11],ymm8[12,13,14,15]
; AVX512-NEXT: vpshufb %ymm3, %ymm7, %ymm3
; AVX512-NEXT: vmovdqa64 %ymm16, %ymm10
-; AVX512-NEXT: vmovdqa64 %ymm18, %ymm11
+; AVX512-NEXT: vmovdqa %ymm11, %ymm12
; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7]
; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm8
; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3,4,5],xmm7[6],xmm8[7]
; AVX512-NEXT: vpshufb %ymm6, %ymm7, %ymm6
-; AVX512-NEXT: vpor %ymm3, %ymm6, %ymm3
-; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa 576(%rdi), %ymm0
+; AVX512-NEXT: vpor %ymm3, %ymm6, %ymm0
+; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa 576(%rdi), %ymm11
; AVX512-NEXT: vmovdqa 608(%rdi), %ymm6
-; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1],ymm6[2],ymm0[3,4,5],ymm6[6],ymm0[7]
-; AVX512-NEXT: vmovdqa64 %ymm6, %ymm26
-; AVX512-NEXT: vmovdqa64 %ymm0, %ymm30
+; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm6[2],ymm11[3,4,5],ymm6[6],ymm11[7]
+; AVX512-NEXT: vmovdqa64 %ymm6, %ymm22
; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm6
; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4],xmm3[5],xmm6[6],xmm3[7]
; AVX512-NEXT: vpshufb %xmm5, %xmm3, %xmm3
; AVX512-NEXT: vmovdqa64 640(%rdi), %ymm16
-; AVX512-NEXT: vpermq {{.*#+}} ymm18 = ymm16[0,1,0,2]
+; AVX512-NEXT: vpermq {{.*#+}} ymm25 = ymm16[0,1,0,2]
; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm18[0,1,1,3,4,5,5,7]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm25[0,1,1,3,4,5,5,7]
; AVX512-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14]
; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2,3,4,5,6],ymm5[7]
-; AVX512-NEXT: vmovdqa 688(%rdi), %xmm3
-; AVX512-NEXT: vmovdqa64 %xmm22, %xmm2
-; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0],xmm2[1],xmm3[2,3,4,5,6,7]
+; AVX512-NEXT: vmovdqa 688(%rdi), %xmm0
+; AVX512-NEXT: vmovdqa64 %xmm21, %xmm3
+; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0],xmm3[1],xmm0[2,3,4,5,6,7]
+; AVX512-NEXT: vmovdqa64 %xmm0, %xmm21
; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,3]
; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,0,3,3,4,5,6,7]
; AVX512-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1],ymm14[2,3],ymm15[4,5],ymm14[6,7]
-; AVX512-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vmovdqa64 %ymm14, %ymm22
+; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1],ymm9[2,3],ymm1[4,5],ymm9[6,7]
+; AVX512-NEXT: vmovdqa64 %ymm1, %ymm26
+; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vmovdqa %ymm9, %ymm15
+; AVX512-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm6
; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6,7,8,9,10,11],ymm6[12],ymm5[13,14,15]
-; AVX512-NEXT: vmovdqa64 %ymm17, %ymm14
+; AVX512-NEXT: vmovdqa64 %ymm17, %ymm2
; AVX512-NEXT: vmovdqu64 %ymm17, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vmovdqa64 %ymm19, %ymm1
-; AVX512-NEXT: vmovdqu64 %ymm19, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0],ymm14[1],ymm1[2,3],ymm14[4],ymm1[5,6,7]
+; AVX512-NEXT: vmovdqa64 %ymm18, %ymm1
+; AVX512-NEXT: vmovdqu64 %ymm18, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6,7]
; AVX512-NEXT: vextracti128 $1, %ymm6, %xmm7
; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3,4,5],xmm7[6],xmm6[7]
; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,10,11,8,9,6,7,20,21,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
@@ -14130,41 +14144,41 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vpor %ymm6, %ymm8, %ymm6
; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %ymm20, %ymm0
-; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2],ymm0[3],ymm9[4,5],ymm0[6],ymm9[7]
-; AVX512-NEXT: vmovdqa64 %ymm9, %ymm19
+; AVX512-NEXT: vmovdqa64 %ymm23, %ymm6
+; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm0[3],ymm6[4,5],ymm0[6],ymm6[7]
; AVX512-NEXT: vextracti128 $1, %ymm6, %xmm8
; AVX512-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm6[1],xmm8[2,3,4,5],xmm6[6],xmm8[7]
; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15]
; AVX512-NEXT: vpshufb %xmm6, %xmm8, %xmm8
; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm21[0,1,1,2]
+; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm19[0,1,1,2]
; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,1,0,3,4,5,4,7]
; AVX512-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7]
-; AVX512-NEXT: vmovdqa64 %xmm23, %xmm0
+; AVX512-NEXT: vmovdqa64 %xmm24, %xmm0
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[2,1,2,3]
; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,3,2,3,4,5,6,7]
; AVX512-NEXT: vinserti32x4 $2, %xmm9, %zmm8, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1],ymm13[2,3],ymm12[4,5],ymm13[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1],ymm14[2,3],ymm13[4,5],ymm14[6,7]
+; AVX512-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm9
; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4],ymm8[5,6,7,8,9,10,11],ymm9[12],ymm8[13,14,15]
; AVX512-NEXT: vpshufb %ymm7, %ymm8, %ymm7
-; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5,6,7]
-; AVX512-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0],ymm10[1],ymm12[2,3],ymm10[4],ymm12[5,6,7]
+; AVX512-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm9
; AVX512-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3,4,5],xmm9[6],xmm8[7]
; AVX512-NEXT: vpshufb %ymm5, %ymm8, %ymm5
; AVX512-NEXT: vpor %ymm7, %ymm5, %ymm5
; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa64 %ymm26, %ymm9
-; AVX512-NEXT: vmovdqa64 %ymm30, %ymm0
-; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1,2],ymm9[3],ymm0[4,5],ymm9[6],ymm0[7]
+; AVX512-NEXT: vmovdqa64 %ymm22, %ymm9
+; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2],ymm9[3],ymm11[4,5],ymm9[6],ymm11[7]
+; AVX512-NEXT: vmovdqa64 %ymm11, %ymm30
; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm7
; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3,4,5],xmm5[6],xmm7[7]
; AVX512-NEXT: vpshufb %xmm6, %xmm5, %xmm5
@@ -14173,68 +14187,71 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,1,0,3,4,5,4,7]
; AVX512-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512-NEXT: vmovdqa64 %xmm21, %xmm11
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3]
; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vmovdqa64 %xmm2, %xmm28
; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,1,2,3]
; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7]
; AVX512-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7]
; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm6
; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7]
-; AVX512-NEXT: vmovdqa64 %ymm22, %ymm0
-; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2],ymm0[3],ymm15[4,5],ymm0[6],ymm15[7]
+; AVX512-NEXT: vmovdqa64 %ymm26, %ymm0
+; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0,1,2],ymm15[3],ymm0[4,5],ymm15[6],ymm0[7]
; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm5[1,3,2,3]
; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX512-NEXT: vpshufb %ymm5, %ymm6, %ymm8
; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [6,7,4,5,2,3,0,1,14,15,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpshufb %ymm6, %ymm7, %ymm7
-; AVX512-NEXT: vpor %ymm7, %ymm8, %ymm1
-; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0],ymm10[1],ymm11[2,3,4],ymm10[5],ymm11[6,7]
+; AVX512-NEXT: vpor %ymm7, %ymm8, %ymm0
+; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0],ymm10[1],ymm12[2,3,4],ymm10[5],ymm12[6,7]
; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm8
; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2],xmm7[3],xmm8[4,5,6,7]
; AVX512-NEXT: vpshufb %ymm6, %ymm7, %ymm6
-; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm13[0,1,2],ymm14[3],ymm13[4,5],ymm14[6],ymm13[7]
; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,2,3]
; AVX512-NEXT: vpshufb %ymm5, %ymm7, %ymm5
; AVX512-NEXT: vpor %ymm5, %ymm6, %ymm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %ymm20, %ymm15
-; AVX512-NEXT: vmovdqa64 %ymm19, %ymm10
+; AVX512-NEXT: vmovdqa64 %ymm23, %ymm10
; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0],ymm10[1],ymm15[2,3,4],ymm10[5],ymm15[6,7]
; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm6
; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7]
-; AVX512-NEXT: vpermq {{.*#+}} ymm19 = ymm21[0,1,1,3]
+; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm19[0,1,1,3]
+; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15]
; AVX512-NEXT: vpshufb %xmm1, %xmm5, %xmm5
; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm19[0,1,2,1,4,5,6,5]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm0[0,1,2,1,4,5,6,5]
; AVX512-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
; AVX512-NEXT: vpbroadcastw 232(%rdi), %xmm6
-; AVX512-NEXT: vpsrlq $48, %xmm23, %xmm7
+; AVX512-NEXT: vpsrlq $48, %xmm24, %xmm7
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; AVX512-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa64 %ymm26, %ymm12
-; AVX512-NEXT: vmovdqa64 %ymm30, %ymm13
-; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0],ymm13[1],ymm12[2,3,4],ymm13[5],ymm12[6,7]
+; AVX512-NEXT: vmovdqa64 %ymm22, %ymm13
+; AVX512-NEXT: vmovdqa64 %ymm30, %ymm14
+; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7]
; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm6
; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7]
; AVX512-NEXT: vpshufb %xmm1, %xmm5, %xmm1
-; AVX512-NEXT: vpermq {{.*#+}} ymm16 = ymm16[0,1,1,3]
+; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm16[0,1,1,3]
+; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm16[0,1,2,1,4,5,6,5]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[0,1,2,1,4,5,6,5]
; AVX512-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7]
; AVX512-NEXT: vpbroadcastw 680(%rdi), %xmm5
-; AVX512-NEXT: vpsrlq $48, %xmm3, %xmm6
+; AVX512-NEXT: vmovdqa64 %xmm21, %xmm2
+; AVX512-NEXT: vpsrlq $48, %xmm21, %xmm6
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX512-NEXT: vinserti32x4 $2, %xmm5, %zmm1, %zmm0
-; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vinserti32x4 $2, %xmm5, %zmm1, %zmm27
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm10[2],ymm15[3,4],ymm10[5],ymm15[6,7]
+; AVX512-NEXT: vmovdqa64 %ymm20, %ymm11
; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm5
; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3],xmm1[4],xmm5[5,6,7]
; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3]
@@ -14247,13 +14264,13 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,7,6]
; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm8[5,6,7],ymm5[8,9,10,11,12],ymm8[13,14,15]
-; AVX512-NEXT: vmovdqa %xmm4, %xmm2
+; AVX512-NEXT: vmovdqa %xmm4, %xmm15
; AVX512-NEXT: vpsrld $16, %xmm4, %xmm8
-; AVX512-NEXT: vmovdqa64 %xmm23, %xmm4
+; AVX512-NEXT: vmovdqa64 %xmm24, %xmm4
; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
; AVX512-NEXT: vinserti32x4 $2, %xmm8, %zmm5, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1],ymm13[2],ymm12[3,4],ymm13[5],ymm12[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7]
; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm8
; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1],xmm5[2],xmm8[3],xmm5[4],xmm8[5,6,7]
; AVX512-NEXT: vpshufb %xmm1, %xmm5, %xmm1
@@ -14265,12 +14282,13 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,7,6]
; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5,6,7],ymm8[8,9,10,11,12],ymm9[13,14,15]
-; AVX512-NEXT: vmovdqa64 %xmm28, %xmm11
-; AVX512-NEXT: vpsrld $16, %xmm28, %xmm9
-; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7]
+; AVX512-NEXT: vmovdqa %xmm3, %xmm12
+; AVX512-NEXT: vpsrld $16, %xmm3, %xmm9
+; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7]
+; AVX512-NEXT: vmovdqa64 %xmm21, %xmm3
; AVX512-NEXT: vinserti32x4 $2, %xmm9, %zmm8, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1],ymm10[2,3],ymm15[4,5],ymm10[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm10[2,3],ymm11[4,5],ymm10[6,7]
; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm9
; AVX512-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4],xmm8[5],xmm9[6,7]
; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0],xmm7[1],xmm6[2,3,4,5,6,7]
@@ -14281,28 +14299,28 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,4,7]
; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7],ymm8[8,9,10,11,12],ymm7[13,14,15]
-; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm15[4],xmm4[4],xmm15[5],xmm4[5],xmm15[6],xmm4[6],xmm15[7],xmm4[7]
; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm7, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm14[3],ymm13[4,5],ymm14[6],ymm13[7]
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm4
; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7]
; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,7,7]
; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm18[0,1,2,1,4,5,6,5]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm25[0,1,2,1,4,5,6,5]
; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7]
; AVX512-NEXT: movw $992, %ax # imm = 0x3E0
; AVX512-NEXT: kmovw %eax, %k1
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 16-byte Folded Reload
+; AVX512-NEXT: vinserti32x4 $2, (%rsp), %zmm0, %zmm2 {%k1} # 16-byte Folded Reload
; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm13[2,3],ymm12[4,5],ymm13[6,7]
-; AVX512-NEXT: vmovdqa64 %ymm26, %ymm17
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm14[2,3],ymm13[4,5],ymm14[6,7]
+; AVX512-NEXT: vmovdqa64 %ymm22, %ymm19
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3],xmm2[4],xmm0[5],xmm2[6,7]
; AVX512-NEXT: vpshufb %xmm6, %xmm0, %xmm0
@@ -14312,20 +14330,21 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
-; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7]
+; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm12[4],xmm3[4],xmm12[5],xmm3[5],xmm12[6],xmm3[6],xmm12[7],xmm3[7]
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
; AVX512-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa 704(%rdi), %ymm7
-; AVX512-NEXT: vmovdqa 736(%rdi), %ymm5
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm7[2,3],ymm5[4,5],ymm7[6,7]
+; AVX512-NEXT: vmovdqa 704(%rdi), %ymm6
+; AVX512-NEXT: vmovdqa 736(%rdi), %ymm2
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm6[2,3],ymm2[4,5],ymm6[6,7]
+; AVX512-NEXT: vmovdqa64 %ymm2, %ymm22
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4],xmm0[5],xmm1[6,7]
; AVX512-NEXT: vmovdqa 800(%rdi), %ymm3
; AVX512-NEXT: vmovdqa 768(%rdi), %ymm4
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7]
-; AVX512-NEXT: vmovdqa64 %ymm3, %ymm20
+; AVX512-NEXT: vmovdqa64 %ymm3, %ymm17
; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7,8,9,10,11,12,13],ymm2[14],ymm1[15]
; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,1]
@@ -14334,150 +14353,150 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vpshufb %ymm1, %ymm2, %ymm2
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4,5,6],xmm2[7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-NEXT: vmovdqa 832(%rdi), %ymm3
-; AVX512-NEXT: vmovdqa 864(%rdi), %ymm8
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm3[2],ymm8[3,4],ymm3[5],ymm8[6,7]
-; AVX512-NEXT: vmovdqa64 %ymm3, %ymm21
-; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512-NEXT: vmovdqa 832(%rdi), %ymm9
+; AVX512-NEXT: vmovdqa 864(%rdi), %ymm7
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm9[2],ymm7[3,4],ymm9[5],ymm7[6,7]
+; AVX512-NEXT: vmovdqa64 %ymm7, %ymm16
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,10,11,8,9,4,5,8,9,10,11,12,13,14,15]
+; AVX512-NEXT: vpshufb %xmm7, %xmm2, %xmm3
+; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
-; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm10[3],ymm15[4,5],ymm10[6],ymm15[7]
-; AVX512-NEXT: vmovdqa64 %ymm15, %ymm22
-; AVX512-NEXT: vmovdqa64 %ymm10, %ymm23
+; AVX512-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7]
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7]
; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7]
; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm24[0,1,2,1,4,5,6,5]
+; AVX512-NEXT: vpshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
+; AVX512-NEXT: # ymm2 = mem[0,1,2,1,4,5,6,5]
; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 16-byte Folded Reload
; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa 256(%rdi), %ymm2
-; AVX512-NEXT: vmovdqa 288(%rdi), %ymm15
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm2[2,3],ymm15[4,5],ymm2[6,7]
-; AVX512-NEXT: vmovdqa64 %ymm2, %ymm25
+; AVX512-NEXT: vmovdqa 256(%rdi), %ymm3
+; AVX512-NEXT: vmovdqa 288(%rdi), %ymm14
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm3[2,3],ymm14[4,5],ymm3[6,7]
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3],xmm2[4],xmm0[5],xmm2[6,7]
-; AVX512-NEXT: vmovdqa 352(%rdi), %ymm14
-; AVX512-NEXT: vmovdqa 320(%rdi), %ymm10
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm14[3],ymm10[4,5],ymm14[6],ymm10[7]
-; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm0[2,3,0,1]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm13[6],ymm0[7,8,9,10,11,12,13],ymm13[14],ymm0[15]
+; AVX512-NEXT: vmovdqa 352(%rdi), %ymm15
+; AVX512-NEXT: vmovdqa 320(%rdi), %ymm5
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm15[3],ymm5[4,5],ymm15[6],ymm5[7]
+; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm0[2,3,0,1]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6],ymm0[7,8,9,10,11,12,13],ymm12[14],ymm0[15]
; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,2,2,1]
; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7]
; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3,4,5,6],xmm0[7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512-NEXT: vmovdqa 384(%rdi), %ymm1
-; AVX512-NEXT: vmovdqa 416(%rdi), %ymm3
-; AVX512-NEXT: vpblendd {{.*#+}} ymm13 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7]
-; AVX512-NEXT: vextracti128 $1, %ymm13, %xmm12
+; AVX512-NEXT: vmovdqa 416(%rdi), %ymm2
+; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
+; AVX512-NEXT: vpshufb %xmm7, %xmm12, %xmm7
+; AVX512-NEXT: vextracti128 $1, %ymm12, %xmm12
; AVX512-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,3,1,4,5,6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,1,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,0,2,4,5,6,7]
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
-; AVX512-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1],xmm7[2],xmm12[2],xmm7[3],xmm12[3]
+; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7]
; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vmovdqa64 %ymm25, %ymm2
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm2[3],ymm15[4,5],ymm2[6],ymm15[7]
-; AVX512-NEXT: vmovdqa64 %ymm15, %ymm24
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm12
-; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0,1,2,3],xmm0[4],xmm12[5],xmm0[6],xmm12[7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0],ymm10[1],ymm14[2,3],ymm10[4],ymm14[5,6,7]
-; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm12[2,3,0,1]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4,5,6],ymm13[7,8],ymm12[9,10,11,12,13,14],ymm13[15]
-; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm13 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13]
-; AVX512-NEXT: vpshufb %xmm13, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,0,1,14,15,12,13,4,5,4,5,4,5,4,5,18,19,16,17,30,31,28,29,20,21,20,21,20,21,20,21]
-; AVX512-NEXT: vpshufb %ymm9, %ymm12, %ymm12
-; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0,1,2],xmm0[3,4,5,6],xmm12[7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7]
-; AVX512-NEXT: vmovdqa %ymm1, %ymm15
-; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0,1],ymm1[2,3],ymm3[4,5],ymm1[6,7]
-; AVX512-NEXT: vextracti32x4 $1, %ymm12, %xmm25
-; AVX512-NEXT: vpshufd {{.*#+}} xmm11 = xmm25[2,1,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,0,2,4,5,6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,1,3,4,5,6,7]
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
-; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm3[3],ymm14[4,5],ymm3[6],ymm14[7]
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm7
+; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1,2,3],xmm0[4],xmm7[5],xmm0[6],xmm7[7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0],ymm5[1],ymm15[2,3],ymm5[4],ymm15[5,6,7]
+; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm7[2,3,0,1]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm12[0],ymm7[1,2,3,4,5,6],ymm12[7,8],ymm7[9,10,11,12,13,14],ymm12[15]
+; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm12 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13]
+; AVX512-NEXT: vpshufb %xmm12, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [2,3,0,1,14,15,12,13,4,5,4,5,4,5,4,5,18,19,16,17,30,31,28,29,20,21,20,21,20,21,20,21]
+; AVX512-NEXT: vpshufb %ymm8, %ymm7, %ymm7
+; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1,2],xmm0[3,4,5,6],xmm7[7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
+; AVX512-NEXT: vmovdqa64 %ymm1, %ymm18
+; AVX512-NEXT: vmovdqa64 %ymm2, %ymm21
+; AVX512-NEXT: vextracti32x4 $1, %ymm7, %xmm25
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,10,11,10,11,6,7,8,9,10,11,12,13,14,15]
+; AVX512-NEXT: vpshufb %xmm1, %xmm7, %xmm7
+; AVX512-NEXT: vpshufd {{.*#+}} xmm13 = xmm25[2,1,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,0,2,4,5,6,7]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3]
+; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7]
; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7]
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm11
-; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0,1,2,3],xmm0[4],xmm11[5],xmm0[6],xmm11[7]
-; AVX512-NEXT: vpshufb %xmm13, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa64 %ymm20, %ymm6
-; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6,7]
-; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1,2,3,4,5,6],ymm12[7,8],ymm11[9,10,11,12,13,14],ymm12[15]
-; AVX512-NEXT: vpshufb %ymm9, %ymm11, %ymm1
-; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4,5,6],xmm1[7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-NEXT: vmovdqa64 %ymm21, %ymm9
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7]
-; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm11
-; AVX512-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,0,2,4,5,6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3]
+; AVX512-NEXT: vmovdqa64 %ymm22, %ymm11
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm6[3],ymm11[4,5],ymm6[6],ymm11[7]
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm7
+; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1,2,3],xmm0[4],xmm7[5],xmm0[6],xmm7[7]
+; AVX512-NEXT: vpshufb %xmm12, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa64 %ymm17, %ymm10
+; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0],ymm4[1],ymm10[2,3],ymm4[4],ymm10[5,6,7]
+; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm7[2,3,0,1]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm12[0],ymm7[1,2,3,4,5,6],ymm12[7,8],ymm7[9,10,11,12,13,14],ymm12[15]
+; AVX512-NEXT: vpshufb %ymm8, %ymm7, %ymm2
+; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4,5,6],xmm2[7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-NEXT: vmovdqa64 %ymm16, %ymm8
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7]
+; AVX512-NEXT: vpshufb %xmm1, %xmm2, %xmm1
+; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2
+; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0],ymm10[1],ymm14[2,3,4],ymm10[5],ymm14[6,7]
-; AVX512-NEXT: vmovdqa64 %ymm14, %ymm21
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm5[1],ymm15[2,3,4],ymm5[5],ymm15[6,7]
+; AVX512-NEXT: vmovdqa64 %ymm5, %ymm22
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,u,u,0,1,14,15,8,9,10,11,4,5,6,7,20,21,u,u,16,17,30,31,24,25,26,27,20,21,22,23]
-; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm11
+; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm2
; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[2,3,4,5,6,7,8],ymm0[9],ymm11[10,11,12,13,14,15]
-; AVX512-NEXT: vmovdqa64 %ymm24, %ymm14
-; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm14[2],ymm2[3,4,5],ymm14[6],ymm2[7]
-; AVX512-NEXT: vextracti128 $1, %ymm11, %xmm12
-; AVX512-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm12[4],xmm11[5],xmm12[6],xmm11[7]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpshufb %ymm12, %ymm11, %ymm11
-; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm20 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535]
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm0 & ~ymm20)
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm15[3],ymm3[4,5],ymm15[6],ymm3[7]
-; AVX512-NEXT: vmovdqa %ymm15, %ymm13
-; AVX512-NEXT: vextracti32x4 $1, %ymm0, %xmm25
+; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4,5,6,7,8],ymm0[9],ymm2[10,11,12,13,14,15]
+; AVX512-NEXT: vmovdqa %ymm3, %ymm5
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm14[2],ymm3[3,4,5],ymm14[6],ymm3[7]
+; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm7
+; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm7[4],xmm2[5],xmm7[6],xmm2[7]
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpshufb %ymm7, %ymm2, %ymm2
+; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm25 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535]
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm0 & ~ymm25)
+; AVX512-NEXT: vmovdqa64 %ymm18, %ymm12
+; AVX512-NEXT: vmovdqa64 %ymm21, %ymm3
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm12[3],ymm3[4,5],ymm12[6],ymm3[7]
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm13
; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm15 = xmm25[0,1,2,1]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,7]
-; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,1]
+; AVX512-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,5,7]
+; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7]
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm4[1],ymm10[2,3,4],ymm4[5],ymm10[6,7]
+; AVX512-NEXT: vmovdqa64 %ymm17, %ymm16
+; AVX512-NEXT: vmovdqa %ymm4, %ymm10
; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm1
; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4,5,6,7,8],ymm0[9],ymm1[10,11,12,13,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm5[2],ymm7[3,4,5],ymm5[6],ymm7[7]
-; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm11
-; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm11[4],xmm1[5],xmm11[6],xmm1[7]
-; AVX512-NEXT: vpshufb %ymm12, %ymm1, %ymm1
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm0 & ~ymm20)
+; AVX512-NEXT: vmovdqa %ymm11, %ymm4
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm11[2],ymm6[3,4,5],ymm11[6],ymm6[7]
+; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7]
+; AVX512-NEXT: vpshufb %ymm7, %ymm1, %ymm1
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm0 & ~ymm25)
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7]
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm11
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,2,1]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,5,5,7]
-; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
+; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,7]
+; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vmovdqa64 %ymm22, %ymm0
+; AVX512-NEXT: vmovdqa64 %ymm20, %ymm0
; AVX512-NEXT: vmovdqa64 %ymm23, %ymm1
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7]
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
@@ -14485,39 +14504,40 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7]
; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7]
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm19[0,1,2,0,4,5,6,4]
+; AVX512-NEXT: vpshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
+; AVX512-NEXT: # ymm1 = mem[0,1,2,0,4,5,6,4]
; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
; AVX512-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX512-NEXT: # xmm1 = mem[0,1,2,3,6,5,6,7]
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
-; AVX512-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0
-; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa %ymm2, %ymm15
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm14[3],ymm2[4,5],ymm14[6],ymm2[7]
+; AVX512-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm31
+; AVX512-NEXT: vmovdqa %ymm14, %ymm13
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm14[3],ymm5[4,5],ymm14[6],ymm5[7]
+; AVX512-NEXT: vmovdqa64 %ymm5, %ymm18
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm3[2],ymm13[3,4,5],ymm3[6],ymm13[7]
-; AVX512-NEXT: vmovdqa64 %ymm3, %ymm19
-; AVX512-NEXT: vmovdqa64 %ymm13, %ymm26
-; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm11
-; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm11[4],xmm1[5],xmm11[6],xmm1[7]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,2,3,0,1,14,15,12,13,10,11,128,128]
-; AVX512-NEXT: vpshufb %xmm11, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa %ymm10, %ymm2
-; AVX512-NEXT: vmovdqa64 %ymm21, %ymm10
-; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm10[0,1],ymm2[2],ymm10[3,4],ymm2[5],ymm10[6,7]
-; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm12[1,1,2,0]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128]
-; AVX512-NEXT: vpshufb %ymm13, %ymm12, %ymm12
-; AVX512-NEXT: vpor %ymm0, %ymm12, %ymm0
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm3[2],ymm12[3,4,5],ymm3[6],ymm12[7]
+; AVX512-NEXT: vmovdqa64 %ymm21, %ymm17
+; AVX512-NEXT: vmovdqa %ymm12, %ymm14
+; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,2,3,0,1,14,15,12,13,10,11,128,128]
+; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa64 %ymm22, %ymm11
+; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0,1],ymm11[2],ymm15[3,4],ymm11[5],ymm15[6,7]
+; AVX512-NEXT: vmovdqa64 %ymm15, %ymm21
+; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,1,2,0]
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128]
+; AVX512-NEXT: vpshufb %ymm3, %ymm7, %ymm7
+; AVX512-NEXT: vpor %ymm7, %ymm0, %ymm0
; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0,1,2],ymm1[3,4,5,6,7],ymm12[8,9,10],ymm1[11,12,13,14,15]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3,4,5,6,7],ymm7[8,9,10],ymm1[11,12,13,14,15]
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vmovdqa64 %ymm17, %ymm0
+; AVX512-NEXT: vmovdqa64 %ymm19, %ymm0
; AVX512-NEXT: vmovdqa64 %ymm30, %ymm1
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7]
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
@@ -14525,164 +14545,162 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7]
; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7]
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm16[0,1,2,0,4,5,6,4]
+; AVX512-NEXT: vpshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
+; AVX512-NEXT: # ymm1 = mem[0,1,2,0,4,5,6,4]
; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
; AVX512-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX512-NEXT: # xmm1 = mem[0,1,2,3,6,5,6,7]
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
; AVX512-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm25
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm5[3],ymm7[4,5],ymm5[6],ymm7[7]
-; AVX512-NEXT: vmovdqa64 %ymm5, %ymm24
-; AVX512-NEXT: vmovdqa64 %ymm7, %ymm23
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7]
+; AVX512-NEXT: vmovdqa64 %ymm6, %ymm23
+; AVX512-NEXT: vmovdqa64 %ymm4, %ymm24
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7]
-; AVX512-NEXT: vpshufb %xmm11, %xmm0, %xmm0
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7]
-; AVX512-NEXT: vmovdqa64 %ymm6, %ymm21
-; AVX512-NEXT: vmovdqa64 %ymm4, %ymm22
+; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa64 %ymm16, %ymm15
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm10[2],ymm15[3,4],ymm10[5],ymm15[6,7]
+; AVX512-NEXT: vmovdqa64 %ymm10, %ymm22
; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,2,0]
-; AVX512-NEXT: vpshufb %ymm13, %ymm1, %ymm1
-; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm9[0,1],ymm8[2],ymm9[3,4,5],ymm8[6],ymm9[7]
-; AVX512-NEXT: vmovdqa %ymm8, %ymm5
-; AVX512-NEXT: vmovdqa %ymm9, %ymm4
-; AVX512-NEXT: vextracti128 $1, %ymm11, %xmm12
-; AVX512-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm12[4],xmm11[5],xmm12[6],xmm11[7]
+; AVX512-NEXT: vpshufb %ymm3, %ymm1, %ymm1
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm8[2],ymm9[3,4,5],ymm8[6],ymm9[7]
+; AVX512-NEXT: vmovdqa %ymm9, %ymm5
+; AVX512-NEXT: vmovdqa %ymm8, %ymm4
+; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm7
+; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm7[4],xmm2[5],xmm7[6],xmm2[7]
; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,0,0,0,4,5,6,7]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,6,5,4]
-; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm11[3,4,5,6,7],ymm1[8,9,10],ymm11[11,12,13,14,15]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
+; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
+; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15]
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload
-; AVX512-NEXT: vmovdqa64 %ymm16, %ymm0
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5],ymm1[6],ymm0[7]
-; AVX512-NEXT: vmovdqa64 %ymm1, %ymm31
+; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2],ymm2[3,4,5],ymm1[6],ymm2[7]
+; AVX512-NEXT: vmovdqa64 %ymm2, %ymm29
+; AVX512-NEXT: vmovdqa64 %ymm1, %ymm30
; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [8,9,8,9,8,9,8,9,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27]
-; AVX512-NEXT: vpshufb %ymm3, %ymm0, %ymm11
+; AVX512-NEXT: vpshufb %ymm3, %ymm0, %ymm2
; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6],ymm11[7,8,9,10,11,12,13],ymm0[14],ymm11[15]
-; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Reload
-; AVX512-NEXT: vmovdqa64 %ymm20, %ymm1
-; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1],ymm1[2],ymm6[3,4],ymm1[5],ymm6[6,7]
-; AVX512-NEXT: vmovdqa64 %ymm6, %ymm28
-; AVX512-NEXT: vextracti128 $1, %ymm11, %xmm12
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,1,2,3,4,5,6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,2,2,3,4,5,6,7]
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm0[2,3,4,5,6,7]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6],ymm2[7,8,9,10,11,12,13],ymm0[14],ymm2[15]
+; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload
+; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Reload
+; AVX512-NEXT: vmovdqa64 %ymm16, %ymm1
+; AVX512-NEXT: vmovdqa64 %ymm19, %ymm2
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
+; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm7
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa64 %ymm26, %ymm7
-; AVX512-NEXT: vmovdqa64 %ymm19, %ymm8
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7]
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm11
-; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3,4,5],xmm0[6],xmm11[7]
-; AVX512-NEXT: vmovdqa %ymm10, %ymm9
-; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm10[0,1],ymm2[2,3],ymm10[4,5],ymm2[6,7]
-; AVX512-NEXT: vmovdqa64 %ymm2, %ymm17
-; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm10[0,1,0,1]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm13[3],ymm11[4,5,6,7,8,9,10],ymm13[11],ymm11[12,13,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0],ymm15[1],ymm14[2,3],ymm15[4],ymm14[5,6,7]
-; AVX512-NEXT: vmovdqa64 %ymm15, %ymm26
-; AVX512-NEXT: vmovdqa64 %ymm14, %ymm27
-; AVX512-NEXT: vextracti128 $1, %ymm13, %xmm15
-; AVX512-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm15[1],xmm13[2,3,4,5],xmm15[6],xmm13[7]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11]
-; AVX512-NEXT: vpshufb %xmm6, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa64 %ymm17, %ymm8
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm8[3],ymm12[4,5],ymm8[6],ymm12[7]
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7]
+; AVX512-NEXT: vmovdqa64 %ymm21, %ymm9
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm11[2,3],ymm9[4,5],ymm11[6,7]
+; AVX512-NEXT: vmovdqa %ymm11, %ymm12
+; AVX512-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm21[0,1,0,1]
+; AVX512-NEXT: vmovdqu64 %ymm21, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm7[3],ymm2[4,5,6,7,8,9,10],ymm7[11],ymm2[12,13,14,15]
+; AVX512-NEXT: vmovdqa64 %ymm18, %ymm10
+; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm13[0],ymm10[1],ymm13[2,3],ymm10[4],ymm13[5,6,7]
+; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm11
+; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm11[1],xmm7[2,3,4,5],xmm11[6],xmm7[7]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11]
+; AVX512-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,10,11,24,25,22,23,20,21,128,128,128,128,128,128,128,128,128,128]
-; AVX512-NEXT: vpshufb %ymm1, %ymm11, %ymm11
-; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7],ymm11[8,9,10],ymm0[11,12,13,14,15]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,2,3,0,1,14,15,12,13,128,128]
-; AVX512-NEXT: vpshufb %xmm2, %xmm13, %xmm13
-; AVX512-NEXT: vpor %ymm11, %ymm13, %ymm11
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,10,11,24,25,22,23,20,21,128,128,128,128,128,128,128,128,128,128]
+; AVX512-NEXT: vpshufb %ymm6, %ymm2, %ymm2
+; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,4,5,2,3,0,1,14,15,12,13,128,128]
+; AVX512-NEXT: vpshufb %xmm11, %xmm7, %xmm7
+; AVX512-NEXT: vmovdqa64 %xmm11, %xmm26
+; AVX512-NEXT: vpor %ymm2, %ymm7, %ymm2
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm10[2],ymm15[3,4,5],ymm10[6],ymm15[7]
-; AVX512-NEXT: vmovdqa64 %ymm10, %ymm29
-; AVX512-NEXT: vpshufb %ymm3, %ymm0, %ymm3
+; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm2[2],ymm11[3,4,5],ymm2[6],ymm11[7]
+; AVX512-NEXT: vmovdqa64 %ymm2, %ymm28
+; AVX512-NEXT: vpshufb %ymm3, %ymm0, %ymm2
; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6],ymm3[7,8,9,10,11,12,13],ymm0[14],ymm3[15]
-; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1],ymm10[2],ymm13[3,4],ymm10[5],ymm13[6,7]
-; AVX512-NEXT: vmovdqa64 %ymm10, %ymm30
-; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm11
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7]
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm19 & (zmm3 ^ zmm0))
-; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7]
-; AVX512-NEXT: vmovdqa %ymm4, %ymm14
-; AVX512-NEXT: vmovdqa64 %ymm5, %ymm18
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5],xmm0[6],xmm3[7]
-; AVX512-NEXT: vpshufb %xmm6, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa64 %ymm21, %ymm12
+; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6],ymm2[7,8,9,10,11,12,13],ymm0[14],ymm2[15]
+; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Reload
+; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Reload
+; AVX512-NEXT: vmovdqa64 %ymm17, %ymm2
+; AVX512-NEXT: vmovdqa64 %ymm18, %ymm3
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
+; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm27 = zmm27 ^ (zmm21 & (zmm27 ^ zmm0))
+; AVX512-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7]
+; AVX512-NEXT: vmovdqa64 %ymm4, %ymm20
+; AVX512-NEXT: vmovdqa64 %ymm5, %ymm27
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7]
+; AVX512-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovdqa64 %ymm22, %ymm4
-; AVX512-NEXT: vmovdqu64 %ymm22, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1],ymm4[2,3],ymm12[4,5],ymm4[6,7]
-; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm21[0,1,0,1]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm11[3],ymm3[4,5,6,7,8,9,10],ymm11[11],ymm3[12,13,14,15]
-; AVX512-NEXT: vpshufb %ymm1, %ymm3, %ymm1
-; AVX512-NEXT: vmovdqa64 %ymm24, %ymm6
-; AVX512-NEXT: vmovdqa64 %ymm23, %ymm5
-; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7]
-; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm11
-; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm11[1],xmm3[2,3,4,5],xmm11[6],xmm3[7]
-; AVX512-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm4[2,3],ymm15[4,5],ymm4[6,7]
+; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm15[0,1,0,1]
+; AVX512-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6,7,8,9,10],ymm3[11],ymm2[12,13,14,15]
+; AVX512-NEXT: vpshufb %ymm6, %ymm2, %ymm2
+; AVX512-NEXT: vmovdqa64 %ymm23, %ymm6
+; AVX512-NEXT: vmovdqa64 %ymm24, %ymm5
+; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm6[1],ymm5[2,3],ymm6[4],ymm5[5,6,7]
+; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm7
+; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm7[1],xmm3[2,3,4,5],xmm7[6],xmm3[7]
+; AVX512-NEXT: vmovdqa64 %xmm26, %xmm1
+; AVX512-NEXT: vpshufb %xmm1, %xmm3, %xmm1
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
-; AVX512-NEXT: vpor %ymm1, %ymm2, %ymm1
+; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
+; AVX512-NEXT: vpor %ymm2, %ymm1, %ymm1
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
-; AVX512-NEXT: vmovdqa64 %ymm17, %ymm0
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3],ymm9[4,5],ymm0[6],ymm9[7]
-; AVX512-NEXT: vmovdqa64 %ymm9, %ymm24
-; AVX512-NEXT: vmovdqa64 %ymm17, %ymm23
+; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm12[3],ymm9[4,5],ymm12[6],ymm9[7]
; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6,7]
-; AVX512-NEXT: vmovdqa64 %ymm8, %ymm17
-; AVX512-NEXT: vmovdqa64 %ymm7, %ymm21
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0],ymm14[1],ymm8[2,3],ymm14[4],ymm8[5,6,7]
+; AVX512-NEXT: vmovdqa64 %ymm14, %ymm22
+; AVX512-NEXT: vmovdqa64 %ymm8, %ymm26
; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5],xmm2[6],xmm1[7]
-; AVX512-NEXT: vmovdqa64 %ymm26, %ymm2
-; AVX512-NEXT: vmovdqa64 %ymm27, %ymm3
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0],ymm10[1],ymm13[2,3,4],ymm10[5],ymm13[6,7]
+; AVX512-NEXT: vmovdqa64 %ymm13, %ymm23
+; AVX512-NEXT: vmovdqa64 %ymm10, %ymm24
; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7]
; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,12,13,26,27,24,25,22,23,128,128,128,128,128,128,128,128,128,128]
; AVX512-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13]
-; AVX512-NEXT: vpshufb %xmm11, %xmm1, %xmm1
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13]
+; AVX512-NEXT: vpshufb %xmm7, %xmm1, %xmm1
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,6,7,4,5,2,3,0,1,14,15,128,128]
-; AVX512-NEXT: vpshufb %xmm7, %xmm2, %xmm2
-; AVX512-NEXT: vmovdqa64 %xmm7, %xmm22
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,6,7,4,5,2,3,0,1,14,15,128,128]
+; AVX512-NEXT: vpshufb %xmm13, %xmm2, %xmm2
; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vmovdqa64 %ymm31, %ymm10
-; AVX512-NEXT: vmovdqa64 %ymm16, %ymm8
+; AVX512-NEXT: vmovdqa64 %ymm30, %ymm10
+; AVX512-NEXT: vmovdqa64 %ymm29, %ymm8
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm10[3],ymm8[4,5],ymm10[6],ymm8[7]
; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15]
-; AVX512-NEXT: vmovdqa64 %ymm28, %ymm9
-; AVX512-NEXT: vmovdqa64 %ymm20, %ymm7
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm7[2,3],ymm9[4,5],ymm7[6,7]
+; AVX512-NEXT: vmovdqa64 %ymm16, %ymm9
+; AVX512-NEXT: vmovdqa64 %ymm19, %ymm14
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm14[2,3],ymm9[4,5],ymm14[6,7]
; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
@@ -14691,40 +14709,38 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [10,11,10,11,10,11,10,11,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29]
; AVX512-NEXT: vpshufb %ymm2, %ymm0, %ymm0
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm4[3],ymm12[4,5],ymm4[6],ymm12[7]
-; AVX512-NEXT: vmovdqa64 %ymm12, %ymm20
+; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm4[3],ymm15[4,5],ymm4[6],ymm15[7]
+; AVX512-NEXT: vmovdqa64 %ymm4, %ymm19
; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15]
; AVX512-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX512-NEXT: vmovdqa64 %ymm18, %ymm12
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0],ymm14[1],ymm12[2,3],ymm14[4],ymm12[5,6,7]
+; AVX512-NEXT: vmovdqa64 %ymm20, %ymm15
+; AVX512-NEXT: vmovdqa64 %ymm27, %ymm1
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0],ymm1[1],ymm15[2,3],ymm1[4],ymm15[5,6,7]
; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm3
; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3,4,5],xmm3[6],xmm1[7]
-; AVX512-NEXT: vpshufb %xmm11, %xmm1, %xmm1
-; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
-; AVX512-NEXT: vmovdqa64 %ymm5, %ymm28
-; AVX512-NEXT: vmovdqa64 %ymm6, %ymm31
-; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm11
-; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0],xmm3[1],xmm11[2],xmm3[3],xmm11[4,5,6,7]
-; AVX512-NEXT: vmovdqa64 %xmm22, %xmm4
-; AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX512-NEXT: vpshufb %xmm7, %xmm1, %xmm1
+; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7]
+; AVX512-NEXT: vmovdqa64 %ymm5, %ymm29
+; AVX512-NEXT: vmovdqa64 %ymm6, %ymm30
+; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm7
+; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0],xmm3[1],xmm7[2],xmm3[3],xmm7[4,5,6,7]
+; AVX512-NEXT: vpshufb %xmm13, %xmm3, %xmm3
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
; AVX512-NEXT: vpor %ymm0, %ymm3, %ymm0
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-NEXT: vmovdqa64 %ymm0, %ymm18
-; AVX512-NEXT: vmovdqa64 %ymm29, %ymm1
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm1[3],ymm15[4,5],ymm1[6],ymm15[7]
-; AVX512-NEXT: vmovdqa64 %ymm29, %ymm16
+; AVX512-NEXT: vmovdqa64 %ymm0, %ymm20
+; AVX512-NEXT: vmovdqa64 %ymm28, %ymm13
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm13[3],ymm11[4,5],ymm13[6],ymm11[7]
+; AVX512-NEXT: vmovdqa64 %ymm11, %ymm28
; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7,8],ymm0[9,10,11,12,13,14],ymm1[15]
; AVX512-NEXT: vpshufb %ymm2, %ymm0, %ymm0
-; AVX512-NEXT: vmovdqa64 %ymm30, %ymm2
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm2[2,3],ymm13[4,5],ymm2[6,7]
-; AVX512-NEXT: vmovdqa64 %ymm13, %ymm29
-; AVX512-NEXT: vmovdqa64 %ymm30, %ymm13
+; AVX512-NEXT: vmovdqa64 %ymm17, %ymm7
+; AVX512-NEXT: vmovdqa64 %ymm18, %ymm3
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm7[2,3],ymm3[4,5],ymm7[6,7]
; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
@@ -14732,93 +14748,93 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm30 = zmm30 ^ (zmm19 & (zmm30 ^ zmm0))
+; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm21 & (zmm16 ^ zmm0))
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm8[1],ymm10[2,3],ymm8[4],ymm10[5,6,7]
; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2],ymm14[3],ymm9[4,5],ymm14[6],ymm9[7]
; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7]
; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,7,6,7]
; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [12,13,12,13,12,13,12,13,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31]
-; AVX512-NEXT: vpshufb %ymm9, %ymm0, %ymm0
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [12,13,12,13,12,13,12,13,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31]
+; AVX512-NEXT: vpshufb %ymm10, %ymm0, %ymm0
; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX512-NEXT: vmovdqa64 %ymm17, %ymm0
-; AVX512-NEXT: vmovdqa64 %ymm21, %ymm1
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
+; AVX512-NEXT: vmovdqa64 %ymm22, %ymm0
+; AVX512-NEXT: vmovdqa64 %ymm26, %ymm1
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5,6,7]
-; AVX512-NEXT: vmovdqa64 %ymm26, %ymm1
-; AVX512-NEXT: vmovdqa64 %ymm27, %ymm2
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
+; AVX512-NEXT: vmovdqa64 %ymm23, %ymm1
+; AVX512-NEXT: vmovdqa64 %ymm24, %ymm2
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,1,3,4,5,6,7]
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 32-byte Folded Reload
-; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 32-byte Folded Reload
+; AVX512-NEXT: vinserti64x4 $1, (%rsp), %zmm0, %zmm1 # 32-byte Folded Reload
+; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm26 # 32-byte Folded Reload
; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 32-byte Folded Reload
; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 32-byte Folded Reload
-; AVX512-NEXT: vmovdqa64 %ymm24, %ymm5
-; AVX512-NEXT: vmovdqa64 %ymm23, %ymm7
-; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1],ymm5[2],ymm7[3,4,5],ymm5[6],ymm7[7]
-; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm5[0,1,3,1]
+; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX512-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX512-NEXT: # ymm5 = ymm5[0,1],mem[2],ymm5[3,4,5],mem[6],ymm5[7]
+; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm5[0,1,3,1]
; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31]
-; AVX512-NEXT: vpshufb %ymm5, %ymm7, %ymm7
-; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1,2],ymm7[3,4,5,6,7]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15]
-; AVX512-NEXT: vpshufb %xmm7, %xmm0, %xmm0
+; AVX512-NEXT: vpshufb %ymm5, %ymm8, %ymm8
+; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0],ymm6[1,2],ymm8[3,4,5,6,7]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15]
+; AVX512-NEXT: vpshufb %xmm8, %xmm0, %xmm0
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7],ymm6[8,9,10],ymm0[11,12,13,14,15]
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT: vmovdqa64 %ymm16, %ymm6
-; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm15[1],ymm6[2,3],ymm15[4],ymm6[5,6,7]
-; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[2,3,4,5,6,7,8],ymm8[9],ymm6[10,11,12,13,14,15]
-; AVX512-NEXT: vpshufb %ymm9, %ymm6, %ymm6
-; AVX512-NEXT: vmovdqa64 %ymm29, %ymm8
-; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm13[3],ymm8[4,5],ymm13[6],ymm8[7]
-; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm9
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,4,6,7]
+; AVX512-NEXT: vmovdqa64 %ymm28, %ymm6
+; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0],ymm6[1],ymm13[2,3],ymm6[4],ymm13[5,6,7]
+; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm6[2,3,0,1]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm9[1],ymm6[2,3,4,5,6,7,8],ymm9[9],ymm6[10,11,12,13,14,15]
+; AVX512-NEXT: vpshufb %ymm10, %ymm6, %ymm6
+; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1,2],ymm7[3],ymm3[4,5],ymm7[6],ymm3[7]
+; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm9
+; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,4,6,7]
; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1]
; AVX512-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,7,6,7]
-; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3,4,5,6,7]
+; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7]
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (zmm19 & (zmm13 ^ zmm6))
-; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7]
-; AVX512-NEXT: vextracti128 $1, %ymm6, %xmm8
-; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0],xmm6[1],xmm8[2],xmm6[3],xmm8[4,5,6,7]
-; AVX512-NEXT: vpshufb %xmm7, %xmm6, %xmm6
-; AVX512-NEXT: vmovdqa64 %ymm28, %ymm7
-; AVX512-NEXT: vmovdqa64 %ymm31, %ymm8
-; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7]
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (zmm21 & (zmm13 ^ zmm6))
+; AVX512-NEXT: vmovdqa64 %ymm27, %ymm3
+; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0],ymm3[1],ymm15[2,3,4],ymm3[5],ymm15[6,7]
+; AVX512-NEXT: vextracti128 $1, %ymm6, %xmm7
+; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3],xmm7[4,5,6,7]
+; AVX512-NEXT: vpshufb %xmm8, %xmm6, %xmm6
+; AVX512-NEXT: vmovdqa64 %ymm29, %ymm3
+; AVX512-NEXT: vmovdqa64 %ymm30, %ymm7
+; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1],ymm7[2],ymm3[3,4],ymm7[5],ymm3[6,7]
; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm8
; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,3,1,3,4,5,6,7]
; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3]
; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,0,2,3,4,5,6,7]
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX512-NEXT: vmovdqa64 %ymm20, %ymm8
-; AVX512-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
-; AVX512-NEXT: # ymm8 = mem[0,1],ymm8[2],mem[3,4,5],ymm8[6],mem[7]
+; AVX512-NEXT: vmovdqa64 %ymm19, %ymm8
+; AVX512-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
+; AVX512-NEXT: # ymm8 = ymm8[0,1],mem[2],ymm8[3,4,5],mem[6],ymm8[7]
; AVX512-NEXT: vpmovsxdq {{.*#+}} zmm9 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615]
+; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm3 # 64-byte Folded Reload
+; AVX512-NEXT: # zmm3 = mem ^ (zmm9 & (zmm3 ^ mem))
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm10 # 64-byte Folded Reload
; AVX512-NEXT: # zmm10 = mem ^ (zmm9 & (zmm10 ^ mem))
-; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm12 # 64-byte Folded Reload
-; AVX512-NEXT: # zmm12 = mem ^ (zmm9 & (zmm12 ^ mem))
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm1 # 64-byte Folded Reload
; AVX512-NEXT: # zmm1 = zmm1 ^ (zmm9 & (zmm1 ^ mem))
-; AVX512-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm3 # 64-byte Folded Reload
-; AVX512-NEXT: # zmm3 = zmm3 ^ (zmm9 & (zmm3 ^ mem))
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm9 & (zmm2 ^ zmm10))
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm9 & (zmm4 ^ zmm12))
+; AVX512-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm26 # 64-byte Folded Reload
+; AVX512-NEXT: # zmm26 = zmm26 ^ (zmm9 & (zmm26 ^ mem))
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm9 & (zmm2 ^ zmm3))
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm9 & (zmm4 ^ zmm10))
; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,3,1]
; AVX512-NEXT: vpshufb %ymm5, %ymm8, %ymm5
; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm7[1,2],ymm5[3,4,5,6,7]
@@ -14830,55 +14846,53 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 32-byte Folded Reload
; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 32-byte Folded Reload
; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 32-byte Folded Reload
-; AVX512-NEXT: vinserti64x4 $1, (%rsp), %zmm0, %zmm14 # 32-byte Folded Reload
+; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 32-byte Folded Reload
; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 32-byte Folded Reload
-; AVX512-NEXT: vinserti64x4 $1, %ymm18, %zmm0, %zmm23
+; AVX512-NEXT: vinserti64x4 $1, %ymm20, %zmm0, %zmm23
; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5
-; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm12 # 64-byte Folded Reload
-; AVX512-NEXT: # zmm12 = zmm12 ^ (zmm19 & (zmm12 ^ mem))
+; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm3 # 64-byte Folded Reload
+; AVX512-NEXT: # zmm3 = zmm3 ^ (zmm21 & (zmm3 ^ mem))
; AVX512-NEXT: movw $-512, %ax # imm = 0xFE00
; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vmovdqa32 %zmm6, %zmm12 {%k1}
-; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm6 # 64-byte Folded Reload
-; AVX512-NEXT: # zmm6 = zmm6 ^ (zmm19 & (zmm6 ^ mem))
-; AVX512-NEXT: vmovdqa32 %zmm7, %zmm6 {%k1}
-; AVX512-NEXT: vmovdqa64 %zmm6, %zmm7
-; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm6 # 64-byte Folded Reload
-; AVX512-NEXT: # zmm6 = zmm6 ^ (zmm19 & (zmm6 ^ mem))
-; AVX512-NEXT: vmovdqa32 %zmm8, %zmm6 {%k1}
-; AVX512-NEXT: vmovdqa64 %zmm6, %zmm8
-; AVX512-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm25 # 64-byte Folded Reload
-; AVX512-NEXT: # zmm25 = zmm25 ^ (zmm19 & (zmm25 ^ mem))
+; AVX512-NEXT: vmovdqa32 %zmm6, %zmm3 {%k1}
+; AVX512-NEXT: vmovdqa64 %zmm3, %zmm6
+; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm3 # 64-byte Folded Reload
+; AVX512-NEXT: # zmm3 = zmm3 ^ (zmm21 & (zmm3 ^ mem))
+; AVX512-NEXT: vmovdqa32 %zmm7, %zmm3 {%k1}
+; AVX512-NEXT: vmovdqa64 %zmm3, %zmm7
+; AVX512-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm31 # 64-byte Folded Reload
+; AVX512-NEXT: # zmm31 = zmm31 ^ (zmm21 & (zmm31 ^ mem))
+; AVX512-NEXT: vmovdqa32 %zmm8, %zmm31 {%k1}
+; AVX512-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm25 # 64-byte Folded Reload
+; AVX512-NEXT: # zmm25 = zmm25 ^ (zmm21 & (zmm25 ^ mem))
; AVX512-NEXT: vmovdqa32 %zmm9, %zmm25 {%k1}
-; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm6 # 64-byte Folded Reload
-; AVX512-NEXT: # zmm6 = mem ^ (zmm19 & (zmm6 ^ mem))
-; AVX512-NEXT: vmovdqa32 %zmm10, %zmm6 {%k1}
-; AVX512-NEXT: vmovdqa64 %zmm3, (%rsi)
+; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm3 # 64-byte Folded Reload
+; AVX512-NEXT: # zmm3 = mem ^ (zmm21 & (zmm3 ^ mem))
+; AVX512-NEXT: vmovdqa32 %zmm10, %zmm3 {%k1}
+; AVX512-NEXT: vmovdqa64 %zmm26, (%rsi)
; AVX512-NEXT: vmovdqa64 %zmm1, 64(%rsi)
; AVX512-NEXT: vmovdqa64 %zmm4, 64(%rdx)
; AVX512-NEXT: vmovdqa64 %zmm2, (%rdx)
; AVX512-NEXT: vmovdqa64 %zmm7, 64(%rcx)
-; AVX512-NEXT: vmovdqa64 %zmm12, (%rcx)
+; AVX512-NEXT: vmovdqa64 %zmm6, (%rcx)
; AVX512-NEXT: vmovdqa64 %zmm25, 64(%r8)
-; AVX512-NEXT: vmovdqa64 %zmm8, (%r8)
-; AVX512-NEXT: vmovdqa64 %zmm6, (%r9)
-; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-NEXT: vmovdqa32 %zmm14, %zmm2 {%k1}
+; AVX512-NEXT: vmovdqa64 %zmm31, (%r8)
+; AVX512-NEXT: vmovdqa64 %zmm3, (%r9)
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm1 # 64-byte Folded Reload
-; AVX512-NEXT: # zmm1 = mem ^ (zmm19 & (zmm1 ^ mem))
-; AVX512-NEXT: vmovdqa32 %zmm15, %zmm1 {%k1}
-; AVX512-NEXT: vmovdqa64 %zmm2, 64(%r9)
+; AVX512-NEXT: vmovdqa32 %zmm14, %zmm1 {%k1}
+; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm12 # 64-byte Folded Reload
+; AVX512-NEXT: # zmm12 = mem ^ (zmm21 & (zmm12 ^ mem))
+; AVX512-NEXT: vmovdqa32 %zmm15, %zmm12 {%k1}
+; AVX512-NEXT: vmovdqa64 %zmm1, 64(%r9)
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: vmovdqa64 %zmm1, (%rax)
-; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm19, %zmm11 # 64-byte Folded Reload
-; AVX512-NEXT: # zmm11 = mem ^ (zmm19 & (zmm11 ^ mem))
-; AVX512-NEXT: vmovdqa64 %zmm30, %zmm1
+; AVX512-NEXT: vmovdqa64 %zmm12, (%rax)
+; AVX512-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm21, %zmm11 # 64-byte Folded Reload
+; AVX512-NEXT: # zmm11 = mem ^ (zmm21 & (zmm11 ^ mem))
+; AVX512-NEXT: vmovdqa64 %zmm16, %zmm1
; AVX512-NEXT: vmovdqa32 %zmm23, %zmm1 {%k1}
; AVX512-NEXT: vmovdqa32 %zmm0, %zmm11 {%k1}
; AVX512-NEXT: vmovdqa64 %zmm1, 64(%rax)
@@ -14887,741 +14901,781 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vmovdqa64 %zmm13, %zmm0
; AVX512-NEXT: vmovdqa32 %zmm5, %zmm0 {%k1}
; AVX512-NEXT: vmovdqa64 %zmm0, 64(%rax)
-; AVX512-NEXT: addq $1864, %rsp # imm = 0x748
+; AVX512-NEXT: addq $1800, %rsp # imm = 0x708
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: load_i16_stride7_vf64:
; AVX512-FCP: # %bb.0:
-; AVX512-FCP-NEXT: subq $1800, %rsp # imm = 0x708
-; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm22
-; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm26
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,5,9,0,12,0,0,0]
-; AVX512-FCP-NEXT: vpermd %zmm26, %zmm1, %zmm0
+; AVX512-FCP-NEXT: subq $2120, %rsp # imm = 0x848
+; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm16
+; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm17
+; AVX512-FCP-NEXT: vmovdqa 480(%rdi), %ymm1
+; AVX512-FCP-NEXT: vmovdqa 448(%rdi), %ymm3
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm1[2],ymm3[3,4,5],ymm1[6],ymm3[7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm20
+; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm23
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm1
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [8,1,12,5,12,5,14,15]
-; AVX512-FCP-NEXT: vpermd %zmm26, %zmm3, %zmm2
-; AVX512-FCP-NEXT: vpermd %zmm22, %zmm3, %zmm4
-; AVX512-FCP-NEXT: vmovdqa 480(%rdi), %ymm5
-; AVX512-FCP-NEXT: vmovdqa 448(%rdi), %ymm6
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm5[2],ymm6[3,4,5],ymm5[6],ymm6[7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm21
-; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm23
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4],xmm3[5],xmm5[6],xmm3[7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm5, %ymm6
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,6,7,12,13,2,3,16,17,30,31,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm4
-; AVX512-FCP-NEXT: vporq %ymm4, %ymm6, %ymm17
-; AVX512-FCP-NEXT: vmovdqa 672(%rdi), %xmm7
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,0,1,14,15,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm7, %xmm6
-; AVX512-FCP-NEXT: vmovdqa64 %xmm7, %xmm20
-; AVX512-FCP-NEXT: vpbroadcastw 700(%rdi), %xmm7
-; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm28 = xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm31
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm31[0,1,0,2]
+; AVX512-FCP-NEXT: vpermd %zmm17, %zmm3, %zmm4
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,6,7,12,13,2,3,16,17,30,31,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm4
+; AVX512-FCP-NEXT: vpor %ymm4, %ymm1, %ymm1
+; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa 672(%rdi), %xmm5
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,0,1,14,15,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm4
+; AVX512-FCP-NEXT: vmovdqa64 %xmm5, %xmm19
+; AVX512-FCP-NEXT: vpbroadcastw 700(%rdi), %xmm5
+; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; AVX512-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm21
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm21[0,1,0,2]
; AVX512-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm2
-; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm8
-; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm7
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1],ymm7[2],ymm8[3,4,5],ymm7[6],ymm8[7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5],xmm6[6],xmm5[7]
-; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm5, %ymm3
-; AVX512-FCP-NEXT: vporq %ymm2, %ymm3, %ymm16
-; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %xmm14
-; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm14, %xmm2
-; AVX512-FCP-NEXT: vpbroadcastw 252(%rdi), %xmm3
-; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vmovdqa 240(%rdi), %xmm15
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,0,1,6,7,8,9,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm8, %ymm18
-; AVX512-FCP-NEXT: vmovdqa64 %ymm7, %ymm19
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3,4,5],xmm3[6],xmm4[7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm3
-; AVX512-FCP-NEXT: vpor %ymm0, %ymm3, %ymm0
+; AVX512-FCP-NEXT: vpermd %zmm16, %zmm3, %zmm3
+; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm13
+; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm14
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1],ymm14[2],ymm13[3,4,5],ymm14[6],ymm13[7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm5[4],xmm3[5],xmm5[6],xmm3[7]
+; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm0
+; AVX512-FCP-NEXT: vporq %ymm2, %ymm0, %ymm18
+; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %xmm10
+; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm0
+; AVX512-FCP-NEXT: vpbroadcastw 252(%rdi), %xmm1
+; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT: vmovdqa 240(%rdi), %xmm12
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,5,9,0,12,0,0,0]
+; AVX512-FCP-NEXT: vpermd %zmm16, %zmm1, %zmm0
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,0,1,6,7,8,9,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2],ymm14[3],ymm13[4,5],ymm14[6],ymm13[7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm5
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3,4,5],xmm2[6],xmm5[7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm2
+; AVX512-FCP-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm2
-; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm12
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1],ymm2[2],ymm12[3,4,5],ymm2[6],ymm12[7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm6
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0,1,2,3],xmm6[4],xmm4[5],xmm6[6],xmm4[7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm6
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm7
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29]
-; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm9, %ymm9
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1,2,3,4,5,6],ymm9[7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm15[0],xmm14[1],xmm15[2,3,4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa %xmm15, %xmm7
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [2,3,0,1,14,15,14,15,8,9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm10, %xmm10
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9
-; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm13
-; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm15
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7]
+; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm0
+; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm11
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1],ymm0[2],ymm11[3,4,5],ymm0[6],ymm11[7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm22
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm7
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0,1,2,3],xmm7[4],xmm5[5],xmm7[6],xmm5[7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm7, %xmm7
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm8
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,4,5,6,7,4,5,6,7,6,7,12,13,16,17,18,19,20,21,22,23,20,21,22,23,22,23,28,29]
+; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm9, %ymm9
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm8[0,1,2,3,4,5,6],ymm9[7]
+; AVX512-FCP-NEXT: vmovdqa %xmm10, %xmm0
+; AVX512-FCP-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0],xmm10[1],xmm12[2,3,4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,0,1,14,15,14,15,8,9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm10, %xmm10
+; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm24
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm4
+; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm4
+; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm2
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm20
; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3,4,5],xmm9[6],xmm10[7]
-; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm9, %ymm8
-; AVX512-FCP-NEXT: vpermd %zmm22, %zmm1, %zmm1
-; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm1
-; AVX512-FCP-NEXT: vpor %ymm1, %ymm8, %ymm1
+; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm9, %ymm6
+; AVX512-FCP-NEXT: vpermd %zmm17, %zmm1, %zmm1
+; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1
+; AVX512-FCP-NEXT: vpor %ymm1, %ymm6, %ymm1
; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa 608(%rdi), %ymm4
-; AVX512-FCP-NEXT: vmovdqa 576(%rdi), %ymm5
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1],ymm4[2],ymm5[3,4,5],ymm4[6],ymm5[7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm9[4],xmm8[5],xmm9[6],xmm8[7]
-; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %ymm25
-; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm8
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm25[0,1,0,2]
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm6
-; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm21
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5,6],ymm6[7]
-; AVX512-FCP-NEXT: vmovdqa 688(%rdi), %xmm3
-; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm1
-; AVX512-FCP-NEXT: vmovdqa64 %xmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm3[0],xmm1[1],xmm3[2,3,4,5,6,7]
-; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm8
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm23 = [2,6,9,0,13,0,0,0]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm0
-; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm11
-; AVX512-FCP-NEXT: vmovdqu64 %ymm18, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm0
-; AVX512-FCP-NEXT: vmovdqu64 %ymm19, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0],ymm11[1],ymm0[2,3],ymm11[4],ymm0[5,6,7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm8
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm6[0],xmm8[1],xmm6[2,3,4,5],xmm8[6],xmm6[7]
-; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm29
-; AVX512-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermd %zmm26, %zmm23, %zmm9
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,2,3,4,5,10,11,16,17,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm9, %ymm9
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [4,5,2,3,0,1,14,15,12,13,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm8
-; AVX512-FCP-NEXT: vpor %ymm9, %ymm8, %ymm8
-; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2],ymm2[3],ymm12[4,5],ymm2[6],ymm12[7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm27
-; AVX512-FCP-NEXT: vmovdqa64 %ymm12, %ymm26
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2,3,4,5],xmm8[6],xmm9[7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm8, %xmm8
-; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm24
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm18 = [2,5,2,5,2,5,2,5]
-; AVX512-FCP-NEXT: vpermd %ymm31, %ymm18, %ymm12
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm8[0,1,2,3,4,5,6],ymm12[7]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm14[0],xmm7[0],xmm14[1],xmm7[1],xmm14[2],xmm7[2],xmm14[3],xmm7[3]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm7, %xmm20
-; AVX512-FCP-NEXT: vmovdqa64 %xmm14, %xmm19
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm8, %xmm14
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm12, %zmm2
-; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0],ymm13[1],ymm15[2,3],ymm13[4],ymm15[5,6,7]
-; AVX512-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vmovdqa %ymm13, %ymm2
+; AVX512-FCP-NEXT: vmovdqa 608(%rdi), %ymm6
+; AVX512-FCP-NEXT: vmovdqa 576(%rdi), %ymm8
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm6[2],ymm8[3,4,5],ymm6[6],ymm8[7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm9
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm9[4],xmm1[5],xmm9[6],xmm1[7]
+; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %ymm23
+; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm1
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm23[0,1,0,2]
+; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm5
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7]
+; AVX512-FCP-NEXT: vmovdqa 688(%rdi), %xmm2
+; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm15
+; AVX512-FCP-NEXT: vmovdqa64 %xmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm2[0],xmm15[1],xmm2[2,3,4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm3
+; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm5
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1
+; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [2,6,9,0,13,0,0,0]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5,6,7]
+; AVX512-FCP-NEXT: vmovdqa %ymm14, %ymm3
+; AVX512-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm14
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm14[1],xmm12[2,3,4,5],xmm14[6],xmm12[7]
-; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm12, %ymm10
-; AVX512-FCP-NEXT: vpermd %zmm22, %zmm23, %zmm7
-; AVX512-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm7, %ymm6
-; AVX512-FCP-NEXT: vpor %ymm6, %ymm10, %ymm6
-; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm23
-; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm4
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3,4,5],xmm6[6],xmm7[7]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm5
-; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm6, %xmm6
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512-FCP-NEXT: vpermd %ymm25, %ymm18, %ymm7
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm18
-; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm7, %xmm9
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm6, %zmm1
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm5
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3,4,5],xmm5[6],xmm1[7]
+; AVX512-FCP-NEXT: vpermd %zmm16, %zmm19, %zmm5
+; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm25
+; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,2,3,4,5,10,11,16,17,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm5, %ymm5
+; AVX512-FCP-NEXT: vmovdqa64 %ymm7, %ymm30
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [4,5,2,3,0,1,14,15,12,13,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm1
+; AVX512-FCP-NEXT: vpor %ymm5, %ymm1, %ymm1
; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0],ymm11[1],ymm0[2,3,4],ymm11[5],ymm0[6,7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm9
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm6[1],xmm9[2],xmm6[3],xmm9[4,5,6,7]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [10,3,6,15,12,13,6,15]
-; AVX512-FCP-NEXT: vpermd %zmm29, %zmm1, %zmm10
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm10, %ymm10
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [6,7,4,5,2,3,0,1,14,15,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm9, %ymm9
-; AVX512-FCP-NEXT: vpor %ymm10, %ymm9, %ymm0
-; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm3
-; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm5
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6,7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm10[1],xmm9[2,3,4,5],xmm10[6],xmm9[7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm9, %xmm9
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm14
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25]
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm31[0,1,1,3]
-; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm9, %ymm13
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm14 = [12,13,10,11,12,13,10,11,12,13,10,11,12,13,10,11]
-; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm8, %xmm8
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm13, %zmm0
+; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm10
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm11, %ymm24
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm5
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3,4,5],xmm1[6],xmm5[7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm1
+; AVX512-FCP-NEXT: vmovdqa64 %xmm5, %xmm26
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm29
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm21[0,1,1,2]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,2,3,4,5,6,7,0,1,2,3,0,1,14,15,16,17,18,19,20,21,22,23,16,17,18,19,16,17,30,31]
+; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm5, %ymm5
+; AVX512-FCP-NEXT: vmovdqa64 %ymm9, %ymm27
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3,4,5,6],ymm5[7]
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm12, %xmm22
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm12
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm5, %zmm5
+; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %ymm5
+; AVX512-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %ymm0
+; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm0[2],ymm5[3,4,5],ymm0[6],ymm5[7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm12
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm12[4],xmm5[5],xmm12[6],xmm5[7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm5
+; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm16
+; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0],ymm2[1],ymm15[2,3,4],ymm2[5],ymm15[6,7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm13
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm13[0],xmm8[1],xmm13[2],xmm8[3],xmm13[4,5,6,7]
-; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm8, %ymm8
-; AVX512-FCP-NEXT: vpermd %zmm22, %zmm1, %zmm12
-; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm12, %ymm6
-; AVX512-FCP-NEXT: vpor %ymm6, %ymm8, %ymm0
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm21 = [1,0,0,0,5,8,12,15]
+; AVX512-FCP-NEXT: vpermd %zmm0, %zmm21, %zmm14
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,128,128,128,128,128,128,128,128]
+; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm14, %ymm14
+; AVX512-FCP-NEXT: vpor %ymm5, %ymm14, %ymm0
+; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm0
+; AVX512-FCP-NEXT: vmovdqu64 %ymm20, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm4[1],ymm0[2,3],ymm4[4],ymm0[5,6,7]
+; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm12
+; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm14
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm14[1],xmm5[2,3,4,5],xmm14[6],xmm5[7]
+; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm5, %ymm5
+; AVX512-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpermd %zmm17, %zmm19, %zmm7
+; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm4
+; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm7, %ymm7
+; AVX512-FCP-NEXT: vpor %ymm7, %ymm5, %ymm5
+; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2],ymm6[3],ymm8[4,5],ymm6[6],ymm8[7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm8, %ymm30
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm7
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3,4,5],xmm5[6],xmm7[7]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm26, %xmm4
+; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm5, %xmm5
+; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm19
+; AVX512-FCP-NEXT: vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm23[0,1,1,2]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm4
+; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm7, %ymm7
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7]
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm15[0],xmm2[0],xmm15[1],xmm2[1],xmm15[2],xmm2[2],xmm15[3],xmm2[3]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm27
+; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm7, %xmm8
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm5, %zmm5
+; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa 704(%rdi), %ymm8
+; AVX512-FCP-NEXT: vmovdqa 736(%rdi), %ymm9
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1],ymm9[2],ymm8[3,4,5],ymm9[6],ymm8[7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm9, %ymm23
+; AVX512-FCP-NEXT: vmovdqa64 %ymm8, %ymm20
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm8
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm8[4],xmm5[5],xmm8[6],xmm5[7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm2
+; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm5, %ymm5
+; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm28
+; AVX512-FCP-NEXT: vpermd %zmm28, %zmm21, %zmm8
+; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm8, %ymm8
+; AVX512-FCP-NEXT: vpor %ymm5, %ymm8, %ymm2
+; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0],ymm13[1],ymm3[2,3,4],ymm13[5],ymm3[6,7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm9
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm9[0],xmm5[1],xmm9[2],xmm5[3],xmm9[4,5,6,7]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [10,3,6,15,12,13,6,15]
+; AVX512-FCP-NEXT: vpermd %zmm25, %zmm3, %zmm9
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm9, %ymm9
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [6,7,4,5,2,3,0,1,14,15,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm5, %ymm5
+; AVX512-FCP-NEXT: vpor %ymm5, %ymm9, %ymm2
+; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm2
+; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm4
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0],ymm2[1],ymm10[2,3],ymm2[4],ymm10[5,6,7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm9
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm9[1],xmm5[2,3,4,5],xmm9[6],xmm5[7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm5, %xmm5
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,2,3,4,5,6,7,8,9,10,11,2,3,8,9,16,17,18,19,20,21,22,23,24,25,26,27,18,19,24,25]
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm29[0,1,1,3]
+; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm9, %ymm14
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm5[0,1,2,3,4,5,6],ymm14[7]
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [12,13,10,11,12,13,10,11,12,13,10,11,12,13,10,11]
+; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm1
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm14, %zmm1
+; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm14
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0],xmm1[1],xmm14[2],xmm1[3],xmm14[4,5,6,7]
+; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm1, %ymm1
+; AVX512-FCP-NEXT: vpermd %zmm17, %zmm3, %zmm12
+; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm12, %ymm10
+; AVX512-FCP-NEXT: vpor %ymm1, %ymm10, %ymm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm1
-; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm2
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6,7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm8
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2,3,4,5],xmm8[6],xmm6[7]
-; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm6, %xmm6
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm25[0,1,1,3]
-; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm8, %ymm10
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm10[7]
-; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm7, %xmm7
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm0
+; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm3
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0],ymm3[1],ymm6[2,3],ymm3[4],ymm6[5,6,7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm10
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm10[1],xmm1[2,3,4,5],xmm10[6],xmm1[7]
+; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm1, %xmm1
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm10
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm19[0,1,1,3]
+; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm1, %ymm11
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7]
+; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm7, %xmm5
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm10, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm5[1],ymm3[2,3,4],ymm5[5],ymm3[6,7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1],xmm3[2],xmm0[3],xmm3[4,5,6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm2[1],ymm4[2,3,4],ymm2[5],ymm4[6,7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5,6,7]
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm3
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
-; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm9, %ymm6
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm6[7]
-; AVX512-FCP-NEXT: vpbroadcastw 232(%rdi), %xmm6
-; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm5
-; AVX512-FCP-NEXT: vpsrlq $48, %xmm20, %xmm9
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3]
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm3, %zmm27
-; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm3
-; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm3, %ymm4
-; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm21
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm24 = [3,6,10,13,3,6,10,13]
-; AVX512-FCP-NEXT: vpermd %zmm21, %zmm24, %zmm6
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm6, %ymm6
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5,6],ymm4[7]
-; AVX512-FCP-NEXT: movw $992, %ax # imm = 0x3E0
-; AVX512-FCP-NEXT: kmovw %eax, %k1
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm28, %zmm4, %zmm17 {%k1}
-; AVX512-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7]
-; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm2
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,4,5,6,7,8,9,10,11,4,5,10,11,16,17,18,19,20,21,22,23,24,25,26,27,20,21,26,27]
+; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm9, %ymm5
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm5[7]
+; AVX512-FCP-NEXT: vpbroadcastw 232(%rdi), %xmm5
+; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm4
+; AVX512-FCP-NEXT: vpsrlq $48, %xmm22, %xmm9
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3]
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm5, %zmm2, %zmm22
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm3[1],ymm6[2,3,4],ymm3[5],ymm6[6,7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7]
+; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm0
; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm8, %ymm1
+; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm1
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
; AVX512-FCP-NEXT: vpbroadcastw 680(%rdi), %xmm1
-; AVX512-FCP-NEXT: vmovdqa64 %xmm18, %xmm9
-; AVX512-FCP-NEXT: vpsrlq $48, %xmm18, %xmm2
+; AVX512-FCP-NEXT: vpsrlq $48, %xmm27, %xmm2
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm26
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,3,3,3,0,3,7,7]
-; AVX512-FCP-NEXT: vpermd %ymm31, %ymm2, %ymm0
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25]
-; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm6
-; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,5,9,12,2,5,9,12]
-; AVX512-FCP-NEXT: vpermd %zmm4, %zmm1, %zmm8
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,0,1,6,7,8,9,14,15,14,15,14,15,14,15,16,17,16,17,22,23,24,25,30,31,30,31,30,31,30,31]
-; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm8, %ymm8
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5,6,7],ymm8[8,9,10,11,12],ymm6[13,14,15]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm15
-; AVX512-FCP-NEXT: vpsrld $16, %xmm19, %xmm8
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7]
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm6, %zmm3
-; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa 704(%rdi), %ymm3
-; AVX512-FCP-NEXT: vmovdqa 736(%rdi), %ymm8
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1],ymm3[2,3],ymm8[4,5],ymm3[6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm8, %ymm18
-; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm20
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm8
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3],xmm8[4],xmm6[5],xmm8[6,7]
-; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,9,6,7,4,5,10,11,8,9,6,7,4,5,10,11]
-; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm8
-; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm30
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,4,7,11,14]
-; AVX512-FCP-NEXT: vpermd %zmm30, %zmm6, %zmm13
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19]
-; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm13, %ymm13
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm13[0,1,2],xmm8[3,4,5,6],xmm13[7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm13[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm13, %ymm7
-; AVX512-FCP-NEXT: vpermd %zmm4, %zmm24, %zmm13
-; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm13, %ymm11
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0
+; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,3,3,3,0,3,7,7]
+; AVX512-FCP-NEXT: vpermd %ymm29, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vmovdqa64 %ymm29, %ymm30
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm0[0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25]
+; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm8
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [2,5,9,12,2,5,9,12]
+; AVX512-FCP-NEXT: vpermd %zmm8, %zmm15, %zmm6
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,1,0,1,6,7,8,9,14,15,14,15,14,15,14,15,16,17,16,17,22,23,24,25,30,31,30,31,30,31,30,31]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7],ymm6[8,9,10,11,12],ymm5[13,14,15]
+; AVX512-FCP-NEXT: vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm29
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm0
+; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm0, %ymm5
+; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm17
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [3,6,10,13,3,6,10,13]
+; AVX512-FCP-NEXT: vpermd %zmm17, %zmm9, %zmm10
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm10
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3,4,5,6],ymm5[7]
+; AVX512-FCP-NEXT: movw $992, %ax # imm = 0x3E0
+; AVX512-FCP-NEXT: kmovw %eax, %k1
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm0 {%k1} # 16-byte Folded Reload
+; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm3
+; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm0
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm10
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0,1,2],xmm5[3],xmm10[4],xmm5[5],xmm10[6,7]
+; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm12 = [8,9,6,7,4,5,10,11,8,9,6,7,4,5,10,11]
+; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm5, %xmm5
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [0,0,0,0,4,7,11,14]
+; AVX512-FCP-NEXT: vpermd %zmm28, %zmm16, %zmm10
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19]
+; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm10, %ymm10
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0,1,2],xmm5[3,4,5,6],xmm10[7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm10[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm2
+; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm5
+; AVX512-FCP-NEXT: vpermd %zmm5, %zmm15, %zmm14
+; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm10 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31]
+; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm14, %ymm14
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [4,5,6,11]
+; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm4, %zmm2
+; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm7
+; AVX512-FCP-NEXT: vpermd %zmm8, %zmm9, %zmm14
+; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm14, %ymm11
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3,4,5,6],ymm7[7]
-; AVX512-FCP-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm16 {%k1} # 16-byte Folded Reload
-; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermd %ymm25, %ymm2, %ymm2
+; AVX512-FCP-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm18 {%k1} # 16-byte Folded Reload
+; AVX512-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm13[0,1],ymm2[2,3],ymm13[4,5],ymm2[6,7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm11
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm11[0,1,2],xmm7[3],xmm11[4],xmm7[5],xmm11[6,7]
+; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm7, %xmm7
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512-FCP-NEXT: vpermd %zmm31, %zmm16, %zmm11
+; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm11, %ymm1
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3,4,5,6],xmm1[7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm6
+; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm1
+; AVX512-FCP-NEXT: vpermd %zmm1, %zmm15, %zmm7
+; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm7, %ymm7
+; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm4, %zmm6
+; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm13[0,1,2],ymm2[3],ymm13[4,5],ymm2[6],ymm13[7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm13, %ymm19
+; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm20
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm11
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm11[0,1,2,3],xmm7[4],xmm11[5],xmm7[6],xmm11[7]
+; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm11 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13]
+; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm7, %xmm7
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [1,0,0,0,4,8,11,15]
+; AVX512-FCP-NEXT: vpermd %zmm31, %zmm16, %zmm14
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17]
+; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm14, %ymm14
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm14[0,1,2],xmm7[3,4,5,6],xmm14[7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm14[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm6
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm24 = [2,6,9,13,2,6,9,13]
+; AVX512-FCP-NEXT: vpermd %zmm1, %zmm24, %zmm14
+; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm12 = [18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29]
+; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm14, %ymm14
+; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm4, %zmm6
+; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm3[0,1,2],ymm0[3],ymm3[4,5],ymm0[6],ymm3[7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm18
+; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm21
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm14, %xmm13
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3],xmm14[4],xmm13[5],xmm14[6],xmm13[7]
+; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm13, %xmm11
+; AVX512-FCP-NEXT: vpermd %zmm28, %zmm16, %zmm13
+; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm13, %ymm2
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm2[0,1,2],xmm11[3,4,5,6],xmm2[7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-FCP-NEXT: vpermd %zmm5, %zmm24, %zmm2
; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm2
-; AVX512-FCP-NEXT: vpermd %zmm21, %zmm1, %zmm7
-; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm7, %ymm0
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7],ymm0[8,9,10,11,12],ymm2[13,14,15]
-; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; AVX512-FCP-NEXT: vpsrld $16, %xmm14, %xmm2
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7]
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0
+; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [0,4,7,0,0,4,7,0]
-; AVX512-FCP-NEXT: # ymm12 = mem[0,1,0,1]
-; AVX512-FCP-NEXT: vpermd %ymm31, %ymm12, %ymm0
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27]
-; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm23
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [2,6,9,13,2,6,9,13]
-; AVX512-FCP-NEXT: vpermd %zmm4, %zmm17, %zmm7
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,2,3,4,5,10,11,12,13,12,13,12,13,12,13,20,21,18,19,20,21,26,27,28,29,28,29,28,29,28,29]
-; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm7, %ymm7
-; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm19
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm0[5,6,7],ymm7[8,9,10,11,12],ymm0[13,14,15]
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm15[4],xmm5[4],xmm15[5],xmm5[5],xmm15[6],xmm5[6],xmm15[7],xmm5[7]
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} xmm22 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm2
-; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm5
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0
+; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: vpermd %zmm1, %zmm9, %zmm1
+; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm1
+; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm0
-; AVX512-FCP-NEXT: vpermd %zmm0, %zmm1, %zmm7
-; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm5 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31]
-; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm7, %ymm7
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4,5],ymm7[6,7]
-; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %ymm4
-; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %ymm15
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm4[2,3],ymm15[4,5],ymm4[6,7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm11
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm13[3],xmm11[4],xmm13[5],xmm11[6,7]
-; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm11, %xmm11
-; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm16
-; AVX512-FCP-NEXT: vpermd %zmm16, %zmm6, %zmm6
-; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm6, %ymm6
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm6[0,1,2],xmm11[3,4,5,6],xmm6[7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm10
-; AVX512-FCP-NEXT: vpermd %zmm10, %zmm1, %zmm1
-; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm1
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-FCP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2],ymm4[3],ymm15[4,5],ymm4[6],ymm15[7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm6
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm1[4],xmm6[5],xmm1[6],xmm6[7]
-; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm6 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13]
-; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [1,0,0,0,4,8,11,15]
-; AVX512-FCP-NEXT: vpermd %zmm16, %zmm11, %zmm13
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17]
-; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm13, %ymm13
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0,1,2],xmm1[3,4,5,6],xmm13[7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm13[4,5,6,7]
-; AVX512-FCP-NEXT: vpermd %zmm10, %zmm17, %zmm13
-; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29]
-; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm13, %ymm13
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm13[6,7]
-; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm7
-; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm8
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2],ymm8[3],ymm7[4,5],ymm8[6],ymm7[7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm13
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0,1,2,3],xmm1[4],xmm13[5],xmm1[6],xmm13[7]
-; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1
-; AVX512-FCP-NEXT: vpermd %zmm30, %zmm11, %zmm6
-; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm6, %ymm3
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4,5,6],xmm3[7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
-; AVX512-FCP-NEXT: vpermd %zmm0, %zmm17, %zmm3
-; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm2
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm15[2],ymm4[3,4,5],ymm15[6],ymm4[7]
-; AVX512-FCP-NEXT: vmovdqa %ymm15, %ymm13
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,0,0,0,5,8,12,15]
-; AVX512-FCP-NEXT: vpermd %zmm16, %zmm3, %zmm6
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,128,128,128,128,128,128,128,128]
-; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm6, %ymm6
-; AVX512-FCP-NEXT: vpor %ymm6, %ymm1, %ymm1
-; AVX512-FCP-NEXT: vpermd %zmm10, %zmm24, %zmm6
-; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm6, %ymm6
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm6[6,7]
-; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm7[2],ymm8[3,4,5],ymm7[6],ymm8[7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm10
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm6
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4],xmm1[5],xmm6[6],xmm1[7]
-; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX512-FCP-NEXT: vpermd %zmm30, %zmm3, %zmm2
-; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm2, %ymm2
-; AVX512-FCP-NEXT: vpor %ymm2, %ymm1, %ymm1
-; AVX512-FCP-NEXT: vpermd %zmm0, %zmm24, %zmm0
-; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpermd %ymm25, %ymm12, %ymm0
-; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm1
-; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vpermd %zmm21, %zmm17, %zmm1
-; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm2
-; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX512-FCP-NEXT: vpermd %zmm5, %zmm9, %zmm1
+; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm1
+; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm0
+; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,3,3,3,0,3,7,7]
+; AVX512-FCP-NEXT: vpermd %ymm6, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25]
+; AVX512-FCP-NEXT: vpermd %zmm17, %zmm15, %zmm1
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,0,1,6,7,8,9,14,15,14,15,14,15,14,15,16,17,16,17,22,23,24,25,30,31,30,31,30,31,30,31]
; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15]
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm14[4],xmm9[4],xmm14[5],xmm9[5],xmm14[6],xmm9[6],xmm14[7],xmm9[7]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm2
-; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; AVX512-FCP-NEXT: vpsrld $16, %xmm15, %xmm1
+; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm7
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa 416(%rdi), %ymm7
-; AVX512-FCP-NEXT: vmovdqa 384(%rdi), %ymm15
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm7[2],ymm15[3,4,5],ymm7[6],ymm15[7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
-; AVX512-FCP-NEXT: vmovdqa %ymm13, %ymm11
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm13[3],ymm4[4,5],ymm13[6],ymm4[7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,14,15,12,13,10,11,8,9]
-; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,11,2,11,12,5,8,9]
-; AVX512-FCP-NEXT: vpermd %zmm16, %zmm3, %zmm6
+; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,4,7,0,0,4,7,0]
+; AVX512-FCP-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512-FCP-NEXT: vpermd %ymm30, %ymm1, %ymm0
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27]
+; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm2
+; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm23
+; AVX512-FCP-NEXT: vpermd %zmm8, %zmm24, %zmm3
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [4,5,2,3,4,5,10,11,12,13,12,13,12,13,12,13,20,21,18,19,20,21,26,27,28,29,28,29,28,29,28,29]
+; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm3
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7],ymm3[8,9,10,11,12],ymm2[13,14,15]
+; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %xmm29, %xmm4
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm3
+; AVX512-FCP-NEXT: vmovdqa64 %xmm8, %xmm27
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa 416(%rdi), %ymm8
+; AVX512-FCP-NEXT: vmovdqa 384(%rdi), %ymm3
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1],ymm8[2],ymm3[3,4,5],ymm8[6],ymm3[7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm29
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm2
+; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm3
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm9
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm9[0],xmm5[1],xmm9[2,3,4,5],xmm5[6],xmm9[7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,0,1,0,1,0,1,14,15,12,13,10,11,8,9]
+; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm4, %xmm4
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [2,11,2,11,12,5,8,9]
+; AVX512-FCP-NEXT: vpermd %zmm31, %zmm10, %zmm11
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128]
-; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm6, %ymm6
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7],ymm6[8,9,10],ymm0[11,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm11, %ymm11
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0,1,2],ymm4[3,4,5,6,7],ymm11[8,9,10],ymm4[11,12,13,14,15]
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,2,3,0,1,14,15,12,13,10,11,128,128]
-; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm1, %xmm1
-; AVX512-FCP-NEXT: vpor %ymm6, %ymm1, %ymm1
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vmovdqa 864(%rdi), %ymm1
+; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm5, %xmm5
+; AVX512-FCP-NEXT: vpor %ymm5, %ymm11, %ymm5
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vmovdqa 864(%rdi), %ymm4
; AVX512-FCP-NEXT: vmovdqa 832(%rdi), %ymm5
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm1[2],ymm5[3,4,5],ymm1[6],ymm5[7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm21
-; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm19
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
-; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512-FCP-NEXT: vpermd %zmm30, %zmm3, %zmm1
-; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm1, %ymm1
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm10[3],ymm8[4,5],ymm10[6],ymm8[7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm23
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3,4,5],xmm2[6],xmm3[7]
-; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm2, %xmm2
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
-; AVX512-FCP-NEXT: vpor %ymm1, %ymm2, %ymm1
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1],ymm4[2],ymm5[3,4,5],ymm4[6],ymm5[7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm19
+; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm30
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm11, %xmm14
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm14[4],xmm11[5],xmm14[6],xmm11[7]
+; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm11, %xmm9
+; AVX512-FCP-NEXT: vpermd %zmm28, %zmm10, %zmm10
+; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm10, %ymm10
+; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm4
+; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm5
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2,3,4,5],xmm11[6],xmm12[7]
+; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm11, %xmm11
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7],ymm10[8,9,10],ymm9[11,12,13,14,15]
+; AVX512-FCP-NEXT: vpor %ymm10, %ymm11, %ymm10
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm9[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill
+; AVX512-FCP-NEXT: vpermd %ymm6, %ymm1, %ymm1
+; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm4
+; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1
+; AVX512-FCP-NEXT: vpermd %zmm17, %zmm24, %zmm6
+; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm6, %ymm0
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm15[4],xmm7[4],xmm15[5],xmm7[5],xmm15[6],xmm7[6],xmm15[7],xmm7[7]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm4
+; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm1
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm15[2],ymm13[3,4],ymm15[5],ymm13[6,7]
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm2
-; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm25
+; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm6
+; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm24
; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,3,7,10,14,0,0,0]
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermd %zmm24, %zmm5, %zmm3
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
-; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm3
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm27 = zmm27 ^ (zmm29 & (zmm27 ^ zmm1))
-; AVX512-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2],ymm7[3],ymm15[4,5],ymm7[6],ymm15[7]
-; AVX512-FCP-NEXT: vmovdqa %ymm7, %ymm12
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3,4,5],xmm1[6],xmm3[7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0],ymm4[1],ymm11[2,3],ymm4[4],ymm11[5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm11, %ymm22
-; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm17
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm6
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3,4,5],xmm6[6],xmm3[7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11]
-; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,3,7,10,14,0,0,0]
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512-FCP-NEXT: vpermd %zmm27, %zmm6, %zmm7
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
+; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm7, %ymm7
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm7[2,3,4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm20 & (zmm22 ^ zmm1))
+; AVX512-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa64 %ymm29, %ymm5
+; AVX512-FCP-NEXT: vmovdqa %ymm8, %ymm9
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2],ymm8[3],ymm5[4,5],ymm8[6],ymm5[7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm7
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0],xmm1[1],xmm7[2,3,4,5],xmm1[6],xmm7[7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm23
+; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm22
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2,3,4,5],xmm8[6],xmm7[7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11]
+; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm1
; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [2,0,0,0,6,9,13,0]
-; AVX512-FCP-NEXT: vpermd %zmm16, %zmm20, %zmm13
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,128,128,128,128,128,128,128,128,128,128]
-; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm13, %ymm13
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0,1,2],ymm1[3,4,5,6,7],ymm13[8,9,10],ymm1[11,12,13,14,15]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,2,3,0,1,14,15,12,13,128,128]
-; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm3
-; AVX512-FCP-NEXT: vpor %ymm3, %ymm13, %ymm3
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,0,0,0,6,9,13,0]
+; AVX512-FCP-NEXT: vpermd %zmm31, %zmm3, %zmm10
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,128,128,128,128,128,128,128,128,128,128]
+; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm10
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0,1,2],ymm1[3,4,5,6,7],ymm10[8,9,10],ymm1[11,12,13,14,15]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,2,3,0,1,14,15,12,13,128,128]
+; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm7, %xmm7
+; AVX512-FCP-NEXT: vpor %ymm7, %ymm10, %ymm7
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7]
; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm7[2],ymm11[3,4],ymm7[5],ymm11[6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm25, %xmm13
-; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm1, %xmm3
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm14[2],ymm4[3,4],ymm14[5],ymm4[6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm10
+; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm7
; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermd %zmm25, %zmm5, %zmm3
-; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm3
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4,5,6,7]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm26 = zmm26 ^ (zmm29 & (zmm26 ^ zmm1))
-; AVX512-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm5
-; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm4
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3,4,5],xmm1[6],xmm3[7]
-; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1
-; AVX512-FCP-NEXT: vpermd %zmm30, %zmm20, %zmm3
-; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm0
-; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm14
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0],ymm14[1],ymm10[2,3],ymm14[4],ymm10[5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm20
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm6
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3,4,5],xmm6[6],xmm3[7]
-; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm2
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3]
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512-FCP-NEXT: vpermd %zmm29, %zmm6, %zmm6
+; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm6, %ymm6
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm6[2,3,4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm20 & (zmm2 ^ zmm1))
+; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm2
+; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm12
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2],ymm2[3],ymm12[4,5],ymm2[6],ymm12[7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm6
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0],xmm1[1],xmm6[2,3,4,5],xmm1[6],xmm6[7]
+; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm1
+; AVX512-FCP-NEXT: vpermd %zmm28, %zmm3, %zmm6
+; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm6, %ymm6
+; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm11
+; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm3
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0],ymm11[1],ymm3[2,3],ymm11[4],ymm3[5,6,7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2,3,4,5],xmm8[6],xmm7[7]
+; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm7, %xmm0
; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
-; AVX512-FCP-NEXT: vpor %ymm0, %ymm2, %ymm0
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7],ymm6[8,9,10],ymm1[11,12,13,14,15]
+; AVX512-FCP-NEXT: vpor %ymm6, %ymm0, %ymm0
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm8, %ymm23
-; AVX512-FCP-NEXT: vmovdqa64 %ymm9, %ymm26
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm15[2,3],ymm13[4,5],ymm15[6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm15, %ymm18
+; AVX512-FCP-NEXT: vmovdqa64 %ymm13, %ymm19
; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm1, %xmm1
-; AVX512-FCP-NEXT: vmovdqa64 %xmm13, %xmm27
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm28
+; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm1
+; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm25
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm0
+; AVX512-FCP-NEXT: vmovdqa64 %xmm6, %xmm24
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm21 = [0,4,7,11,14,0,0,0]
-; AVX512-FCP-NEXT: vpermd %zmm24, %zmm21, %zmm2
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29]
-; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm2
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,4,7,11,14,0,0,0]
+; AVX512-FCP-NEXT: vpermd %zmm27, %zmm1, %zmm7
+; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm30
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29]
+; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm7, %ymm7
+; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm21
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3,4,5,6,7]
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm15[1],ymm12[2,3],ymm15[4],ymm12[5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm12, %ymm19
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm6
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3,4,5],xmm6[6],xmm0[7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm9
-; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm2
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0],ymm2[1],ymm9[2,3,4],ymm2[5],ymm9[6,7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm13
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm13[0],xmm6[1],xmm13[2],xmm6[3],xmm13[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13]
-; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm0
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0],ymm5[1],ymm9[2,3],ymm5[4],ymm9[5,6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm9, %ymm16
+; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm17
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm9
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm9[1],xmm0[2,3,4,5],xmm9[6],xmm0[7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm7
+; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm6
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm6[0],ymm7[1],ymm6[2,3,4],ymm7[5],ymm6[6,7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3],xmm10[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13]
+; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm0, %xmm0
; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [3,0,0,0,6,10,13,0]
-; AVX512-FCP-NEXT: vpermd %zmm16, %zmm18, %zmm1
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,20,21,26,27,128,128,128,128,128,128,128,128,128,128]
-; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,6,7,4,5,2,3,0,1,14,15,128,128]
-; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm6, %xmm6
-; AVX512-FCP-NEXT: vpor %ymm1, %ymm6, %ymm1
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm31
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm7[2,3],ymm11[4,5],ymm7[6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm7, %ymm22
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm6
-; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1
-; AVX512-FCP-NEXT: vmovdqa64 %xmm28, %xmm6
-; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm0
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm17 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: vpermd %zmm25, %zmm21, %zmm1
-; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm1, %ymm1
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm27 = [3,0,0,0,6,10,13,0]
+; AVX512-FCP-NEXT: vpermd %zmm31, %zmm27, %zmm10
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,20,21,26,27,128,128,128,128,128,128,128,128,128,128]
+; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm10, %ymm10
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7],ymm10[8,9,10],ymm0[11,12,13,14,15]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,6,7,4,5,2,3,0,1,14,15,128,128]
+; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm9, %xmm9
+; AVX512-FCP-NEXT: vpor %ymm10, %ymm9, %ymm9
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm26
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm14[2,3],ymm4[4,5],ymm14[6,7]
+; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm10
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm9
+; AVX512-FCP-NEXT: vmovdqa64 %xmm25, %xmm4
+; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm9, %xmm5
+; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm4
+; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
+; AVX512-FCP-NEXT: vpermd %zmm29, %zmm1, %zmm1
+; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm4
+; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3,4,5,6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm28
-; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm27
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm8
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3,4,5],xmm8[6],xmm0[7]
-; AVX512-FCP-NEXT: vinserti64x4 $1, (%rsp), %zmm0, %zmm21 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm0
-; AVX512-FCP-NEXT: vpermd %zmm30, %zmm18, %zmm8
-; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm8, %ymm3
-; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm6
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0],ymm14[1],ymm6[2,3,4],ymm14[5],ymm6[6,7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm12
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm12[0],xmm8[1],xmm12[2],xmm8[3],xmm12[4,5,6,7]
-; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm8, %xmm4
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0],ymm12[1],ymm2[2,3],ymm12[4],ymm2[5,6,7]
+; AVX512-FCP-NEXT: vmovdqa %ymm2, %ymm9
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm5
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3,4,5],xmm5[6],xmm0[7]
+; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm0, %xmm0
+; AVX512-FCP-NEXT: vpermd %zmm28, %zmm27, %zmm4
+; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm2
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0],ymm11[1],ymm3[2,3,4],ymm11[5],ymm3[6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm22
+; AVX512-FCP-NEXT: vmovdqa64 %ymm11, %ymm21
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2],xmm4[3],xmm5[4,5,6,7]
+; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm4, %xmm3
; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7],ymm3[8,9,10],ymm0[11,12,13,14,15]
-; AVX512-FCP-NEXT: vpor %ymm3, %ymm4, %ymm3
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm18
-; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm0
-; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm3
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5],ymm0[6],ymm3[7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
+; AVX512-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm23
+; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm0
+; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm2
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,10,11,6,7,4,5,6,7]
-; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7]
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,4,8,11,15,0,0,0]
-; AVX512-FCP-NEXT: vpermd %zmm24, %zmm3, %zmm8
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
-; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm8
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3,4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm5
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm15[1],ymm5[2,3,4],ymm15[5],ymm5[6,7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm7
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2],xmm5[3],xmm7[4,5,6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1],ymm2[2],ymm9[3,4],ymm2[5],ymm9[6,7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [8,9,8,9,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm7, %xmm8
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,3,1,3,4,5,6,7]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,1,10,3,14,7,10,3]
-; AVX512-FCP-NEXT: vpermd %zmm16, %zmm9, %zmm15
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,4,8,11,15,0,0,0]
+; AVX512-FCP-NEXT: vpermd %zmm30, %zmm2, %zmm3
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
+; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm3
+; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm5
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3,4],ymm5[5],ymm3[6,7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2],xmm3[3],xmm5[4,5,6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,8,9,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm5, %xmm8
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,1,3,4,5,6,7]
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm27 = [0,1,10,3,14,7,10,3]
+; AVX512-FCP-NEXT: vpermd %zmm31, %zmm27, %zmm15
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31]
; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm15, %ymm15
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0],ymm7[1,2],ymm15[3,4,5,6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0],ymm5[1,2],ymm15[3,4,5,6,7]
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15]
-; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm5, %xmm5
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3,4,5,6,7],ymm7[8,9,10],ymm5[11,12,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm5[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm2
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2],ymm2[3],ymm11[4,5],ymm2[6],ymm11[7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm13
-; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm13, %xmm4
+; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm3, %xmm3
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3,4,5,6,7],ymm5[8,9,10],ymm3[11,12,13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2],ymm14[3],ymm10[4,5],ymm14[6],ymm10[7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm14
+; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm14, %xmm4
; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,4,6,7]
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: vpermd %zmm25, %zmm3, %zmm3
-; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm3, %ymm3
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm3[2,3,4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm2
-; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm3
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm10
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm10[0],xmm3[1],xmm10[2],xmm3[3],xmm10[4,5,6,7]
-; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm3, %xmm3
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm6[0,1],ymm14[2],ymm6[3,4],ymm14[5],ymm6[6,7]
-; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm10, %xmm11
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm10
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,3,1,3,4,5,6,7]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
-; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: vpermd %zmm30, %zmm9, %zmm9
-; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm9, %ymm8
-; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} zmm9 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615]
+; AVX512-FCP-NEXT: vpermd %zmm29, %zmm2, %zmm2
+; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm2, %ymm2
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5,6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm12[1],ymm9[2,3,4],ymm12[5],ymm9[6,7]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2],xmm4[3],xmm5[4,5,6,7]
+; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm4, %xmm4
+; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} zmm5 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615]
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm10 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: # zmm10 = mem ^ (zmm5 & (zmm10 ^ mem))
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm12 # 64-byte Folded Reload
-; AVX512-FCP-NEXT: # zmm12 = mem ^ (zmm9 & (zmm12 ^ mem))
+; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm12 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: # zmm12 = mem ^ (zmm5 & (zmm12 ^ mem))
+; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm5 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: # zmm5 = zmm5 ^ (zmm11 & (zmm5 ^ mem))
+; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm14 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: # zmm14 = zmm14 ^ (zmm11 & (zmm14 ^ mem))
+; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (zmm11 & (zmm15 ^ zmm10))
+; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (zmm11 & (zmm16 ^ zmm12))
+; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm6
+; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm7
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7]
+; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm11, %xmm6
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm11, %xmm11
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,3,1,3,4,5,6,7]
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3]
+; AVX512-FCP-NEXT: vpermd %zmm28, %zmm27, %zmm11
+; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm11, %ymm8
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0],ymm6[1,2],ymm8[3,4,5,6,7]
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3,4,5,6,7],ymm6[8,9,10],ymm4[11,12,13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7]
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm13 # 64-byte Folded Reload
-; AVX512-FCP-NEXT: # zmm13 = mem ^ (zmm9 & (zmm13 ^ mem))
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm17 # 64-byte Folded Reload
-; AVX512-FCP-NEXT: # zmm17 = zmm17 ^ (zmm9 & (zmm17 ^ mem))
-; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm21 # 64-byte Folded Reload
-; AVX512-FCP-NEXT: # zmm21 = zmm21 ^ (zmm9 & (zmm21 ^ mem))
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm9 & (zmm5 ^ zmm12))
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm9 & (zmm11 ^ zmm13))
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm10[1,2],ymm8[3,4,5,6,7]
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0,1,2],ymm3[3,4,5,6,7],ymm8[8,9,10],ymm3[11,12,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm19 # 64-byte Folded Reload
-; AVX512-FCP-NEXT: # zmm19 = zmm19 ^ (zmm29 & (zmm19 ^ mem))
-; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm9 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm13 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: # zmm13 = zmm13 ^ (zmm20 & (zmm13 ^ mem))
+; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: vinserti64x4 $1, (%rsp), %zmm0, %zmm8 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 32-byte Folded Reload
; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm12 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm31, %zmm0, %zmm16
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm18, %zmm0, %zmm18
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm26, %zmm0, %zmm10
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm23, %zmm0, %zmm9
; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4
+; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm18 # 32-byte Folded Reload
; AVX512-FCP-NEXT: movw $-512, %ax # imm = 0xFE00
; AVX512-FCP-NEXT: kmovw %eax, %k1
-; AVX512-FCP-NEXT: vmovdqa32 %zmm8, %zmm19 {%k1}
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm2 # 64-byte Folded Reload
-; AVX512-FCP-NEXT: # zmm2 = zmm2 ^ (zmm29 & (zmm2 ^ mem))
-; AVX512-FCP-NEXT: vmovdqa32 %zmm9, %zmm2 {%k1}
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm8 # 64-byte Folded Reload
-; AVX512-FCP-NEXT: # zmm8 = zmm8 ^ (zmm29 & (zmm8 ^ mem))
-; AVX512-FCP-NEXT: vmovdqa32 %zmm10, %zmm8 {%k1}
+; AVX512-FCP-NEXT: vmovdqa32 %zmm18, %zmm13 {%k1}
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm19 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: # zmm19 = zmm19 ^ (zmm20 & (zmm19 ^ mem))
+; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm18 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: vmovdqa32 %zmm18, %zmm19 {%k1}
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm18 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: # zmm18 = zmm18 ^ (zmm20 & (zmm18 ^ mem))
+; AVX512-FCP-NEXT: vmovdqa32 %zmm6, %zmm18 {%k1}
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm6 # 64-byte Folded Reload
-; AVX512-FCP-NEXT: # zmm6 = zmm6 ^ (zmm29 & (zmm6 ^ mem))
-; AVX512-FCP-NEXT: vmovdqa32 %zmm12, %zmm6 {%k1}
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa32 %zmm13, %zmm9 {%k1}
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa32 %zmm15, %zmm10 {%k1}
-; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm1 # 64-byte Folded Reload
-; AVX512-FCP-NEXT: # zmm1 = mem ^ (zmm29 & (zmm1 ^ mem))
-; AVX512-FCP-NEXT: vmovdqa32 %zmm18, %zmm1 {%k1}
-; AVX512-FCP-NEXT: vmovdqa64 %zmm21, (%rsi)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 64(%rsi)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 64(%rdx)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rdx)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 64(%rcx)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm19, (%rcx)
+; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm6 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: # zmm6 = zmm6 ^ (zmm20 & (zmm6 ^ mem))
+; AVX512-FCP-NEXT: vmovdqa32 %zmm8, %zmm6 {%k1}
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqa32 %zmm11, %zmm7 {%k1}
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqa32 %zmm12, %zmm8 {%k1}
+; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm1 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: # zmm1 = mem ^ (zmm20 & (zmm1 ^ mem))
+; AVX512-FCP-NEXT: vmovdqa32 %zmm9, %zmm1 {%k1}
+; AVX512-FCP-NEXT: vmovdqa64 %zmm14, (%rsi)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 64(%rsi)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 64(%rdx)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm15, (%rdx)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 64(%rcx)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm13, (%rcx)
; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 64(%r8)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm8, (%r8)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 64(%r9)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm9, (%r9)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm18, (%r8)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 64(%r9)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm7, (%r9)
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax)
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm1 # 64-byte Folded Reload
-; AVX512-FCP-NEXT: # zmm1 = mem ^ (zmm29 & (zmm1 ^ mem))
-; AVX512-FCP-NEXT: vmovdqa32 %zmm16, %zmm1 {%k1}
+; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm1 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: # zmm1 = mem ^ (zmm20 & (zmm1 ^ mem))
+; AVX512-FCP-NEXT: vmovdqa32 %zmm10, %zmm1 {%k1}
; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%rax)
-; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm4 # 64-byte Folded Reload
-; AVX512-FCP-NEXT: # zmm4 = mem ^ (zmm29 & (zmm4 ^ mem))
-; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm4 {%k1}
+; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm2 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: # zmm2 = mem ^ (zmm20 & (zmm2 ^ mem))
+; AVX512-FCP-NEXT: vmovdqa32 %zmm4, %zmm2 {%k1}
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 64(%rax)
-; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm29, %zmm0 # 64-byte Folded Reload
-; AVX512-FCP-NEXT: # zmm0 = mem ^ (zmm29 & (zmm0 ^ mem))
-; AVX512-FCP-NEXT: vmovdqa32 %zmm7, %zmm0 {%k1}
+; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax)
+; AVX512-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm20, %zmm0 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: # zmm0 = mem ^ (zmm20 & (zmm0 ^ mem))
+; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm0 {%k1}
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
-; AVX512-FCP-NEXT: addq $1800, %rsp # imm = 0x708
+; AVX512-FCP-NEXT: addq $2120, %rsp # imm = 0x848
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
@@ -15629,128 +15683,130 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: subq $1592, %rsp # imm = 0x638
; AVX512DQ-NEXT: vmovdqa 480(%rdi), %ymm1
-; AVX512DQ-NEXT: vmovdqa 448(%rdi), %ymm2
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2],ymm2[3,4,5],ymm1[6],ymm2[7]
-; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm30
-; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm29
+; AVX512DQ-NEXT: vmovdqa 448(%rdi), %ymm15
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm1[2],ymm15[3,4,5],ymm1[6],ymm15[7]
+; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm23
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2
-; AVX512DQ-NEXT: vmovdqa 512(%rdi), %ymm12
-; AVX512DQ-NEXT: vmovdqa 544(%rdi), %ymm4
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm12[1],ymm4[2,3,4],ymm12[5],ymm4[6,7]
-; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm17
+; AVX512DQ-NEXT: vmovdqa 512(%rdi), %ymm13
+; AVX512DQ-NEXT: vmovdqa 544(%rdi), %ymm14
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0],ymm13[1],ymm14[2,3,4],ymm13[5],ymm14[6,7]
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,2,2,3]
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,6,7,12,13,2,3,16,17,30,31,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX512DQ-NEXT: vpshufb %ymm1, %ymm3, %ymm3
-; AVX512DQ-NEXT: vporq %ymm3, %ymm2, %ymm26
+; AVX512DQ-NEXT: vporq %ymm3, %ymm2, %ymm29
; AVX512DQ-NEXT: vpbroadcastw 700(%rdi), %xmm3
-; AVX512DQ-NEXT: vmovdqa 672(%rdi), %xmm14
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[0,1,0,3]
+; AVX512DQ-NEXT: vmovdqa 672(%rdi), %xmm11
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[0,1,0,3]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7]
; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm6
-; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm11
-; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm8
-; AVX512DQ-NEXT: vmovdqa 96(%rdi), %ymm7
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm8[1],ymm7[2,3,4],ymm8[5],ymm7[6,7]
+; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm5
+; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm9
+; AVX512DQ-NEXT: vmovdqa 96(%rdi), %ymm8
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0],ymm9[1],ymm8[2,3,4],ymm9[5],ymm8[6,7]
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
; AVX512DQ-NEXT: vpshufb %ymm1, %ymm3, %ymm1
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm11[2],ymm6[3,4,5],ymm11[6],ymm6[7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm5[2],ymm6[3,4,5],ymm5[6],ymm6[7]
+; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm7
; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm4
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7]
; AVX512DQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512DQ-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm9
-; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm13
-; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %ymm18
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm28 = ymm18[0,1,0,2]
+; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm2
+; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm4
+; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %ymm17
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm17[0,1,0,2]
+; AVX512DQ-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-NEXT: vpbroadcastw 252(%rdi), %xmm0
-; AVX512DQ-NEXT: vmovdqa 224(%rdi), %xmm15
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[0,1,0,3]
+; AVX512DQ-NEXT: vmovdqa 224(%rdi), %xmm1
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,1,0,3]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7]
; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm3[2],xmm0[2],xmm3[3],xmm0[3]
; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm0
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7]
-; AVX512DQ-NEXT: vmovdqa %ymm8, %ymm4
-; AVX512DQ-NEXT: vmovdqa64 %ymm7, %ymm16
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7]
+; AVX512DQ-NEXT: vmovdqa64 %ymm8, %ymm21
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6,7,8,9,10],ymm0[11],ymm3[12,13,14,15]
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,8,9,6,7,4,5,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX512DQ-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2],ymm11[3],ymm6[4,5],ymm11[6],ymm6[7]
-; AVX512DQ-NEXT: vmovdqa64 %ymm11, %ymm21
-; AVX512DQ-NEXT: vmovdqa64 %ymm6, %ymm19
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7]
+; AVX512DQ-NEXT: vmovdqa64 %ymm7, %ymm19
+; AVX512DQ-NEXT: vmovdqa64 %ymm6, %ymm16
; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm6
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3,4,5],xmm5[6],xmm6[7]
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpshufb %ymm6, %ymm5, %ymm5
; AVX512DQ-NEXT: vpor %ymm0, %ymm5, %ymm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm13[2],ymm9[3,4,5],ymm13[6],ymm9[7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm4[2],ymm2[3,4,5],ymm4[6],ymm2[7]
+; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm18
+; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm20
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm5
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4],xmm0[5],xmm5[6],xmm0[7]
; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15]
; AVX512DQ-NEXT: vpshufb %xmm5, %xmm0, %xmm0
; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm28[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm10[0,1,1,3,4,5,5,7]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1,2,3,4,5,6],ymm7[7]
-; AVX512DQ-NEXT: vmovdqa 240(%rdi), %xmm10
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0],xmm15[1],xmm10[2,3,4,5,6,7]
+; AVX512DQ-NEXT: vmovdqa 240(%rdi), %xmm0
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
+; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm28
+; AVX512DQ-NEXT: vmovdqa %xmm1, %xmm10
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,3,2,3]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,0,3,3,4,5,6,7]
; AVX512DQ-NEXT: vinserti32x4 $2, %xmm8, %zmm7, %zmm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqa 528(%rdi), %xmm7
-; AVX512DQ-NEXT: vmovdqa64 %ymm17, %ymm11
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7]
+; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm12
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1],ymm13[2],ymm14[3,4],ymm13[5],ymm14[6,7]
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3],ymm8[4,5,6,7,8,9,10],ymm7[11],ymm8[12,13,14,15]
; AVX512DQ-NEXT: vpshufb %ymm3, %ymm7, %ymm3
-; AVX512DQ-NEXT: vmovdqa64 %ymm30, %ymm0
-; AVX512DQ-NEXT: vmovdqa64 %ymm29, %ymm1
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7]
+; AVX512DQ-NEXT: vmovdqa64 %ymm23, %ymm4
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0,1,2],ymm4[3],ymm15[4,5],ymm4[6],ymm15[7]
+; AVX512DQ-NEXT: vmovdqa64 %ymm15, %ymm26
; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm8
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3,4,5],xmm7[6],xmm8[7]
; AVX512DQ-NEXT: vpshufb %ymm6, %ymm7, %ymm6
; AVX512DQ-NEXT: vpor %ymm3, %ymm6, %ymm3
; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vmovdqa 576(%rdi), %ymm2
-; AVX512DQ-NEXT: vmovdqa 608(%rdi), %ymm1
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1],ymm1[2],ymm2[3,4,5],ymm1[6],ymm2[7]
-; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm23
+; AVX512DQ-NEXT: vmovdqa 576(%rdi), %ymm14
+; AVX512DQ-NEXT: vmovdqa 608(%rdi), %ymm2
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1],ymm2[2],ymm14[3,4,5],ymm2[6],ymm14[7]
+; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm23
; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm6
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm6[4],xmm3[5],xmm6[6],xmm3[7]
; AVX512DQ-NEXT: vpshufb %xmm5, %xmm3, %xmm5
-; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %ymm22
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm20 = ymm22[0,1,0,2]
+; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %ymm25
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm22 = ymm25[0,1,0,2]
; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm20[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm22[0,1,1,3,4,5,5,7]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,5,6,8,9,10,11,12,13,13,14]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
; AVX512DQ-NEXT: vmovdqa 688(%rdi), %xmm0
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0],xmm14[1],xmm0[2,3,4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa64 %xmm14, %xmm17
-; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm25
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0],xmm11[1],xmm0[2,3,4,5,6,7]
+; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm24
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,3,2,3]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,0,3,3,4,5,6,7]
; AVX512DQ-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vmovdqa64 %ymm16, %ymm1
-; AVX512DQ-NEXT: vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1],ymm4[2,3],ymm1[4,5],ymm4[6,7]
-; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm14
-; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm2
+; AVX512DQ-NEXT: vmovdqu64 %ymm21, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1],ymm9[2,3],ymm2[4,5],ymm9[6,7]
+; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm15
+; AVX512DQ-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm6
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6,7,8,9,10,11],ymm6[12],ymm5[13,14,15]
-; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm4
+; AVX512DQ-NEXT: vmovdqa64 %ymm16, %ymm1
+; AVX512DQ-NEXT: vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm0
; AVX512DQ-NEXT: vmovdqu64 %ymm19, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm0
-; AVX512DQ-NEXT: vmovdqu64 %ymm21, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0],ymm4[1],ymm0[2,3],ymm4[4],ymm0[5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7]
; AVX512DQ-NEXT: vextracti128 $1, %ymm6, %xmm7
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3,4,5],xmm7[6],xmm6[7]
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,10,11,8,9,6,7,20,21,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
@@ -15759,65 +15815,64 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vpshufb %ymm5, %ymm6, %ymm6
; AVX512DQ-NEXT: vpor %ymm6, %ymm8, %ymm3
; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2],ymm13[3],ymm9[4,5],ymm13[6],ymm9[7]
-; AVX512DQ-NEXT: vmovdqa64 %ymm9, %ymm19
-; AVX512DQ-NEXT: vmovdqa64 %ymm13, %ymm16
+; AVX512DQ-NEXT: vmovdqa64 %ymm18, %ymm3
+; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm6
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm3[3],ymm6[4,5],ymm3[6],ymm6[7]
; AVX512DQ-NEXT: vextracti128 $1, %ymm6, %xmm8
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm6[1],xmm8[2,3,4,5],xmm6[6],xmm8[7]
; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15]
; AVX512DQ-NEXT: vpshufb %xmm6, %xmm8, %xmm8
; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm18[0,1,1,2]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm17[0,1,1,2]
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,1,0,3,4,5,4,7]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm15[0],xmm10[0],xmm15[1],xmm10[1],xmm15[2],xmm10[2],xmm15[3],xmm10[3]
+; AVX512DQ-NEXT: vmovdqa64 %xmm28, %xmm3
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3]
; AVX512DQ-NEXT: vmovdqa64 %xmm10, %xmm21
-; AVX512DQ-NEXT: vmovdqa64 %xmm15, %xmm24
; AVX512DQ-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[2,1,2,3]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,3,2,3,4,5,6,7]
; AVX512DQ-NEXT: vinserti32x4 $2, %xmm9, %zmm8, %zmm3
; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm12[2,3],ymm11[4,5],ymm12[6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1],ymm13[2,3],ymm12[4,5],ymm13[6,7]
+; AVX512DQ-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm9
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4],ymm8[5,6,7,8,9,10,11],ymm9[12],ymm8[13,14,15]
; AVX512DQ-NEXT: vpshufb %ymm7, %ymm8, %ymm7
-; AVX512DQ-NEXT: vmovdqa64 %ymm30, %ymm13
-; AVX512DQ-NEXT: vmovdqu64 %ymm30, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vmovdqa64 %ymm29, %ymm10
-; AVX512DQ-NEXT: vmovdqu64 %ymm29, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0],ymm13[1],ymm10[2,3],ymm13[4],ymm10[5,6,7]
+; AVX512DQ-NEXT: vmovdqa64 %ymm26, %ymm10
+; AVX512DQ-NEXT: vmovdqu64 %ymm26, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0],ymm10[1],ymm4[2,3],ymm10[4],ymm4[5,6,7]
+; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm9
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1],xmm8[2,3,4,5],xmm9[6],xmm8[7]
; AVX512DQ-NEXT: vpshufb %ymm5, %ymm8, %ymm5
; AVX512DQ-NEXT: vpor %ymm7, %ymm5, %ymm3
; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vmovdqa64 %ymm23, %ymm15
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2],ymm15[3],ymm2[4,5],ymm15[6],ymm2[7]
-; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm9
+; AVX512DQ-NEXT: vmovdqa64 %ymm23, %ymm9
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2],ymm9[3],ymm14[4,5],ymm9[6],ymm14[7]
+; AVX512DQ-NEXT: vmovdqa64 %ymm14, %ymm16
; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm7
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1],xmm7[2,3,4,5],xmm5[6],xmm7[7]
; AVX512DQ-NEXT: vpshufb %xmm6, %xmm5, %xmm5
; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm22[0,1,1,2]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm25[0,1,1,2]
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,1,0,3,4,5,4,7]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
-; AVX512DQ-NEXT: vmovdqa64 %xmm17, %xmm6
-; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm2
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
+; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm14
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3]
+; AVX512DQ-NEXT: vmovdqa64 %xmm11, %xmm19
; AVX512DQ-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[2,1,2,3]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7]
; AVX512DQ-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm3
; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm6
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2],ymm14[3],ymm1[4,5],ymm14[6],ymm1[7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2],ymm15[3],ymm2[4,5],ymm15[6],ymm2[7]
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm5[1,3,2,3]
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX512DQ-NEXT: vpshufb %ymm5, %ymm6, %ymm8
@@ -15825,49 +15880,52 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vpshufb %ymm6, %ymm7, %ymm7
; AVX512DQ-NEXT: vpor %ymm7, %ymm8, %ymm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0],ymm13[1],ymm10[2,3,4],ymm13[5],ymm10[6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0],ymm10[1],ymm4[2,3,4],ymm10[5],ymm4[6,7]
; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm8
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2],xmm7[3],xmm8[4,5,6,7]
; AVX512DQ-NEXT: vpshufb %ymm6, %ymm7, %ymm6
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7]
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm7[1,3,2,3]
; AVX512DQ-NEXT: vpshufb %ymm5, %ymm7, %ymm5
; AVX512DQ-NEXT: vpor %ymm5, %ymm6, %ymm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vmovdqa64 %ymm16, %ymm4
-; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm0
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0],ymm0[1],ymm4[2,3,4],ymm0[5],ymm4[6,7]
+; AVX512DQ-NEXT: vmovdqa64 %ymm18, %ymm11
+; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm0
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0],ymm0[1],ymm11[2,3,4],ymm0[5],ymm11[6,7]
; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm6
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm16 = ymm18[0,1,1,3]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm17 = ymm17[0,1,1,3]
; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15]
; AVX512DQ-NEXT: vpshufb %xmm1, %xmm5, %xmm5
; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm16[0,1,2,1,4,5,6,5]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm17[0,1,2,1,4,5,6,5]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
; AVX512DQ-NEXT: vpbroadcastw 232(%rdi), %xmm6
-; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm8
-; AVX512DQ-NEXT: vpsrlq $48, %xmm21, %xmm7
+; AVX512DQ-NEXT: vmovdqa64 %xmm28, %xmm2
+; AVX512DQ-NEXT: vpsrlq $48, %xmm28, %xmm7
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; AVX512DQ-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm3
; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0],ymm9[1],ymm15[2,3,4],ymm9[5],ymm15[6,7]
+; AVX512DQ-NEXT: vmovdqa64 %ymm23, %ymm4
+; AVX512DQ-NEXT: vmovdqa64 %ymm16, %ymm9
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0],ymm9[1],ymm4[2,3,4],ymm9[5],ymm4[6,7]
; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm6
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7]
; AVX512DQ-NEXT: vpshufb %xmm1, %xmm5, %xmm1
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm29 = ymm22[0,1,1,3]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm26 = ymm25[0,1,1,3]
; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm29[0,1,2,1,4,5,6,5]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm26[0,1,2,1,4,5,6,5]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7]
; AVX512DQ-NEXT: vpbroadcastw 680(%rdi), %xmm3
-; AVX512DQ-NEXT: vpsrlq $48, %xmm25, %xmm5
+; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm10
+; AVX512DQ-NEXT: vpsrlq $48, %xmm24, %xmm5
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
; AVX512DQ-NEXT: vinserti32x4 $2, %xmm3, %zmm1, %zmm1
; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm0[2],ymm4[3,4],ymm0[5],ymm4[6,7]
-; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm12
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm0[2],ymm11[3,4],ymm0[5],ymm11[6,7]
+; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm12
; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm3
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3],xmm1[4],xmm3[5,6,7]
; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,8,9,6,7,4,5,2,3,2,3,2,3,2,3]
@@ -15880,14 +15938,13 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,7,6]
; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm7[5,6,7],ymm3[8,9,10,11,12],ymm7[13,14,15]
-; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm0
-; AVX512DQ-NEXT: vpsrld $16, %xmm24, %xmm7
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
-; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm13
-; AVX512DQ-NEXT: vinserti32x4 $2, %xmm7, %zmm3, %zmm3
-; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm11
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm9[2],ymm15[3,4],ymm9[5],ymm15[6,7]
+; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm0
+; AVX512DQ-NEXT: vpsrld $16, %xmm21, %xmm7
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7]
+; AVX512DQ-NEXT: vmovdqa64 %xmm28, %xmm13
+; AVX512DQ-NEXT: vinserti32x4 $2, %xmm7, %zmm3, %zmm2
+; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm9[2],ymm4[3,4],ymm9[5],ymm4[6,7]
; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm7
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1],xmm3[2],xmm7[3],xmm3[4],xmm7[5,6,7]
; AVX512DQ-NEXT: vpshufb %xmm1, %xmm3, %xmm1
@@ -15899,12 +15956,12 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,7,6]
; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5,6,7],ymm7[8,9,10,11,12],ymm8[13,14,15]
-; AVX512DQ-NEXT: vmovdqa64 %xmm17, %xmm9
-; AVX512DQ-NEXT: vpsrld $16, %xmm17, %xmm8
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7]
+; AVX512DQ-NEXT: vmovdqa64 %xmm19, %xmm2
+; AVX512DQ-NEXT: vpsrld $16, %xmm19, %xmm8
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7]
; AVX512DQ-NEXT: vinserti32x4 $2, %xmm8, %zmm7, %zmm7
; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0,1],ymm12[2,3],ymm4[4,5],ymm12[6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1],ymm12[2,3],ymm11[4,5],ymm12[6,7]
; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm8
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3],xmm8[4],xmm7[5],xmm8[6,7]
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm5[0],xmm6[1],xmm5[2,3,4,5,6,7]
@@ -15920,23 +15977,23 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7]
; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm6, %zmm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2],ymm11[3],ymm15[4,5],ymm11[6],ymm15[7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm9[3],ymm4[4,5],ymm9[6],ymm4[7]
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm6
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7]
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,7,7]
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm20[0,1,2,1,4,5,6,5]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm22[0,1,2,1,4,5,6,5]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7]
; AVX512DQ-NEXT: movw $992, %ax # imm = 0x3E0
; AVX512DQ-NEXT: kmovw %eax, %k1
-; AVX512DQ-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm26 {%k1} # 16-byte Folded Reload
-; AVX512DQ-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7]
-; AVX512DQ-NEXT: vmovdqa64 %ymm15, %ymm18
-; AVX512DQ-NEXT: vmovdqa64 %ymm11, %ymm25
+; AVX512DQ-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm29 {%k1} # 16-byte Folded Reload
+; AVX512DQ-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm9[2,3],ymm4[4,5],ymm9[6,7]
+; AVX512DQ-NEXT: vmovdqa64 %ymm23, %ymm19
+; AVX512DQ-NEXT: vmovdqa64 %ymm16, %ymm27
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm6
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3],xmm6[4],xmm0[5],xmm6[6,7]
; AVX512DQ-NEXT: vpshufb %xmm5, %xmm0, %xmm0
@@ -15946,430 +16003,428 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7]
; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7]
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7]
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7]
; AVX512DQ-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vmovdqa 704(%rdi), %ymm7
+; AVX512DQ-NEXT: vmovdqa 704(%rdi), %ymm8
; AVX512DQ-NEXT: vmovdqa 736(%rdi), %ymm6
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm7[2,3],ymm6[4,5],ymm7[6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm8[2,3],ymm6[4,5],ymm8[6,7]
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4],xmm0[5],xmm1[6,7]
; AVX512DQ-NEXT: vmovdqa 800(%rdi), %ymm3
; AVX512DQ-NEXT: vmovdqa 768(%rdi), %ymm5
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2],ymm3[3],ymm5[4,5],ymm3[6],ymm5[7]
-; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm24
+; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm16
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7,8,9,10,11,12,13],ymm2[14],ymm1[15]
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,1]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,14,15,12,13,10,11,4,5,6,7,8,9,2,3,16,17,30,31,28,29,26,27,20,21,22,23,24,25,18,19]
-; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm2
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,14,15,12,13,10,11,4,5,6,7,8,9,2,3,16,17,30,31,28,29,26,27,20,21,22,23,24,25,18,19]
+; AVX512DQ-NEXT: vpshufb %ymm4, %ymm2, %ymm2
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4,5,6],xmm2[7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa 832(%rdi), %ymm9
-; AVX512DQ-NEXT: vmovdqa 864(%rdi), %ymm8
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512DQ-NEXT: vmovdqa 832(%rdi), %ymm15
+; AVX512DQ-NEXT: vmovdqa 864(%rdi), %ymm9
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1],ymm15[2],ymm9[3,4],ymm15[5],ymm9[6,7]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm10 = [8,9,10,11,8,9,4,5,8,9,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vpshufb %xmm10, %xmm2, %xmm3
+; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm2
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm22
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm12[3],ymm4[4,5],ymm12[6],ymm4[7]
-; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm20
-; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm21
+; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7]
+; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm24
+; AVX512DQ-NEXT: vmovdqa64 %ymm18, %ymm25
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,7]
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7]
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm28[0,1,2,1,4,5,6,5]
+; AVX512DQ-NEXT: vpshufd $100, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
+; AVX512DQ-NEXT: # ymm2 = mem[0,1,2,1,4,5,6,5]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,5,8,9,10,11,12,13,14,13]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
-; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 {%k1} # 16-byte Folded Reload
-; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vmovdqa 256(%rdi), %ymm11
-; AVX512DQ-NEXT: vmovdqa 288(%rdi), %ymm15
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7]
+; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k1} # 16-byte Folded Reload
+; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vmovdqa 256(%rdi), %ymm13
+; AVX512DQ-NEXT: vmovdqa 288(%rdi), %ymm11
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm13[2,3],ymm11[4,5],ymm13[6,7]
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3],xmm2[4],xmm0[5],xmm2[6,7]
-; AVX512DQ-NEXT: vmovdqa 352(%rdi), %ymm14
+; AVX512DQ-NEXT: vmovdqa 352(%rdi), %ymm7
; AVX512DQ-NEXT: vmovdqa 320(%rdi), %ymm3
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2],ymm14[3],ymm3[4,5],ymm14[6],ymm3[7]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm12[6],ymm10[7,8,9,10,11,12,13],ymm12[14],ymm10[15]
-; AVX512DQ-NEXT: vpshufb %ymm1, %ymm10, %ymm1
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm7[3],ymm3[4,5],ymm7[6],ymm3[7]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm0[2,3,0,1]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6],ymm0[7,8,9,10,11,12,13],ymm12[14],ymm0[15]
+; AVX512DQ-NEXT: vpshufb %ymm4, %ymm0, %ymm0
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,1]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3,4,5,6],xmm1[7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa 384(%rdi), %ymm1
-; AVX512DQ-NEXT: vmovdqa 416(%rdi), %ymm2
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm12, %xmm13
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,3,1,4,5,6,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,0,2,4,5,6,7]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3,4,5],ymm12[6,7]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2],xmm2[3,4,5,6],xmm0[7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-NEXT: vmovdqa 384(%rdi), %ymm0
+; AVX512DQ-NEXT: vmovdqa 416(%rdi), %ymm14
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1],ymm0[2],ymm14[3,4],ymm0[5],ymm14[6,7]
+; AVX512DQ-NEXT: vpshufb %xmm10, %xmm12, %xmm10
+; AVX512DQ-NEXT: vextracti128 $1, %ymm12, %xmm12
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,3,1,4,5,6,7]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm10[6,7]
; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2],ymm11[3],ymm15[4,5],ymm11[6],ymm15[7]
-; AVX512DQ-NEXT: vmovdqa64 %ymm11, %ymm19
-; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm12
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2,3],xmm10[4],xmm12[5],xmm10[6],xmm12[7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0],ymm3[1],ymm14[2,3],ymm3[4],ymm14[5,6,7]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm13 = ymm12[2,3,0,1]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4,5,6],ymm13[7,8],ymm12[9,10,11,12,13,14],ymm13[15]
-; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} xmm13 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13]
-; AVX512DQ-NEXT: vpshufb %xmm13, %xmm10, %xmm10
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [2,3,0,1,14,15,12,13,4,5,4,5,4,5,4,5,18,19,16,17,30,31,28,29,20,21,20,21,20,21,20,21]
-; AVX512DQ-NEXT: vpshufb %ymm0, %ymm12, %ymm12
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3,4,5,6],xmm12[7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7]
-; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm23
-; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm4
-; AVX512DQ-NEXT: vextracti32x4 $1, %ymm12, %xmm28
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm11 = xmm28[2,1,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,0,2,4,5,6,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,1,3,4,5,6,7]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm11[6,7]
-; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm6[0,1,2],ymm7[3],ymm6[4,5],ymm7[6],ymm6[7]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm11
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3],xmm10[4],xmm11[5],xmm10[6],xmm11[7]
-; AVX512DQ-NEXT: vpshufb %xmm13, %xmm10, %xmm10
-; AVX512DQ-NEXT: vmovdqa64 %ymm24, %ymm1
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0],ymm5[1],ymm1[2,3],ymm5[4],ymm1[5,6,7]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1,2,3,4,5,6],ymm12[7,8],ymm11[9,10,11,12,13,14],ymm12[15]
-; AVX512DQ-NEXT: vpshufb %ymm0, %ymm11, %ymm0
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0,1,2],xmm10[3,4,5,6],xmm0[7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm11
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,1,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,0,2,4,5,6,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,1,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2],ymm13[3],ymm11[4,5],ymm13[6],ymm11[7]
+; AVX512DQ-NEXT: vmovdqa64 %ymm13, %ymm29
+; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm10
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0,1,2,3],xmm4[4],xmm10[5],xmm4[6],xmm10[7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0],ymm3[1],ymm7[2,3],ymm3[4],ymm7[5,6,7]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0],ymm10[1,2,3,4,5,6],ymm12[7,8],ymm10[9,10,11,12,13,14],ymm12[15]
+; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} xmm12 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13]
+; AVX512DQ-NEXT: vpshufb %xmm12, %xmm4, %xmm4
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,0,1,14,15,12,13,4,5,4,5,4,5,4,5,18,19,16,17,30,31,28,29,20,21,20,21,20,21,20,21]
+; AVX512DQ-NEXT: vpshufb %ymm1, %ymm10, %ymm10
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0,1,2],xmm4[3,4,5,6],xmm10[7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1],ymm0[2,3],ymm14[4,5],ymm0[6,7]
+; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm20
+; AVX512DQ-NEXT: vextracti32x4 $1, %ymm10, %xmm22
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,10,11,10,11,6,7,8,9,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vpshufb %xmm0, %xmm10, %xmm10
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm13 = xmm22[2,1,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,0,2,4,5,6,7]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3]
; AVX512DQ-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm10[6,7]
+; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2],ymm8[3],ymm6[4,5],ymm8[6],ymm6[7]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm10
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm10[0,1,2,3],xmm4[4],xmm10[5],xmm4[6],xmm10[7]
+; AVX512DQ-NEXT: vpshufb %xmm12, %xmm4, %xmm4
+; AVX512DQ-NEXT: vmovdqa64 %ymm16, %ymm13
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0],ymm5[1],ymm13[2,3],ymm5[4],ymm13[5,6,7]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0],ymm10[1,2,3,4,5,6],ymm12[7,8],ymm10[9,10,11,12,13,14],ymm12[15]
+; AVX512DQ-NEXT: vpshufb %ymm1, %ymm10, %ymm1
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1,2],xmm4[3,4,5,6],xmm1[7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1],ymm15[2,3],ymm9[4,5],ymm15[6,7]
+; AVX512DQ-NEXT: vpshufb %xmm0, %xmm4, %xmm0
+; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm4
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,2,4,5,6,7]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0],ymm3[1],ymm14[2,3,4],ymm3[5],ymm14[6,7]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,u,u,0,1,14,15,8,9,10,11,4,5,6,7,20,21,u,u,16,17,30,31,24,25,26,27,20,21,22,23]
-; AVX512DQ-NEXT: vpshufb %ymm11, %ymm0, %ymm10
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0],ymm3[1],ymm7[2,3,4],ymm3[5],ymm7[6,7]
+; AVX512DQ-NEXT: vmovdqa64 %ymm7, %ymm21
+; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm16
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,u,u,0,1,14,15,8,9,10,11,4,5,6,7,20,21,u,u,16,17,30,31,24,25,26,27,20,21,22,23]
+; AVX512DQ-NEXT: vpshufb %ymm1, %ymm0, %ymm4
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm0[1],ymm10[2,3,4,5,6,7,8],ymm0[9],ymm10[10,11,12,13,14,15]
-; AVX512DQ-NEXT: vmovdqa %ymm15, %ymm1
-; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm13
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1],ymm15[2],ymm13[3,4,5],ymm15[6],ymm13[7]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm12
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm12[4],xmm10[5],xmm12[6],xmm10[7]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2,3,4,5,6,7,8],ymm0[9],ymm4[10,11,12,13,14,15]
+; AVX512DQ-NEXT: vmovdqa64 %ymm29, %ymm2
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1],ymm11[2],ymm2[3,4,5],ymm11[6],ymm2[7]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm10
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm10[4],xmm4[5],xmm10[6],xmm4[7]
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpshufb %ymm12, %ymm10, %ymm10
-; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm19 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = ymm10 | (ymm0 & ~ymm19)
-; AVX512DQ-NEXT: vmovdqa64 %ymm23, %ymm2
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7]
-; AVX512DQ-NEXT: vextracti32x4 $1, %ymm0, %xmm28
+; AVX512DQ-NEXT: vpshufb %ymm12, %ymm4, %ymm4
+; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm22 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 | (ymm0 & ~ymm22)
+; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm7
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm7[3],ymm14[4,5],ymm7[6],ymm14[7]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm10
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm15 = xmm28[0,1,2,1]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,7]
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,2,1]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,5,7]
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512DQ-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm10 # 64-byte Folded Reload
-; AVX512DQ-NEXT: # zmm10 = zmm10 ^ (zmm17 & (zmm10 ^ mem))
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm3 # 64-byte Folded Reload
+; AVX512DQ-NEXT: # zmm3 = zmm3 ^ (zmm18 & (zmm3 ^ mem))
; AVX512DQ-NEXT: movw $-512, %ax # imm = 0xFE00
; AVX512DQ-NEXT: kmovw %eax, %k1
-; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm10 {%k1}
-; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vmovdqa64 %ymm24, %ymm10
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm5[1],ymm10[2,3,4],ymm5[5],ymm10[6,7]
-; AVX512DQ-NEXT: vpshufb %ymm11, %ymm0, %ymm11
+; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm3 {%k1}
+; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vmovdqa %ymm13, %ymm10
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm5[1],ymm13[2,3,4],ymm5[5],ymm13[6,7]
+; AVX512DQ-NEXT: vpshufb %ymm1, %ymm0, %ymm1
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[2,3,4,5,6,7,8],ymm0[9],ymm11[10,11,12,13,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0,1],ymm6[2],ymm7[3,4,5],ymm6[6],ymm7[7]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm11, %xmm15
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm15[4],xmm11[5],xmm15[6],xmm11[7]
-; AVX512DQ-NEXT: vpshufb %ymm12, %ymm11, %ymm11
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm0 & ~ymm19)
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm9[3],ymm8[4,5],ymm9[6],ymm8[7]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm12
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4,5,6,7,8],ymm0[9],ymm1[10,11,12,13,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm6[2],ymm8[3,4,5],ymm6[6],ymm8[7]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm4
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5],xmm4[6],xmm1[7]
+; AVX512DQ-NEXT: vpshufb %ymm12, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm0 & ~ymm22)
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm15[3],ymm9[4,5],ymm15[6],ymm9[7]
+; AVX512DQ-NEXT: vmovdqa64 %ymm15, %ymm23
+; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm4
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,1]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,7]
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,7]
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512DQ-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm11 # 64-byte Folded Reload
-; AVX512DQ-NEXT: # zmm11 = zmm11 ^ (zmm17 & (zmm11 ^ mem))
-; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm11 {%k1}
-; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm0
-; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm11
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[2,3],ymm0[4],ymm11[5,6,7]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm11
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm11[1],xmm0[2,3,4,5],xmm11[6],xmm0[7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm1 # 64-byte Folded Reload
+; AVX512DQ-NEXT: # zmm1 = zmm1 ^ (zmm18 & (zmm1 ^ mem))
+; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm1 {%k1}
+; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vmovdqa64 %ymm24, %ymm0
+; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm1
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7]
; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm11 = ymm16[0,1,2,0,4,5,6,4]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm11[7]
-; AVX512DQ-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
-; AVX512DQ-NEXT: # xmm11 = mem[0,1,2,3,6,5,6,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,2,2]
-; AVX512DQ-NEXT: vinserti32x4 $2, %xmm11, %zmm0, %zmm27
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm1[3],ymm13[4,5],ymm1[6],ymm13[7]
-; AVX512DQ-NEXT: vmovdqa64 %ymm13, %ymm19
-; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm11
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3,4,5],xmm0[6],xmm11[7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm4[2],ymm2[3,4,5],ymm4[6],ymm2[7]
-; AVX512DQ-NEXT: vmovdqa64 %ymm23, %ymm26
-; AVX512DQ-NEXT: vextracti128 $1, %ymm11, %xmm12
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2,3],xmm12[4],xmm11[5],xmm12[6],xmm11[7]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,2,3,0,1,14,15,12,13,10,11,128,128]
-; AVX512DQ-NEXT: vpshufb %xmm12, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1],ymm3[2],ymm14[3,4],ymm3[5],ymm14[6,7]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm13 = ymm13[1,1,2,0]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128]
-; AVX512DQ-NEXT: vpshufb %ymm15, %ymm13, %ymm13
-; AVX512DQ-NEXT: vpor %ymm0, %ymm13, %ymm0
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,0,0,0,4,5,6,7]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,7,6,5,4]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0,1,2],ymm11[3,4,5,6,7],ymm13[8,9,10],ymm11[11,12,13,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7]
-; AVX512DQ-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm27 # 64-byte Folded Reload
-; AVX512DQ-NEXT: # zmm27 = zmm27 ^ (zmm17 & (zmm27 ^ mem))
-; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm27 {%k1}
-; AVX512DQ-NEXT: vmovdqa64 %ymm18, %ymm0
-; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm2
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6,7]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm11
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm11[1],xmm0[2,3,4,5],xmm11[6],xmm0[7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm17[0,1,2,0,4,5,6,4]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; AVX512DQ-NEXT: # xmm1 = mem[0,1,2,3,6,5,6,7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
+; AVX512DQ-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm24
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm11[3],ymm2[4,5],ymm11[6],ymm2[7]
+; AVX512DQ-NEXT: vmovdqa64 %ymm11, %ymm20
+; AVX512DQ-NEXT: vmovdqa64 %ymm29, %ymm17
+; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm14[2],ymm7[3,4,5],ymm14[6],ymm7[7]
+; AVX512DQ-NEXT: vmovdqa %ymm7, %ymm3
+; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm4
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4],xmm1[5],xmm4[6],xmm1[7]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,2,3,0,1,14,15,12,13,10,11,128,128]
+; AVX512DQ-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX512DQ-NEXT: vmovdqa64 %ymm16, %ymm2
+; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm7
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm7[0,1],ymm2[2],ymm7[3,4],ymm2[5],ymm7[6,7]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm12[1,1,2,0]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128]
+; AVX512DQ-NEXT: vpshufb %ymm13, %ymm12, %ymm12
+; AVX512DQ-NEXT: vpor %ymm0, %ymm12, %ymm0
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0,1,2],ymm1[3,4,5,6,7],ymm12[8,9,10],ymm1[11,12,13,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm24 # 64-byte Folded Reload
+; AVX512DQ-NEXT: # zmm24 = zmm24 ^ (zmm18 & (zmm24 ^ mem))
+; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm24 {%k1}
+; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm0
+; AVX512DQ-NEXT: vmovdqa64 %ymm27, %ymm1
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,0,4,5,6,7]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,6,7]
; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm11 = ymm29[0,1,2,0,4,5,6,4]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm11[7]
-; AVX512DQ-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
-; AVX512DQ-NEXT: # xmm11 = mem[0,1,2,3,6,5,6,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[2,2,2,2]
-; AVX512DQ-NEXT: vinserti32x4 $2, %xmm11, %zmm0, %zmm29
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7]
-; AVX512DQ-NEXT: vmovdqa64 %ymm6, %ymm24
-; AVX512DQ-NEXT: vmovdqa64 %ymm7, %ymm25
-; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm11
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3,4,5],xmm0[6],xmm11[7]
-; AVX512DQ-NEXT: vpshufb %xmm12, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm10[0,1],ymm5[2],ymm10[3,4],ymm5[5],ymm10[6,7]
-; AVX512DQ-NEXT: vmovdqa64 %ymm10, %ymm23
-; AVX512DQ-NEXT: vmovdqa64 %ymm5, %ymm21
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm11[1,1,2,0]
-; AVX512DQ-NEXT: vpshufb %ymm15, %ymm11, %ymm11
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm9[0,1],ymm8[2],ymm9[3,4,5],ymm8[6],ymm9[7]
-; AVX512DQ-NEXT: vmovdqa %ymm8, %ymm6
-; AVX512DQ-NEXT: vmovdqa64 %ymm9, %ymm20
-; AVX512DQ-NEXT: vextracti128 $1, %ymm12, %xmm13
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3],xmm13[4],xmm12[5],xmm13[6],xmm12[7]
-; AVX512DQ-NEXT: vpor %ymm0, %ymm11, %ymm0
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,0,0,0,4,5,6,7]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,7,6,5,4]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3,4,5,6,7],ymm11[8,9,10],ymm12[11,12,13,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7]
-; AVX512DQ-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm29 # 64-byte Folded Reload
-; AVX512DQ-NEXT: # zmm29 = zmm29 ^ (zmm17 & (zmm29 ^ mem))
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm26[0,1,2,0,4,5,6,4]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,4,8,9,10,11,12,13,15,12]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; AVX512DQ-NEXT: # xmm1 = mem[0,1,2,3,6,5,6,7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
+; AVX512DQ-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm29
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm6[3],ymm8[4,5],ymm6[6],ymm8[7]
+; AVX512DQ-NEXT: vmovdqa64 %ymm6, %ymm26
+; AVX512DQ-NEXT: vmovdqa64 %ymm8, %ymm22
+; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7]
+; AVX512DQ-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm15
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm5[2],ymm10[3,4],ymm5[5],ymm10[6,7]
+; AVX512DQ-NEXT: vmovdqa %ymm10, %ymm8
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,2,0]
+; AVX512DQ-NEXT: vpshufb %ymm13, %ymm1, %ymm1
+; AVX512DQ-NEXT: vmovdqa64 %ymm23, %ymm13
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1],ymm9[2],ymm13[3,4,5],ymm9[6],ymm13[7]
+; AVX512DQ-NEXT: vmovdqa64 %ymm9, %ymm23
+; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm11
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm11[4],xmm4[5],xmm11[6],xmm4[7]
+; AVX512DQ-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3,4,5,6,7],ymm1[8,9,10],ymm4[11,12,13,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm29 # 64-byte Folded Reload
+; AVX512DQ-NEXT: # zmm29 = zmm29 ^ (zmm18 & (zmm29 ^ mem))
; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm29 {%k1}
; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQ-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX512DQ-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4,5],mem[6],ymm0[7]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [8,9,8,9,8,9,8,9,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27]
-; AVX512DQ-NEXT: vpshufb %ymm11, %ymm0, %ymm12
+; AVX512DQ-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512DQ-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4,5],ymm0[6],mem[7]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [8,9,8,9,8,9,8,9,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27]
+; AVX512DQ-NEXT: vpshufb %ymm6, %ymm0, %ymm1
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6],ymm12[7,8,9,10,11,12,13],ymm0[14],ymm12[15]
-; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1],ymm2[2],ymm5[3,4],ymm2[5],ymm5[6,7]
-; AVX512DQ-NEXT: vmovdqa64 %ymm5, %ymm31
-; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm30
-; AVX512DQ-NEXT: vextracti128 $1, %ymm12, %xmm13
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[3,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,2,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1],ymm0[2,3,4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa64 %ymm26, %ymm8
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm4[3],ymm8[4,5],ymm4[6],ymm8[7]
-; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm18
-; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm12
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0],xmm0[1],xmm12[2,3,4,5],xmm0[6],xmm12[7]
-; AVX512DQ-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1],ymm3[2,3],ymm14[4,5],ymm3[6,7]
-; AVX512DQ-NEXT: vmovdqa %ymm3, %ymm13
-; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm14[0,1,0,1]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1,2],ymm15[3],ymm12[4,5,6,7,8,9,10],ymm15[11],ymm12[12,13,14,15]
-; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm5
-; AVX512DQ-NEXT: vmovdqa %ymm1, %ymm4
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0],ymm5[1],ymm1[2,3],ymm5[4],ymm1[5,6,7]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm15, %xmm10
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = xmm15[0],xmm10[1],xmm15[2,3,4,5],xmm10[6],xmm15[7]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11]
-; AVX512DQ-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7,8,9,10,11,12,13],ymm0[14],ymm1[15]
+; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload
+; AVX512DQ-NEXT: vmovdqa64 %ymm16, %ymm1
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2],ymm4[3,4],ymm1[5],ymm4[6,7]
+; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm28
+; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm4
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm14[3],ymm3[4,5],ymm14[6],ymm3[7]
+; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm25
+; AVX512DQ-NEXT: vmovdqa64 %ymm14, %ymm27
+; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm2[2,3],ymm7[4,5],ymm2[6,7]
+; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm11
+; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm21[0,1,0,1]
+; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm31
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3],ymm1[4,5,6,7,8,9,10],ymm4[11],ymm1[12,13,14,15]
+; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm5
+; AVX512DQ-NEXT: vmovdqa64 %ymm17, %ymm3
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5,6,7]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm12
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm12[1],xmm4[2,3,4,5],xmm12[6],xmm4[7]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm12 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11]
+; AVX512DQ-NEXT: vpshufb %xmm12, %xmm0, %xmm0
; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,10,11,24,25,22,23,20,21,128,128,128,128,128,128,128,128,128,128]
-; AVX512DQ-NEXT: vpshufb %ymm1, %ymm12, %ymm12
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3,4,5,6,7],ymm12[8,9,10],ymm0[11,12,13,14,15]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,10,11,24,25,22,23,20,21,128,128,128,128,128,128,128,128,128,128]
+; AVX512DQ-NEXT: vpshufb %ymm10, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,2,3,0,1,14,15,12,13,128,128]
-; AVX512DQ-NEXT: vpshufb %xmm2, %xmm10, %xmm10
-; AVX512DQ-NEXT: vpor %ymm12, %ymm10, %ymm10
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm22, %zmm0, %zmm28
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm14
-; AVX512DQ-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm7 # 64-byte Folded Reload
-; AVX512DQ-NEXT: # zmm7 = mem ^ (zmm17 & (zmm7 ^ mem))
-; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm7 {%k1}
-; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Reload
-; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Reload
-; AVX512DQ-NEXT: vmovdqa64 %ymm22, %ymm0
-; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm7
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2],ymm7[3,4,5],ymm0[6],ymm7[7]
-; AVX512DQ-NEXT: vpshufb %ymm11, %ymm0, %ymm10
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6],ymm10[7,8,9,10,11,12,13],ymm0[14],ymm10[15]
-; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Reload
-; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload
-; AVX512DQ-NEXT: vmovdqa64 %ymm17, %ymm7
-; AVX512DQ-NEXT: vmovdqa64 %ymm16, %ymm9
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1],ymm9[2],ymm7[3,4],ymm9[5],ymm7[6,7]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm11
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[2,1,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,2,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,3,4,5,6,7]
-; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (zmm14 & (zmm12 ^ zmm0))
-; AVX512DQ-NEXT: vmovdqa %ymm6, %ymm9
-; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm7
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm10
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0],xmm0[1],xmm10[2,3,4,5],xmm0[6],xmm10[7]
-; AVX512DQ-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512DQ-NEXT: vmovdqa64 %ymm23, %ymm14
-; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm3
-; AVX512DQ-NEXT: vmovdqu64 %ymm21, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1],ymm3[2,3],ymm14[4,5],ymm3[6,7]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm23[0,1,0,1]
-; AVX512DQ-NEXT: vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3],ymm10[4,5,6,7,8,9,10],ymm11[11],ymm10[12,13,14,15]
-; AVX512DQ-NEXT: vpshufb %ymm1, %ymm10, %ymm1
-; AVX512DQ-NEXT: vmovdqa64 %ymm24, %ymm6
-; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm15
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm6[0],ymm15[1],ymm6[2,3],ymm15[4],ymm6[5,6,7]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm11
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3,4,5],xmm11[6],xmm10[7]
-; AVX512DQ-NEXT: vpshufb %xmm2, %xmm10, %xmm2
+; AVX512DQ-NEXT: vpshufb %xmm2, %xmm4, %xmm4
+; AVX512DQ-NEXT: vpor %ymm1, %ymm4, %ymm1
+; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm30 # 32-byte Folded Reload
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm9 # 64-byte Folded Reload
+; AVX512DQ-NEXT: # zmm9 = mem ^ (zmm18 & (zmm9 ^ mem))
+; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm9 {%k1}
+; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Reload
+; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm0
+; AVX512DQ-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512DQ-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4,5],ymm0[6],mem[7]
+; AVX512DQ-NEXT: vpshufb %ymm6, %ymm0, %ymm1
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7,8,9,10,11,12,13],ymm0[14],ymm1[15]
+; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Reload
+; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Reload
+; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm1
+; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm4
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm4
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 ^ (zmm18 & (zmm17 ^ zmm0))
+; AVX512DQ-NEXT: vmovdqa64 %ymm23, %ymm7
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm7[3],ymm13[4,5],ymm7[6],ymm13[7]
+; AVX512DQ-NEXT: vmovdqa %ymm13, %ymm9
+; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7]
+; AVX512DQ-NEXT: vpshufb %xmm12, %xmm0, %xmm0
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm15[2,3],ymm8[4,5],ymm15[6,7]
+; AVX512DQ-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm8[0,1,0,1]
+; AVX512DQ-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3],ymm1[4,5,6,7,8,9,10],ymm4[11],ymm1[12,13,14,15]
+; AVX512DQ-NEXT: vpshufb %ymm10, %ymm1, %ymm1
+; AVX512DQ-NEXT: vmovdqa64 %ymm26, %ymm6
+; AVX512DQ-NEXT: vmovdqa64 %ymm22, %ymm14
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm14[1],ymm6[2,3],ymm14[4],ymm6[5,6,7]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm10
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm10[1],xmm4[2,3,4,5],xmm10[6],xmm4[7]
+; AVX512DQ-NEXT: vpshufb %xmm2, %xmm4, %xmm2
; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
; AVX512DQ-NEXT: vpor %ymm1, %ymm2, %ymm1
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm12 {%k1}
-; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload
-; AVX512DQ-NEXT: # ymm0 = mem[0,1,2],ymm13[3],mem[4,5],ymm13[6],mem[7]
+; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm17 {%k1}
+; AVX512DQ-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vmovdqa64 %ymm31, %ymm0
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm11[3],ymm0[4,5],ymm11[6],ymm0[7]
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15]
-; AVX512DQ-NEXT: vmovdqa64 %ymm18, %ymm13
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0],ymm8[1],ymm13[2,3],ymm8[4],ymm13[5,6,7]
-; AVX512DQ-NEXT: vmovdqa64 %ymm26, %ymm20
+; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm13
+; AVX512DQ-NEXT: vmovdqa64 %ymm27, %ymm1
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm13[1],ymm1[2,3],ymm13[4],ymm1[5,6,7]
; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5],xmm2[6],xmm1[7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7]
-; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm21
-; AVX512DQ-NEXT: vmovdqa64 %ymm5, %ymm25
-; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm10
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0],xmm2[1],xmm10[2],xmm2[3],xmm10[4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,12,13,26,27,24,25,22,23,128,128,128,128,128,128,128,128,128,128]
-; AVX512DQ-NEXT: vpshufb %ymm10, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm3[1],ymm5[2,3,4],ymm3[5],ymm5[6,7]
+; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm25
+; AVX512DQ-NEXT: vmovdqa64 %ymm5, %ymm26
+; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm4
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2],xmm2[3],xmm4[4,5,6,7]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,12,13,26,27,24,25,22,23,128,128,128,128,128,128,128,128,128,128]
+; AVX512DQ-NEXT: vpshufb %ymm4, %ymm0, %ymm0
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13]
+; AVX512DQ-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero
; AVX512DQ-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm11[3],ymm4[4,5],ymm11[6],ymm4[7]
-; AVX512DQ-NEXT: vmovdqa64 %ymm11, %ymm26
+; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2],ymm2[3],ymm5[4,5],ymm2[6],ymm5[7]
+; AVX512DQ-NEXT: vmovdqa64 %ymm5, %ymm17
+; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm22
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6],ymm2[7,8],ymm1[9,10,11,12,13,14],ymm2[15]
-; AVX512DQ-NEXT: vmovdqa64 %ymm31, %ymm5
-; AVX512DQ-NEXT: vmovdqa64 %ymm30, %ymm11
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm11[2,3],ymm5[4,5],ymm11[6,7]
-; AVX512DQ-NEXT: vmovdqa64 %ymm31, %ymm18
+; AVX512DQ-NEXT: vmovdqa64 %ymm16, %ymm10
+; AVX512DQ-NEXT: vmovdqa64 %ymm28, %ymm11
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm10[2,3],ymm11[4,5],ymm10[6,7]
; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm12
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,1,2,3]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,2,2,3,4,5,6,7]
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [10,11,10,11,10,11,10,11,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29]
-; AVX512DQ-NEXT: vpshufb %ymm8, %ymm1, %ymm1
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [10,11,10,11,10,11,10,11,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29]
+; AVX512DQ-NEXT: vpshufb %ymm5, %ymm1, %ymm1
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm2[0,1],ymm1[2,3,4,5,6,7]
; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm23 # 32-byte Folded Reload
-; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm24 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm12 # 64-byte Folded Reload
-; AVX512DQ-NEXT: # zmm12 = mem ^ (zmm24 & (zmm12 ^ mem))
+; AVX512DQ-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm12 # 64-byte Folded Reload
+; AVX512DQ-NEXT: # zmm12 = mem ^ (zmm18 & (zmm12 ^ mem))
; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm12 {%k1}
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm3[3],ymm14[4,5],ymm3[6],ymm14[7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm15[3],ymm8[4,5],ymm15[6],ymm8[7]
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7,8,9,10,11],ymm1[12],ymm0[13,14,15]
-; AVX512DQ-NEXT: vpshufb %ymm10, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0],ymm7[1],ymm9[2,3],ymm7[4],ymm9[5,6,7]
-; AVX512DQ-NEXT: vmovdqa64 %ymm7, %ymm30
-; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm14
+; AVX512DQ-NEXT: vpshufb %ymm4, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5,6,7]
+; AVX512DQ-NEXT: vmovdqa %ymm7, %ymm15
; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4,5],xmm2[6],xmm1[7]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13]
-; AVX512DQ-NEXT: vmovdqa %ymm15, %ymm9
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm15[1],ymm6[2,3,4],ymm15[5],ymm6[6,7]
-; AVX512DQ-NEXT: vmovdqa64 %ymm6, %ymm31
-; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm10
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0],xmm2[1],xmm10[2],xmm2[3],xmm10[4,5,6,7]
+; AVX512DQ-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm14[1],ymm6[2,3,4],ymm14[5],ymm6[6,7]
+; AVX512DQ-NEXT: vmovdqa64 %ymm14, %ymm28
+; AVX512DQ-NEXT: vmovdqa64 %ymm6, %ymm16
+; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm4
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2],xmm2[3],xmm4[4,5,6,7]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,6,7,4,5,2,3,0,1,14,15],zero,zero
; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
; AVX512DQ-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa64 %ymm22, %ymm6
-; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm5
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7]
+; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm7
+; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2],ymm7[3],ymm6[4,5],ymm7[6],ymm6[7]
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6],ymm2[7,8],ymm1[9,10,11,12,13,14],ymm2[15]
-; AVX512DQ-NEXT: vpshufb %ymm8, %ymm1, %ymm1
-; AVX512DQ-NEXT: vmovdqa64 %ymm16, %ymm10
-; AVX512DQ-NEXT: vmovdqa64 %ymm17, %ymm15
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm10[2,3],ymm15[4,5],ymm10[6,7]
+; AVX512DQ-NEXT: vpshufb %ymm5, %ymm1, %ymm1
+; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm5
+; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm8
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm5[2,3],ymm8[4,5],ymm5[6,7]
; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
@@ -16377,15 +16432,15 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
-; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm24 & (zmm7 ^ zmm1))
-; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm7 {%k1}
-; AVX512DQ-NEXT: vmovdqa64 %ymm26, %ymm0
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3],ymm4[4],ymm0[5,6,7]
+; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm18 & (zmm14 ^ zmm1))
+; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm14 {%k1}
+; AVX512DQ-NEXT: vmovdqa64 %ymm17, %ymm0
+; AVX512DQ-NEXT: vmovdqa64 %ymm22, %ymm1
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7]
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15]
-; AVX512DQ-NEXT: vmovdqa64 %ymm18, %ymm1
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm11[3],ymm1[4,5],ymm11[6],ymm1[7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7]
; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7]
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
@@ -16394,19 +16449,19 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [12,13,12,13,12,13,12,13,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31]
; AVX512DQ-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1],ymm0[2,3,4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm0
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm0[1],ymm13[2,3,4],ymm0[5],ymm13[6,7]
+; AVX512DQ-NEXT: vmovdqa64 %ymm27, %ymm0
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4],ymm13[5],ymm0[6,7]
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm0
-; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm3
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7]
+; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm0
+; AVX512DQ-NEXT: vmovdqa64 %ymm26, %ymm3
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7]
; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm3
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,1,3,4,5,6,7]
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512DQ-NEXT: vmovdqa64 %ymm31, %ymm0
; AVX512DQ-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4,5],ymm0[6],mem[7]
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,1,3,1]
@@ -16418,16 +16473,16 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7],ymm3[8,9,10],ymm1[11,12,13,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm24, %zmm11 # 64-byte Folded Reload
-; AVX512DQ-NEXT: # zmm11 = mem ^ (zmm24 & (zmm11 ^ mem))
+; AVX512DQ-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm18, %zmm11 # 64-byte Folded Reload
+; AVX512DQ-NEXT: # zmm11 = mem ^ (zmm18 & (zmm11 ^ mem))
; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm11 {%k1}
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6,7]
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm1[2,3,0,1]
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4,5,6,7,8],ymm3[9],ymm1[10,11,12,13,14,15]
; AVX512DQ-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 32-byte Folded Reload
; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 32-byte Folded Reload
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2],ymm10[3],ymm15[4,5],ymm10[6],ymm15[7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2],ymm5[3],ymm8[4,5],ymm5[6],ymm8[7]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,6,4,6,7]
; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm5
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1]
@@ -16435,22 +16490,22 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3,4,5,6,7]
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm24 & (zmm6 ^ zmm1))
-; AVX512DQ-NEXT: vmovdqa64 %ymm30, %ymm1
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0],ymm1[1],ymm14[2,3,4],ymm1[5],ymm14[6,7]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm18 & (zmm6 ^ zmm1))
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0],ymm9[1],ymm15[2,3,4],ymm9[5],ymm15[6,7]
; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm5
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2],xmm1[3],xmm5[4,5,6,7]
; AVX512DQ-NEXT: vpshufb %xmm4, %xmm1, %xmm1
-; AVX512DQ-NEXT: vmovdqa64 %ymm31, %ymm4
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm9[2],ymm4[3,4],ymm9[5],ymm4[6,7]
+; AVX512DQ-NEXT: vmovdqa64 %ymm28, %ymm4
+; AVX512DQ-NEXT: vmovdqa64 %ymm16, %ymm5
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7]
; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm5
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,1,3,4,5,6,7]
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,1,2,3]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,0,2,3,4,5,6,7]
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX512DQ-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
-; AVX512DQ-NEXT: # ymm5 = ymm5[0,1],mem[2],ymm5[3,4,5],mem[6],ymm5[7]
+; AVX512DQ-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX512DQ-NEXT: # ymm5 = mem[0,1],ymm5[2],mem[3,4,5],ymm5[6],mem[7]
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,3,1]
; AVX512DQ-NEXT: vpshufb %ymm0, %ymm5, %ymm0
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1,2],ymm0[3,4,5,6,7]
@@ -16466,14 +16521,14 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload
; AVX512DQ-NEXT: # zmm4 = mem ^ (zmm0 & (zmm4 ^ mem))
; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm28 # 64-byte Folded Reload
-; AVX512DQ-NEXT: # zmm28 = zmm28 ^ (zmm0 & (zmm28 ^ mem))
+; AVX512DQ-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm30 # 64-byte Folded Reload
+; AVX512DQ-NEXT: # zmm30 = zmm30 ^ (zmm0 & (zmm30 ^ mem))
; AVX512DQ-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm23 # 64-byte Folded Reload
; AVX512DQ-NEXT: # zmm23 = zmm23 ^ (zmm0 & (zmm23 ^ mem))
; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm0 & (zmm2 ^ zmm1))
; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm0 & (zmm3 ^ zmm4))
; AVX512DQ-NEXT: vmovdqa64 %zmm23, (%rsi)
-; AVX512DQ-NEXT: vmovdqa64 %zmm28, 64(%rsi)
+; AVX512DQ-NEXT: vmovdqa64 %zmm30, 64(%rsi)
; AVX512DQ-NEXT: vmovdqa64 %zmm3, 64(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rdx)
; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -16481,14 +16536,14 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vmovaps %zmm0, (%rcx)
; AVX512DQ-NEXT: vmovdqa64 %zmm29, 64(%r8)
-; AVX512DQ-NEXT: vmovdqa64 %zmm27, (%r8)
+; AVX512DQ-NEXT: vmovdqa64 %zmm24, (%r8)
; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vmovaps %zmm0, (%r9)
; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vmovaps %zmm0, 64(%r9)
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT: vmovdqa64 %zmm12, (%rax)
-; AVX512DQ-NEXT: vmovdqa64 %zmm7, 64(%rax)
+; AVX512DQ-NEXT: vmovdqa64 %zmm14, 64(%rax)
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT: vmovdqa64 %zmm11, (%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm6, 64(%rax)
@@ -16498,708 +16553,745 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
;
; AVX512DQ-FCP-LABEL: load_i16_stride7_vf64:
; AVX512DQ-FCP: # %bb.0:
-; AVX512DQ-FCP-NEXT: subq $1240, %rsp # imm = 0x4D8
-; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm18
-; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm11
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [2,5,9,0,12,0,0,0]
-; AVX512DQ-FCP-NEXT: vpermd %zmm11, %zmm17, %zmm0
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [8,1,12,5,12,5,14,15]
-; AVX512DQ-FCP-NEXT: vpermd %zmm11, %zmm3, %zmm1
-; AVX512DQ-FCP-NEXT: vpermd %zmm18, %zmm3, %zmm4
-; AVX512DQ-FCP-NEXT: vmovdqa 480(%rdi), %ymm12
+; AVX512DQ-FCP-NEXT: subq $1960, %rsp # imm = 0x7A8
+; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm15
+; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm26
+; AVX512DQ-FCP-NEXT: vmovdqa 480(%rdi), %ymm11
; AVX512DQ-FCP-NEXT: vmovdqa 448(%rdi), %ymm14
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1],ymm12[2],ymm14[3,4,5],ymm12[6],ymm14[7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4],xmm3[5],xmm5[6],xmm3[7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm5, %ymm6
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,6,7,12,13,2,3,16,17,30,31,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm4
-; AVX512DQ-FCP-NEXT: vporq %ymm4, %ymm6, %ymm20
-; AVX512DQ-FCP-NEXT: vmovdqa 672(%rdi), %xmm7
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,0,1,14,15,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm7, %xmm6
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm7, %xmm19
-; AVX512DQ-FCP-NEXT: vpbroadcastw 700(%rdi), %xmm7
-; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm26 = xmm6[2],xmm7[2],xmm6[3],xmm7[3]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm11[2],ymm14[3,4,5],ymm11[6],ymm14[7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [8,1,12,5,12,5,14,15]
+; AVX512DQ-FCP-NEXT: vpermd %zmm26, %zmm3, %zmm4
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,6,7,12,13,2,3,16,17,30,31,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm4
+; AVX512DQ-FCP-NEXT: vpor %ymm4, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa 672(%rdi), %xmm5
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,0,1,14,15,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm4
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm5, %xmm18
+; AVX512DQ-FCP-NEXT: vpbroadcastw 700(%rdi), %xmm5
+; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm24
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm24[0,1,0,2]
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm15
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm7
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1],ymm7[2],ymm15[3,4,5],ymm7[6],ymm15[7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm6[4],xmm5[5],xmm6[6],xmm5[7]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm5, %ymm3
-; AVX512DQ-FCP-NEXT: vporq %ymm1, %ymm3, %ymm31
-; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %xmm5
-; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm5, %xmm1
-; AVX512DQ-FCP-NEXT: vpbroadcastw 252(%rdi), %xmm3
-; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa 240(%rdi), %xmm13
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm24[0,1,0,2]
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpermd %zmm15, %zmm3, %zmm3
+; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm2
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm8
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm5
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1],ymm5[2],ymm8[3,4,5],ymm5[6],ymm8[7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm0
+; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %xmm9
+; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm9, %xmm0
+; AVX512DQ-FCP-NEXT: vpbroadcastw 252(%rdi), %xmm1
+; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa 240(%rdi), %xmm12
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,5,9,0,12,0,0,0]
+; AVX512DQ-FCP-NEXT: vpermd %zmm15, %zmm1, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,0,1,6,7,8,9,18,19,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2],ymm7[3],ymm15[4,5],ymm7[6],ymm15[7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm7, %ymm16
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1],xmm4[2,3,4,5],xmm1[6],xmm4[7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm5[3],ymm8[4,5],ymm5[6],ymm8[7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm8, %ymm16
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1],xmm4[2,3,4,5],xmm2[6],xmm4[7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm1
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1],ymm0[2],ymm1[3,4,5],ymm0[6],ymm1[7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm25
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm29
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm6
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm4[0,1,2,3],xmm6[4],xmm4[5],xmm6[6],xmm4[7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm6
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm7
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29,22,23,28,29]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm9, %ymm9
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1,2,3,4,5,6],ymm9[7]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm13[0],xmm5[1],xmm13[2,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm13, %xmm27
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [2,3,0,1,14,15,14,15,8,9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm10, %xmm10
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2],ymm12[3],ymm14[4,5],ymm12[6],ymm14[7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm14, %ymm23
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3,4,5],xmm9[6],xmm10[7]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm9, %ymm8
-; AVX512DQ-FCP-NEXT: vpermd %zmm18, %zmm17, %zmm2
-; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm8, %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa 608(%rdi), %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa 576(%rdi), %ymm1
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1],ymm0[2],ymm1[3,4,5],ymm0[6],ymm1[7]
+; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm13
+; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm4
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm13[2],ymm4[3,4,5],ymm13[6],ymm4[7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5],xmm2[6],xmm0[7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,4,5,6,7,4,5,6,7,6,7,12,13,16,17,18,19,20,21,22,23,20,21,22,23,22,23,28,29]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm2
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm12[0],xmm9[1],xmm12[2,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm9, %xmm7
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [2,3,0,1,14,15,14,15,8,9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm11[3],ymm14[4,5],ymm11[6],ymm14[7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm11, %ymm23
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm14, %ymm25
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vpermd %zmm26, %zmm1, %zmm1
+; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa 608(%rdi), %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqa 576(%rdi), %ymm2
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm1[2],ymm2[3,4,5],ymm1[6],ymm2[7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm20
; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm21
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm30
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm8
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm8[4],xmm3[5],xmm8[6],xmm3[7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %ymm22
-; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm22[0,1,0,2]
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm6
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm17
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm4[0,1,2,3,4,5,6],ymm6[7]
-; AVX512DQ-FCP-NEXT: vmovdqa 688(%rdi), %xmm14
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm14[0],xmm1[1],xmm14[2,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm4, %xmm7
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,6,9,0,13,0,0,0]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm0
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %ymm17
+; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm17[0,1,0,2]
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm1
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-FCP-NEXT: vmovdqa 688(%rdi), %xmm8
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm18, %xmm3
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0],xmm3[1],xmm8[2,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm10
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [2,6,9,0,13,0,0,0]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm9
; AVX512DQ-FCP-NEXT: vmovdqu64 %ymm16, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0],ymm15[1],ymm10[2,3],ymm15[4],ymm10[5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm15, %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3,4,5],xmm7[6],xmm6[7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm28
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermd %zmm11, %zmm4, %zmm8
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,2,3,4,5,10,11,16,17,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm8, %ymm8
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [4,5,2,3,0,1,14,15,12,13,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm6, %ymm6
-; AVX512DQ-FCP-NEXT: vpor %ymm6, %ymm8, %ymm0
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm9[1],ymm5[2,3],ymm9[4],ymm5[5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm11
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7]
+; AVX512DQ-FCP-NEXT: vpermd %zmm15, %zmm18, %zmm1
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm19
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,2,3,4,5,10,11,16,17,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm31
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [4,5,2,3,0,1,14,15,12,13,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm3
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm11
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2],ymm11[3],ymm3[4,5],ymm11[6],ymm3[7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm8
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0],xmm6[1],xmm8[2,3,4,5],xmm6[6],xmm8[7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm6, %xmm6
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm25
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm16 = [2,5,2,5,2,5,2,5]
-; AVX512DQ-FCP-NEXT: vpermd %ymm24, %ymm16, %ymm13
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm6[0,1,2,3,4,5,6],ymm13[7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm0
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm5, %xmm19
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm6, %xmm15
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm13, %zmm0
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm13[3],ymm4[4,5],ymm13[6],ymm4[7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm13, %ymm30
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm22
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm28
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm24[0,1,1,2]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,2,3,4,5,6,7,0,1,2,3,0,1,14,15,16,17,18,19,20,21,22,23,16,17,18,19,16,17,30,31]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm12[0],xmm7[1],xmm12[1],xmm7[2],xmm12[2],xmm7[3],xmm12[3]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm12, %xmm27
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm2
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %ymm2
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa 288(%rdi), %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4,5],ymm0[6],ymm2[7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5],xmm2[6],xmm0[7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm2
+; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [1,0,0,0,5,8,12,15]
+; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm16, %zmm5
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,128,128,128,128,128,128,128,128]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm5
+; AVX512DQ-FCP-NEXT: vpor %ymm5, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm7
; AVX512DQ-FCP-NEXT: vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0],ymm0[1],ymm12[2,3],ymm0[4],ymm12[5,6,7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm15
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm15[1],xmm13[2,3,4,5],xmm15[6],xmm13[7]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm13, %ymm9
-; AVX512DQ-FCP-NEXT: vpermd %zmm18, %zmm4, %zmm4
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm4
-; AVX512DQ-FCP-NEXT: vpor %ymm4, %ymm9, %ymm4
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm5
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm4
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm21
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm7
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0],xmm4[1],xmm7[2,3,4,5],xmm4[6],xmm7[7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm25, %xmm7
-; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm4, %xmm4
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-FCP-NEXT: vpermd %ymm22, %ymm16, %ymm7
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm7[7]
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm14, %xmm30
-; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm8
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm4, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0],ymm2[1],ymm10[2,3,4],ymm2[5],ymm10[6,7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm8
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0],xmm4[1],xmm8[2],xmm4[3],xmm8[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm6
+; AVX512DQ-FCP-NEXT: vmovdqu64 %ymm25, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6,7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm5
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3,4,5],xmm5[6],xmm2[7]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT: vpermd %zmm26, %zmm18, %zmm5
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm12
+; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm5, %ymm5
+; AVX512DQ-FCP-NEXT: vpor %ymm5, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm12
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm13
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1,2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm5
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3,4,5],xmm2[6],xmm5[7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm28, %xmm5
+; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm17[0,1,1,2]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm5, %ymm5
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm5[7]
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm8, %xmm18
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm10
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm2, %zmm2
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa 704(%rdi), %ymm2
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa 736(%rdi), %ymm14
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm14[2],ymm2[3,4,5],ymm14[6],ymm2[7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm10
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm10[4],xmm2[5],xmm10[6],xmm2[7]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm31
+; AVX512DQ-FCP-NEXT: vpermd %zmm31, %zmm16, %zmm4
+; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm0
+; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm2, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm9[1],ymm11[2,3,4],ymm9[5],ymm11[6,7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5,6,7]
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [10,3,6,15,12,13,6,15]
-; AVX512DQ-FCP-NEXT: vpermd %zmm28, %zmm16, %zmm9
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm9, %ymm9
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [6,7,4,5,2,3,0,1,14,15,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm4, %ymm4
-; AVX512DQ-FCP-NEXT: vpor %ymm4, %ymm9, %ymm1
+; AVX512DQ-FCP-NEXT: vpermd %zmm19, %zmm16, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,4,5,10,11,0,1,22,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm4
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [6,7,4,5,2,3,0,1,14,15,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT: vpor %ymm4, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm8
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm9
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6,7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2,3,4,5],xmm4[6],xmm2[7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,2,3,4,5,6,7,8,9,10,11,2,3,8,9,16,17,18,19,20,21,22,23,24,25,26,27,18,19,24,25]
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm24[0,1,1,3]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm11, %ymm3
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5,6],ymm3[7]
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm2 = [12,13,10,11,12,13,10,11,12,13,10,11,12,13,10,11]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0],ymm3[1],ymm11[2,3],ymm3[4],ymm11[5,6,7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm9
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm9[1],xmm4[2,3,4,5],xmm9[6],xmm4[7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm4, %xmm4
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm15
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25,18,19,24,25]
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm24[0,1,1,3]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm14
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm15 = [12,13,10,11,12,13,10,11,12,13,10,11,12,13,10,11]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm6, %xmm6
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm14, %zmm23
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0],ymm0[1],ymm12[2,3,4],ymm0[5],ymm12[6,7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm14
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm14[0],xmm6[1],xmm14[2],xmm6[3],xmm14[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm6, %ymm6
-; AVX512DQ-FCP-NEXT: vpermd %zmm18, %zmm16, %zmm13
-; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm13, %ymm8
-; AVX512DQ-FCP-NEXT: vpor %ymm6, %ymm8, %ymm0
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0],ymm6[1],ymm7[2,3,4],ymm6[5],ymm7[6,7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2],xmm1[3],xmm3[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm0
+; AVX512DQ-FCP-NEXT: vpermd %zmm26, %zmm16, %zmm1
+; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm12
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0],ymm5[1],ymm12[2,3],ymm5[4],ymm12[5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm2
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm8
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm8[1],xmm6[2,3,4,5],xmm8[6],xmm6[7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm6, %xmm6
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm8
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm22[0,1,1,3]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm6, %ymm9
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm7, %xmm7
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm29
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0],ymm3[1],ymm11[2,3,4],ymm3[5],ymm11[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm12[1],ymm13[2,3],ymm12[4],ymm13[5,6,7]
; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm17[0,1,1,3]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm22
+; AVX512DQ-FCP-NEXT: vmovdqu64 %ymm17, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm1, %ymm3
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm2
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4,5,6,7]
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm1
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27,20,21,26,27]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm4
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7]
-; AVX512DQ-FCP-NEXT: vpbroadcastw 232(%rdi), %xmm4
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm5
-; AVX512DQ-FCP-NEXT: vpsrlq $48, %xmm27, %xmm7
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm4, %zmm1, %zmm28
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm1
-; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm21
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm27 = [3,6,10,13,3,6,10,13]
-; AVX512DQ-FCP-NEXT: vpermd %zmm21, %zmm27, %zmm3
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm3
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-FCP-NEXT: movw $992, %ax # imm = 0x3E0
-; AVX512DQ-FCP-NEXT: kmovw %eax, %k1
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm26, %zmm1, %zmm20 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0],ymm2[1],ymm12[2,3,4],ymm2[5],ymm12[6,7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm0
+; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,2,3,4,5,6,7,8,9,10,11,4,5,10,11,16,17,18,19,20,21,22,23,24,25,26,27,20,21,26,27]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm11, %ymm3
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
+; AVX512DQ-FCP-NEXT: vpbroadcastw 232(%rdi), %xmm3
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm8
+; AVX512DQ-FCP-NEXT: vpsrlq $48, %xmm27, %xmm4
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm3, %zmm2, %zmm27
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0],ymm12[1],ymm13[2,3,4],ymm12[5],ymm13[6,7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm0
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm6, %ymm1
+; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm1, %ymm1
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
; AVX512DQ-FCP-NEXT: vpbroadcastw 680(%rdi), %xmm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm30, %xmm4
-; AVX512DQ-FCP-NEXT: vpsrlq $48, %xmm30, %xmm2
+; AVX512DQ-FCP-NEXT: vpsrlq $48, %xmm18, %xmm2
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm26
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,3,3,3,0,3,7,7]
-; AVX512DQ-FCP-NEXT: vpermd %ymm24, %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm3
-; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [2,5,9,12,2,5,9,12]
-; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm1, %zmm6
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,0,1,6,7,8,9,14,15,14,15,14,15,14,15,16,17,16,17,22,23,24,25,30,31,30,31,30,31,30,31]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm6, %ymm6
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7],ymm6[8,9,10,11,12],ymm3[13,14,15]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm11
-; AVX512DQ-FCP-NEXT: vpsrld $16, %xmm19, %xmm6
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm3, %zmm3
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa 704(%rdi), %ymm6
-; AVX512DQ-FCP-NEXT: vmovdqa 736(%rdi), %ymm10
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm6[2,3],ymm10[4,5],ymm6[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm10, %ymm16
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm17
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm6
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3],xmm6[4],xmm3[5],xmm6[6,7]
-; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm10 = [8,9,6,7,4,5,10,11,8,9,6,7,4,5,10,11]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm30
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [0,0,0,0,4,7,11,14]
-; AVX512DQ-FCP-NEXT: vpermd %zmm30, %zmm20, %zmm14
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm14, %ymm14
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm14[0,1,2],xmm3[3,4,5,6],xmm14[7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm12, %ymm8
-; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm27, %zmm14
-; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm14, %ymm9
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5,6],ymm8[7]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm31 {%k1} # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermd %ymm22, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vpermd %zmm21, %zmm1, %zmm8
-; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm8, %ymm8
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm0[5,6,7],ymm8[8,9,10,11,12],ymm0[13,14,15]
-; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX512DQ-FCP-NEXT: vpsrld $16, %xmm9, %xmm8
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm31
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,4,7,0,0,4,7,0]
-; AVX512DQ-FCP-NEXT: # ymm7 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT: vpermd %ymm24, %ymm7, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [2,6,9,13,2,6,9,13]
-; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm8, %zmm2
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[4,5,2,3,4,5,10,11,12,13,12,13,12,13,12,13,20,21,18,19,20,21,26,27,28,29,28,29,28,29,28,29]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7],ymm2[8,9,10,11,12],ymm0[13,14,15]
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm11[4],xmm5[4],xmm11[5],xmm5[5],xmm11[6],xmm5[6],xmm11[7],xmm5[7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} xmm19 = [8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm4
-; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm28
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [0,3,3,3,0,3,7,7]
+; AVX512DQ-FCP-NEXT: vpermd %ymm24, %ymm20, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25]
+; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm21 = [2,5,9,12,2,5,9,12]
+; AVX512DQ-FCP-NEXT: vpermd %zmm5, %zmm21, %zmm4
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,1,0,1,6,7,8,9,14,15,14,15,14,15,14,15,16,17,16,17,22,23,24,25,30,31,30,31,30,31,30,31]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7],ymm4[8,9,10,11,12],ymm3[13,14,15]
+; AVX512DQ-FCP-NEXT: vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm8, %xmm17
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm4, %zmm3, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm4
-; AVX512DQ-FCP-NEXT: vpermd %zmm4, %zmm1, %zmm2
-; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm12 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %ymm11
-; AVX512DQ-FCP-NEXT: vmovdqa 288(%rdi), %ymm15
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm14
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm14[0,1,2],xmm5[3],xmm14[4],xmm5[5],xmm14[6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm10
-; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm18
-; AVX512DQ-FCP-NEXT: vpermd %zmm18, %zmm20, %zmm14
-; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm14, %ymm6
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm6[0,1,2],xmm10[3,4,5,6],xmm6[7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm14
-; AVX512DQ-FCP-NEXT: vpermd %zmm14, %zmm1, %zmm1
-; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2],ymm11[3],ymm15[4,5],ymm11[6],ymm15[7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm6
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm1[4],xmm6[5],xmm1[6],xmm6[7]
-; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm6 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [1,0,0,0,4,8,11,15]
-; AVX512DQ-FCP-NEXT: vpermd %zmm18, %zmm20, %zmm10
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm10, %ymm10
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0,1,2],xmm1[3,4,5,6],xmm10[7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpermd %zmm14, %zmm8, %zmm10
-; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm10, %ymm10
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm10[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm5
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm0
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2],ymm0[3],ymm5[4,5],ymm0[6],ymm5[7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm10
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0,1,2,3],xmm1[4],xmm10[5],xmm1[6],xmm10[7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1
-; AVX512DQ-FCP-NEXT: vpermd %zmm30, %zmm20, %zmm6
-; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm6, %ymm3
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4,5,6],xmm3[7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpermd %zmm4, %zmm8, %zmm3
-; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm2
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm15[2],ymm11[3,4,5],ymm15[6],ymm11[7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm11, %ymm16
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6],xmm1[7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,0,0,0,5,8,12,15]
-; AVX512DQ-FCP-NEXT: vpermd %zmm18, %zmm3, %zmm6
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,128,128,128,128,128,128,128,128]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm6, %ymm6
-; AVX512DQ-FCP-NEXT: vpor %ymm6, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT: vpermd %zmm14, %zmm27, %zmm6
-; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm6, %ymm6
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm6[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm25 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm6 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # zmm6 = zmm6 ^ (zmm25 & (zmm6 ^ mem))
-; AVX512DQ-FCP-NEXT: movw $-512, %ax # imm = 0xFE00
+; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm0, %ymm4
+; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm19
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [3,6,10,13,3,6,10,13]
+; AVX512DQ-FCP-NEXT: vpermd %zmm19, %zmm3, %zmm6
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm6, %ymm6
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5,6],ymm4[7]
+; AVX512DQ-FCP-NEXT: movw $992, %ax # imm = 0x3E0
; AVX512DQ-FCP-NEXT: kmovw %eax, %k1
-; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm6 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 {%k1} # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm14, %ymm10
+; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm6
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3],xmm6[4],xmm4[5],xmm6[6,7]
+; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm6 = [8,9,6,7,4,5,10,11,8,9,6,7,4,5,10,11]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm4, %xmm4
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [0,0,0,0,4,7,11,14]
+; AVX512DQ-FCP-NEXT: vpermd %zmm31, %zmm16, %zmm14
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,6,7,8,9,14,15,8,9,14,15,4,5,2,3,16,17,22,23,24,25,30,31,24,25,30,31,20,21,18,19]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm14, %ymm14
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm14[0,1,2],xmm4[3,4,5,6],xmm14[7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm14[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm2
+; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm14
+; AVX512DQ-FCP-NEXT: vpermd %zmm14, %zmm21, %zmm12
+; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm4 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm12, %ymm12
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [4,5,6,11]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm2
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm2, %ymm12
+; AVX512DQ-FCP-NEXT: vpermd %zmm5, %zmm3, %zmm15
+; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm15, %ymm7
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm12[7]
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm2 {%k1} # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqu (%rsp), %ymm8 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm12
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm12[0,1,2],xmm7[3],xmm12[4],xmm7[5],xmm12[6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm6
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpermd %zmm29, %zmm16, %zmm7
+; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm7, %ymm1
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm1[0,1,2],xmm6[3,4,5,6],xmm1[7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm7
+; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm1
+; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm21, %zmm6
+; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm6
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm7
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm9, %ymm16
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4],xmm7[5],xmm6[6],xmm7[7]
+; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm7 = [10,11,8,9,6,7,12,13,10,11,8,9,6,7,12,13]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm6
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [1,0,0,0,4,8,11,15]
+; AVX512DQ-FCP-NEXT: vpermd %zmm29, %zmm12, %zmm15
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,4,5,10,11,12,13,0,1,0,1,0,1,0,1,18,19,20,21,26,27,28,29,16,17,16,17,16,17,16,17]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm15, %ymm15
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm15[0,1,2],xmm6[3,4,5,6],xmm15[7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm15[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm11
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [2,6,9,13,2,6,9,13]
+; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm9, %zmm6
+; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm15 = [18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29,18,19,20,21,26,27,28,29]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm6, %ymm6
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm11
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2],ymm13[3],ymm10[4,5],ymm13[6],ymm10[7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm13, %ymm18
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm13
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm11
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm11[0,1,2,3],xmm6[4],xmm11[5],xmm6[6],xmm11[7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm6
+; AVX512DQ-FCP-NEXT: vpermd %zmm31, %zmm12, %zmm7
+; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm7, %ymm2
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1,2],xmm6[3,4,5,6],xmm2[7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm6
+; AVX512DQ-FCP-NEXT: vpermd %zmm14, %zmm9, %zmm2
+; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm6
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm5[2],ymm0[3,4,5],ymm5[6],ymm0[7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm14
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm6
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4],xmm1[5],xmm6[6],xmm1[7]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT: vpermd %zmm30, %zmm3, %zmm2
-; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT: vpermd %zmm4, %zmm27, %zmm0
-; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm1 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # zmm1 = zmm1 ^ (zmm25 & (zmm1 ^ mem))
-; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm1 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermd %ymm22, %ymm7, %ymm0
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm3, %zmm1
+; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpermd %zmm14, %zmm3, %zmm1
+; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm2
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpermd %ymm22, %ymm20, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25]
+; AVX512DQ-FCP-NEXT: vpermd %zmm19, %zmm21, %zmm1
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,0,1,6,7,8,9,14,15,14,15,14,15,14,15,16,17,16,17,22,23,24,25,30,31,30,31,30,31,30,31]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15]
+; AVX512DQ-FCP-NEXT: vpsrld $16, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm30 = [0,4,7,0,0,4,7,0]
+; AVX512DQ-FCP-NEXT: # ymm30 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vpermd %ymm24, %ymm30, %ymm0
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27]
-; AVX512DQ-FCP-NEXT: vpermd %zmm21, %zmm8, %zmm1
+; AVX512DQ-FCP-NEXT: vpermd %zmm5, %zmm9, %zmm1
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[4,5,2,3,4,5,10,11,12,13,12,13,12,13,12,13,20,21,18,19,20,21,26,27,28,29,28,29,28,29,28,29]
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm1
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm2
-; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm17, %xmm2
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa 416(%rdi), %ymm8
-; AVX512DQ-FCP-NEXT: vmovdqa 384(%rdi), %ymm11
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm8[2],ymm11[3,4,5],ymm8[6],ymm11[7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5],xmm2[6],xmm0[7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm7
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2],ymm15[3],ymm7[4,5],ymm15[6],ymm7[7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3,4,5],xmm2[6],xmm3[7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,0,1,0,1,0,1,14,15,12,13,10,11,8,9]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT: vmovdqa 416(%rdi), %ymm15
+; AVX512DQ-FCP-NEXT: vmovdqa 384(%rdi), %ymm14
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm15[2],ymm14[3,4,5],ymm15[6],ymm14[7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm3
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2],ymm3[3],ymm8[4,5],ymm3[6],ymm8[7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,14,15,12,13,10,11,8,9]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [2,11,2,11,12,5,8,9]
-; AVX512DQ-FCP-NEXT: vpermd %zmm18, %zmm9, %zmm6
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm6, %ymm6
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7],ymm6[8,9,10],ymm0[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [2,11,2,11,12,5,8,9]
+; AVX512DQ-FCP-NEXT: vpermd %zmm29, %zmm6, %zmm5
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,22,23,28,29,18,19,128,128,128,128,128,128,128,128,128,128]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm5, %ymm5
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7],ymm5[8,9,10],ymm0[11,12,13,14,15]
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,2,3,0,1,14,15,12,13,10,11,128,128]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT: vpor %ymm6, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm23 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # zmm23 = zmm23 ^ (zmm25 & (zmm23 ^ mem))
-; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm23 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT: vpor %ymm5, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm25 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm1 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # zmm1 = zmm1 ^ (zmm25 & (zmm1 ^ mem))
+; AVX512DQ-FCP-NEXT: movw $-512, %ax # imm = 0xFE00
+; AVX512DQ-FCP-NEXT: kmovw %eax, %k1
+; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm1 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa 864(%rdi), %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqa 832(%rdi), %ymm13
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm1[2],ymm13[3,4,5],ymm1[6],ymm13[7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm31
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5],xmm2[6],xmm0[7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512DQ-FCP-NEXT: vpermd %zmm30, %zmm9, %zmm2
-; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1,2],ymm5[3],ymm14[4,5],ymm5[6],ymm14[7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm23
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm22
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm9
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0],xmm3[1],xmm9[2,3,4,5],xmm3[6],xmm9[7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm3
+; AVX512DQ-FCP-NEXT: vmovdqa 832(%rdi), %ymm4
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm1[2],ymm4[3,4,5],ymm1[6],ymm4[7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm24
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm17
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT: vpermd %zmm31, %zmm6, %zmm1
+; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm11
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm6
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0],xmm2[1],xmm6[2,3,4,5],xmm2[6],xmm6[7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm2
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm29 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # zmm29 = zmm29 ^ (zmm25 & (zmm29 ^ mem))
-; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm29 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpor %ymm1, %ymm2, %ymm1
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm1 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # zmm1 = zmm1 ^ (zmm25 & (zmm1 ^ mem))
+; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm1 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm9[2],ymm6[3,4],ymm9[5],ymm6[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm2
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm17
+; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm2
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm21
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [0,3,7,10,14,0,0,0]
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpermd %zmm23, %zmm16, %zmm6
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm6
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm6[2,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm27 = zmm27 ^ (zmm25 & (zmm27 ^ zmm1))
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm15, %ymm19
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm14, %ymm20
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm6
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0],xmm1[1],xmm6[2,3,4,5],xmm1[6],xmm6[7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0],ymm8[1],ymm3[2,3],ymm8[4],ymm3[5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, %ymm14
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm18
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm12
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm12[1],xmm6[2,3,4,5],xmm12[6],xmm6[7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm22
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,0,0,0,6,9,13,0]
+; AVX512DQ-FCP-NEXT: vpermd %zmm29, %zmm3, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,128,128,128,128,128,128,128,128,128,128]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,2,3,0,1,14,15,12,13,128,128]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm6, %xmm6
+; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm6, %ymm0
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm27 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm21, %xmm6
+; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm1
; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm21 = [0,3,7,10,14,0,0,0]
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpermd %zmm27, %zmm21, %zmm3
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm3
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpermd %zmm15, %zmm16, %zmm1
+; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm28 = zmm28 ^ (zmm25 & (zmm28 ^ zmm0))
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm8[3],ymm11[4,5],ymm8[6],ymm11[7]
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm11, %ymm12
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm8, %ymm19
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5],xmm0[6],xmm3[7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0],ymm7[1],ymm15[2,3],ymm7[4],ymm15[5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm11
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm10
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm10[1],xmm3[2,3,4,5],xmm10[6],xmm3[7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [2,3,2,3,2,3,2,3,0,1,14,15,12,13,10,11]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm0
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [2,0,0,0,6,9,13,0]
-; AVX512DQ-FCP-NEXT: vpermd %zmm18, %zmm16, %zmm2
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,128,128,128,128,128,128,128,128,128,128]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,4,5,2,3,0,1,14,15,12,13,128,128]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm28 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm2[2],ymm14[3,4],ymm2[5],ymm14[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm20
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm17, %xmm8
-; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm2
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpermd %zmm17, %zmm21, %zmm2
-; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm29
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm26 = zmm26 ^ (zmm25 & (zmm26 ^ zmm0))
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm5
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm5[3],ymm13[4,5],ymm5[6],ymm13[7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm0
-; AVX512DQ-FCP-NEXT: vpermd %zmm30, %zmm16, %zmm2
-; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm4
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm7
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm7[1],ymm4[2,3],ymm7[4],ymm4[5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm4
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm0
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm1
+; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT: vpermd %zmm31, %zmm3, %zmm1
+; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0],ymm11[1],ymm13[2,3],ymm11[4],ymm13[5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm11, %ymm16
; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm2
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
; AVX512DQ-FCP-NEXT: vpor %ymm1, %ymm2, %ymm1
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm26 {%k1}
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm9[2,3],ymm6[4,5],ymm9[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm9, %ymm23
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm31
+; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm28 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm10[2,3],ymm9[4,5],ymm10[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm10, %ymm21
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm9, %ymm22
; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm6, %xmm28
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm24
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm27
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm22 = [0,4,7,11,14,0,0,0]
-; AVX512DQ-FCP-NEXT: vpermd %zmm27, %zmm22, %zmm3
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm3
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm6
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0],ymm12[1],ymm6[2,3],ymm12[4],ymm6[5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm12, %ymm21
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm10
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm10[1],xmm3[2,3,4,5],xmm10[6],xmm3[7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0],ymm11[1],ymm15[2,3,4],ymm11[5],ymm15[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm15, %ymm19
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm12
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0],xmm10[1],xmm12[2],xmm10[3],xmm12[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [3,0,0,0,6,10,13,0]
-; AVX512DQ-FCP-NEXT: vpermd %zmm18, %zmm16, %zmm1
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [0,4,7,11,14,0,0,0]
+; AVX512DQ-FCP-NEXT: vpermd %zmm23, %zmm17, %zmm2
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm26
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm0[0,1],ymm2[2,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm10
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm9
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6,7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm5
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3,4,5],xmm5[6],xmm0[7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm11
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0],ymm14[1],ymm11[2,3,4],ymm14[5],ymm11[6,7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2],xmm5[3],xmm6[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,5,4,5,4,5,4,5,2,3,0,1,14,15,12,13]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm1
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [3,0,0,0,6,10,13,0]
+; AVX512DQ-FCP-NEXT: vpermd %zmm29, %zmm18, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,20,21,26,27,128,128,128,128,128,128,128,128,128,128]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7],ymm1[8,9,10],ymm3[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,6,7,4,5,2,3,0,1,14,15,128,128]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm10, %xmm10
-; AVX512DQ-FCP-NEXT: vpor %ymm1, %ymm10, %ymm1
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7],ymm0[8,9,10],ymm1[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,6,7,4,5,2,3,0,1,14,15,128,128]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm5
+; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm5, %ymm0
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm12 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # zmm12 = mem ^ (zmm25 & (zmm12 ^ mem))
+; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm12 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm8[2,3],ymm7[4,5],ymm8[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm7, %ymm20
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm8, %ymm19
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm28, %xmm5
+; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm5
+; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512DQ-FCP-NEXT: vpermd %zmm15, %zmm17, %zmm1
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm17
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm5
+; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm15
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm15[1],ymm4[2,3],ymm15[4],ymm4[5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm24
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3,4,5],xmm4[6],xmm0[7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT: vpermd %zmm31, %zmm18, %zmm4
+; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm2
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm12
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0],ymm12[1],ymm13[2,3,4],ymm12[5],ymm13[6,7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2],xmm4[3],xmm5[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm1 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # zmm1 = mem ^ (zmm25 & (zmm1 ^ mem))
+; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm1 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm2
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,10,11,6,7,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm16
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7]
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,4,8,11,15,0,0,0]
+; AVX512DQ-FCP-NEXT: vpermd %zmm23, %zmm3, %zmm2
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm9[1],ymm10[2,3,4],ymm9[5],ymm10[6,7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm2[1],xmm4[2],xmm2[3],xmm4[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1],ymm14[2],ymm11[3,4],ymm14[5],ymm11[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,8,9,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm8
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,1,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3]
+; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm30, %ymm8 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [2,6,9,13,2,6,9,13]
+; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm9 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[4,5,2,3,4,5,10,11,12,13,12,13,12,13,12,13,20,21,18,19,20,21,26,27,28,29,28,29,28,29,28,29]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7],ymm9[8,9,10,11,12],ymm8[13,14,15]
+; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm9 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # xmm9 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[8,9,6,7,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm9
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,10,3,14,7,10,3]
+; AVX512DQ-FCP-NEXT: vpermd %zmm29, %zmm7, %zmm11
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm11, %ymm11
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0],ymm5[1,2],ymm11[3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm4, %xmm4
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7],ymm5[8,9,10],ymm4[11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm0 # 64-byte Folded Reload
; AVX512DQ-FCP-NEXT: # zmm0 = mem ^ (zmm25 & (zmm0 ^ mem))
-; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm10
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1],ymm10[2,3],ymm14[4,5],ymm10[6,7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3
-; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm24, %xmm8
-; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm1
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; AVX512DQ-FCP-NEXT: vpermd %zmm17, %zmm22, %zmm3
-; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm3
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1],ymm3[2,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0],ymm13[1],ymm5[2,3],ymm13[4],ymm5[5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm20
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3,4,5],xmm3[6],xmm1[7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm1
-; AVX512DQ-FCP-NEXT: vpermd %zmm30, %zmm16, %zmm3
-; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm2
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm7[1],ymm4[2,3,4],ymm7[5],ymm4[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm22
+; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm4, %zmm0, %zmm0 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm4
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm5
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm16, %xmm10
+; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm5
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,4,6,7]
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
+; AVX512DQ-FCP-NEXT: vpermd %zmm17, %zmm3, %zmm3
+; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm3
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm3[2,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm3
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm15[1],ymm3[2,3,4],ymm15[5],ymm3[6,7]
; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2],xmm3[3],xmm5[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm9 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # zmm9 = mem ^ (zmm25 & (zmm9 ^ mem))
-; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm9 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm2
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,10,11,6,7,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7]
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,4,8,11,15,0,0,0]
-; AVX512DQ-FCP-NEXT: vpermd %zmm27, %zmm5, %zmm2
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm15
-; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1],ymm2[2,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm8
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0],ymm8[1],ymm6[2,3,4],ymm8[5],ymm6[6,7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm2
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm11[2],ymm2[3,4],ymm11[5],ymm2[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,8,9,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm6
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,1,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [0,1,10,3,14,7,10,3]
-; AVX512DQ-FCP-NEXT: vpermd %zmm18, %zmm16, %zmm8
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,10,11,0,1,10,11,0,1,4,5,0,1,14,15,20,21,26,27,16,17,26,27,16,17,20,21,16,17,30,31]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm8, %ymm8
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0],ymm6[1,2],ymm8[3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,12,13,6,7,4,5,2,3,0,1,14,15]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm1
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7],ymm6[8,9,10],ymm1[11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm12 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # zmm12 = mem ^ (zmm25 & (zmm12 ^ mem))
-; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm12 {%k1}
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2],ymm10[3],ymm14[4,5],ymm10[6],ymm14[7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm6
-; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm4
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7]
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; AVX512DQ-FCP-NEXT: vpermd %zmm17, %zmm5, %zmm4
-; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm4, %ymm4
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm4
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm13[1],ymm4[2,3,4],ymm13[5],ymm4[6,7]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm11
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm11[0],xmm4[1],xmm11[2],xmm4[3],xmm11[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm4, %xmm4
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm8
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm3
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,3,1,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: vpermd %zmm30, %zmm16, %zmm7
-; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm7, %ymm2
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1,2],ymm2[3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm3
+; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm3
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm2
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,3,1,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
+; AVX512DQ-FCP-NEXT: vpermd %zmm31, %zmm7, %zmm5
+; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm5, %ymm5
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0],ymm2[1,2],ymm5[3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7],ymm2[8,9,10],ymm3[11,12,13,14,15]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm1 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # zmm1 = mem ^ (zmm25 & (zmm1 ^ mem))
-; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm1 {%k1}
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm9 ^ (zmm25 & (zmm4 ^ zmm9))
+; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm4 {%k1}
; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} zmm2 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615]
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm7 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # zmm7 = mem ^ (zmm2 & (zmm7 ^ mem))
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm9 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # zmm9 = mem ^ (zmm2 & (zmm9 ^ mem))
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512DQ-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm2 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # zmm2 = zmm2 ^ (zmm3 & (zmm2 ^ mem))
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm5 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # zmm5 = zmm5 ^ (zmm3 & (zmm5 ^ mem))
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm3 & (zmm6 ^ zmm7))
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm3 & (zmm8 ^ zmm9))
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm3 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # zmm3 = mem ^ (zmm2 & (zmm3 ^ mem))
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpternlogq $226, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # zmm4 = mem ^ (zmm2 & (zmm4 ^ mem))
-; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm5 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # zmm5 = zmm5 ^ (zmm2 & (zmm5 ^ mem))
-; AVX512DQ-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm6 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # zmm6 = zmm6 ^ (zmm2 & (zmm6 ^ mem))
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm2 & (zmm10 ^ zmm3))
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm2 & (zmm8 ^ zmm4))
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 64(%rsi)
+; AVX512DQ-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm3 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # zmm3 = zmm3 ^ (zmm25 & (zmm3 ^ mem))
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm7 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # zmm7 = zmm7 ^ (zmm25 & (zmm7 ^ mem))
+; AVX512DQ-FCP-NEXT: vinserti32x8 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 {%k1} # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vinserti32x8 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 {%k1} # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%rsi)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 64(%rsi)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 64(%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 64(%rcx)
-; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovaps %zmm2, (%rcx)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, (%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 64(%rcx)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%rcx)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 64(%r8)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm2, (%r8)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, 64(%r9)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, (%r9)
-; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 64(%rax)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
+; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 64(%r9)
+; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovaps %zmm2, (%r9)
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, (%rax)
-; AVX512DQ-FCP-NEXT: addq $1240, %rsp # imm = 0x4D8
+; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovaps %zmm1, (%rax)
+; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 64(%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
+; AVX512DQ-FCP-NEXT: addq $1960, %rsp # imm = 0x7A8
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll
index fff21f9aad1bb..2bb2962be73aa 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll
@@ -496,15 +496,15 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,1,1]
; AVX512-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3]
; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm9 = [3,7,3,3]
-; AVX512-NEXT: vpermt2d %xmm4, %xmm9, %xmm5
+; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm9 = [3,11,3,3]
+; AVX512-NEXT: vpermt2d %ymm4, %ymm9, %ymm5
; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3]
; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512-NEXT: vpermt2d %xmm2, %xmm9, %xmm0
+; AVX512-NEXT: vpermi2d %ymm2, %ymm0, %ymm9
; AVX512-NEXT: vmovq %xmm6, (%rsi)
; AVX512-NEXT: vmovq %xmm7, (%rdx)
; AVX512-NEXT: vmovq %xmm8, (%rcx)
@@ -512,7 +512,8 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-NEXT: vmovq %xmm1, (%r9)
; AVX512-NEXT: vmovq %xmm3, (%r11)
; AVX512-NEXT: vmovq %xmm4, (%r10)
-; AVX512-NEXT: vmovq %xmm0, (%rax)
+; AVX512-NEXT: vmovq %xmm9, (%rax)
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: load_i16_stride8_vf4:
@@ -527,18 +528,18 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [1,5,1,1]
-; AVX512-FCP-NEXT: vmovdqa %xmm5, %xmm8
-; AVX512-FCP-NEXT: vpermt2d %xmm4, %xmm7, %xmm8
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [1,9,1,1]
+; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm8
+; AVX512-FCP-NEXT: vpermt2d %ymm4, %ymm7, %ymm8
; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [3,7,3,3]
-; AVX512-FCP-NEXT: vpermt2d %xmm4, %xmm10, %xmm5
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [3,11,3,3]
+; AVX512-FCP-NEXT: vpermt2d %ymm4, %ymm10, %ymm5
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512-FCP-NEXT: vpermi2d %xmm2, %xmm0, %xmm7
+; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm0, %ymm7
; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512-FCP-NEXT: vpermt2d %xmm2, %xmm10, %xmm0
+; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm0, %ymm10
; AVX512-FCP-NEXT: vmovq %xmm6, (%rsi)
; AVX512-FCP-NEXT: vmovq %xmm8, (%rdx)
; AVX512-FCP-NEXT: vmovq %xmm9, (%rcx)
@@ -546,7 +547,8 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FCP-NEXT: vmovq %xmm1, (%r9)
; AVX512-FCP-NEXT: vmovq %xmm7, (%r11)
; AVX512-FCP-NEXT: vmovq %xmm3, (%r10)
-; AVX512-FCP-NEXT: vmovq %xmm0, (%rax)
+; AVX512-FCP-NEXT: vmovq %xmm10, (%rax)
+; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: load_i16_stride8_vf4:
@@ -564,15 +566,15 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,1,1]
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3]
; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm9 = [3,7,3,3]
-; AVX512DQ-NEXT: vpermt2d %xmm4, %xmm9, %xmm5
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm9 = [3,11,3,3]
+; AVX512DQ-NEXT: vpermt2d %ymm4, %ymm9, %ymm5
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3]
; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512DQ-NEXT: vpermt2d %xmm2, %xmm9, %xmm0
+; AVX512DQ-NEXT: vpermi2d %ymm2, %ymm0, %ymm9
; AVX512DQ-NEXT: vmovq %xmm6, (%rsi)
; AVX512DQ-NEXT: vmovq %xmm7, (%rdx)
; AVX512DQ-NEXT: vmovq %xmm8, (%rcx)
@@ -580,7 +582,8 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-NEXT: vmovq %xmm1, (%r9)
; AVX512DQ-NEXT: vmovq %xmm3, (%r11)
; AVX512DQ-NEXT: vmovq %xmm4, (%r10)
-; AVX512DQ-NEXT: vmovq %xmm0, (%rax)
+; AVX512DQ-NEXT: vmovq %xmm9, (%rax)
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: load_i16_stride8_vf4:
@@ -595,18 +598,18 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [1,5,1,1]
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, %xmm8
-; AVX512DQ-FCP-NEXT: vpermt2d %xmm4, %xmm7, %xmm8
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [1,9,1,1]
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm8
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm4, %ymm7, %ymm8
; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [3,7,3,3]
-; AVX512DQ-FCP-NEXT: vpermt2d %xmm4, %xmm10, %xmm5
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [3,11,3,3]
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm4, %ymm10, %ymm5
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512DQ-FCP-NEXT: vpermi2d %xmm2, %xmm0, %xmm7
+; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm0, %ymm7
; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512DQ-FCP-NEXT: vpermt2d %xmm2, %xmm10, %xmm0
+; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm0, %ymm10
; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%rsi)
; AVX512DQ-FCP-NEXT: vmovq %xmm8, (%rdx)
; AVX512DQ-FCP-NEXT: vmovq %xmm9, (%rcx)
@@ -614,7 +617,8 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%r9)
; AVX512DQ-FCP-NEXT: vmovq %xmm7, (%r11)
; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%r10)
-; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rax)
+; AVX512DQ-FCP-NEXT: vmovq %xmm10, (%rax)
+; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
; AVX512BW-LABEL: load_i16_stride8_vf4:
@@ -1118,254 +1122,256 @@ define void @load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;
; AVX512-LABEL: load_i16_stride8_vf8:
; AVX512: # %bb.0:
+; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,0,0,8]
; AVX512-NEXT: vmovdqa 112(%rdi), %xmm1
; AVX512-NEXT: vmovdqa 96(%rdi), %xmm2
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX512-NEXT: vmovdqa 80(%rdi), %xmm3
; AVX512-NEXT: vmovdqa 64(%rdi), %xmm4
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,0,0,4]
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[2,2,2,2]
; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm6[0,1,2],xmm8[3]
; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm6[2],xmm5[2],xmm6[3],xmm5[3]
-; AVX512-NEXT: vpermt2d %xmm5, %xmm0, %xmm6
-; AVX512-NEXT: vmovdqa (%rdi), %xmm5
-; AVX512-NEXT: vmovdqa 16(%rdi), %xmm10
-; AVX512-NEXT: vmovdqa 32(%rdi), %xmm11
-; AVX512-NEXT: vmovdqa 48(%rdi), %xmm12
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3]
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm15[0,1],xmm6[2,3]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[1,1,1,1]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm13[1],xmm15[2,3]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm7 = xmm15[0,1],xmm7[2,3]
-; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm14[2],xmm13[2],xmm14[3],xmm13[3]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm15[0,1],xmm8[2,3]
-; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm15 = [3,7,0,0]
-; AVX512-NEXT: vpermt2d %xmm13, %xmm15, %xmm14
-; AVX512-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3]
+; AVX512-NEXT: vpermt2d %ymm5, %ymm0, %ymm6
+; AVX512-NEXT: vmovdqa (%rdi), %xmm10
+; AVX512-NEXT: vmovdqa 16(%rdi), %xmm11
+; AVX512-NEXT: vmovdqa 32(%rdi), %xmm12
+; AVX512-NEXT: vmovdqa 48(%rdi), %xmm13
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm15[1,1,1,1]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0],xmm14[1],xmm6[2,3]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3]
+; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm15[2],xmm14[2],xmm15[3],xmm14[3]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3]
+; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm8 = [3,11,0,0]
+; AVX512-NEXT: vpermt2d %ymm14, %ymm8, %ymm15
+; AVX512-NEXT: vpblendd {{.*#+}} xmm9 = xmm15[0,1],xmm9[2,3]
; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; AVX512-NEXT: vpermi2d %xmm1, %xmm2, %xmm0
-; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
-; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7]
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3]
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[1,1,1,1]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0],xmm3[1],xmm10[2,3]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[2,2,2,2]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm10 = xmm2[0,1,2],xmm10[3]
-; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512-NEXT: vpermi2d %ymm1, %ymm2, %ymm0
+; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
+; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm11 = xmm4[1,1,1,1]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0],xmm3[1],xmm11[2,3]
; AVX512-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3]
-; AVX512-NEXT: vpermt2d %xmm3, %xmm15, %xmm4
+; AVX512-NEXT: vpermi2d %ymm3, %ymm4, %ymm8
+; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,2,2,2]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
-; AVX512-NEXT: vmovdqa %xmm6, (%rsi)
-; AVX512-NEXT: vmovdqa %xmm7, (%rdx)
-; AVX512-NEXT: vmovdqa %xmm8, (%rcx)
+; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3]
+; AVX512-NEXT: vmovdqa %xmm5, (%rsi)
+; AVX512-NEXT: vmovdqa %xmm6, (%rdx)
+; AVX512-NEXT: vmovdqa %xmm7, (%rcx)
; AVX512-NEXT: vmovdqa %xmm9, (%r8)
; AVX512-NEXT: vmovdqa %xmm0, (%r9)
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: vmovdqa %xmm5, (%rax)
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: vmovdqa %xmm10, (%rax)
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: vmovdqa %xmm3, (%rax)
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: vmovdqa %xmm1, (%rax)
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: load_i16_stride8_vf8:
; AVX512-FCP: # %bb.0:
-; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm0
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,0,0,8]
+; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm3
; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm4
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm17
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm5
-; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm6
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,0,0,4]
-; AVX512-FCP-NEXT: vmovdqa %xmm11, %xmm0
-; AVX512-FCP-NEXT: vpermt2d %xmm10, %xmm3, %xmm0
-; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm7
-; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm8
-; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm12
-; AVX512-FCP-NEXT: vmovdqa 48(%rdi), %xmm13
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
+; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm8
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3]
+; AVX512-FCP-NEXT: vmovdqa %ymm14, %ymm0
+; AVX512-FCP-NEXT: vpermt2d %ymm12, %ymm2, %ymm0
+; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm6
+; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm9
+; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm10
+; AVX512-FCP-NEXT: vmovdqa 48(%rdi), %xmm11
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3]
+; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm13[0],xmm1[0],xmm13[1],xmm1[1]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0,1],xmm0[2,3]
; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm16
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [1,5,0,0]
-; AVX512-FCP-NEXT: vmovdqa %xmm15, %xmm2
-; AVX512-FCP-NEXT: vpermt2d %xmm14, %xmm9, %xmm2
-; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3]
-; AVX512-FCP-NEXT: vmovdqa %xmm11, %xmm0
-; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm11[2],xmm10[2],xmm11[3],xmm10[3]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,2,6]
-; AVX512-FCP-NEXT: vpermt2d %xmm10, %xmm1, %xmm0
-; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm15[2],xmm14[2],xmm15[3],xmm14[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [3,7,0,0]
-; AVX512-FCP-NEXT: vpermt2d %xmm14, %xmm10, %xmm15
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm15[0,1],xmm11[2,3]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm17, %xmm14
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7]
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
-; AVX512-FCP-NEXT: vpermi2d %xmm4, %xmm5, %xmm3
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
-; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3]
-; AVX512-FCP-NEXT: vpermi2d %xmm6, %xmm7, %xmm9
-; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
-; AVX512-FCP-NEXT: vpermi2d %xmm4, %xmm5, %xmm1
-; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm7[2],xmm6[2],xmm7[3],xmm6[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3]
-; AVX512-FCP-NEXT: vpermt2d %xmm6, %xmm10, %xmm7
-; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [1,9,0,0]
+; AVX512-FCP-NEXT: vmovdqa %ymm13, %ymm0
+; AVX512-FCP-NEXT: vpermt2d %ymm1, %ymm7, %ymm0
+; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm14[0],xmm12[0],xmm14[1],xmm12[1]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,3]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm17
+; AVX512-FCP-NEXT: vmovdqa %ymm14, %ymm15
+; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm14[2],xmm12[2],xmm14[3],xmm12[3]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm14 = [0,0,2,10]
+; AVX512-FCP-NEXT: vpermt2d %ymm12, %ymm14, %ymm15
+; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm12 = xmm13[2],xmm1[2],xmm13[3],xmm1[3]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm15[2,3]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm15 = [3,11,3,3]
+; AVX512-FCP-NEXT: vpermt2d %ymm1, %ymm15, %ymm13
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],xmm0[2,3]
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7]
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7]
+; AVX512-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm2
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7]
+; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
+; AVX512-FCP-NEXT: vpermi2d %ymm3, %ymm4, %ymm7
+; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3]
+; AVX512-FCP-NEXT: vpermi2d %ymm3, %ymm4, %ymm15
+; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm14
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm14[2,3]
+; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3]
; AVX512-FCP-NEXT: vmovdqa64 %xmm16, (%rsi)
-; AVX512-FCP-NEXT: vmovdqa %xmm2, (%rdx)
-; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rcx)
-; AVX512-FCP-NEXT: vmovdqa %xmm11, (%r8)
-; AVX512-FCP-NEXT: vmovdqa %xmm3, (%r9)
+; AVX512-FCP-NEXT: vmovdqa64 %xmm17, (%rdx)
+; AVX512-FCP-NEXT: vmovdqa %xmm12, (%rcx)
+; AVX512-FCP-NEXT: vmovdqa %xmm13, (%r8)
+; AVX512-FCP-NEXT: vmovdqa %xmm2, (%r9)
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT: vmovdqa %xmm8, (%rax)
+; AVX512-FCP-NEXT: vmovdqa %xmm5, (%rax)
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT: vmovdqa %xmm1, (%rax)
+; AVX512-FCP-NEXT: vmovdqa %xmm3, (%rax)
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT: vmovdqa %xmm4, (%rax)
+; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rax)
+; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: load_i16_stride8_vf8:
; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,0,0,8]
; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm1
; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm2
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm3
; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm4
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,0,0,4]
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[2,2,2,2]
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm8 = xmm6[0,1,2],xmm8[3]
; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm6[2],xmm5[2],xmm6[3],xmm5[3]
-; AVX512DQ-NEXT: vpermt2d %xmm5, %xmm0, %xmm6
-; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm5
-; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm10
-; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm11
-; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm12
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3]
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm6 = xmm15[0,1],xmm6[2,3]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[1,1,1,1]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm13[1],xmm15[2,3]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm7 = xmm15[0,1],xmm7[2,3]
-; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm14[2],xmm13[2],xmm14[3],xmm13[3]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm8 = xmm15[0,1],xmm8[2,3]
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm15 = [3,7,0,0]
-; AVX512DQ-NEXT: vpermt2d %xmm13, %xmm15, %xmm14
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3]
+; AVX512DQ-NEXT: vpermt2d %ymm5, %ymm0, %ymm6
+; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm10
+; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm11
+; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm12
+; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm13
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm15[1,1,1,1]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0],xmm14[1],xmm6[2,3]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3]
+; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm15[2],xmm14[2],xmm15[3],xmm14[3]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3]
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm8 = [3,11,0,0]
+; AVX512DQ-NEXT: vpermt2d %ymm14, %ymm8, %ymm15
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm9 = xmm15[0,1],xmm9[2,3]
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; AVX512DQ-NEXT: vpermi2d %xmm1, %xmm2, %xmm0
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7]
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3]
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[1,1,1,1]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0],xmm3[1],xmm10[2,3]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[2,2,2,2]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm10 = xmm2[0,1,2],xmm10[3]
-; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512DQ-NEXT: vpermi2d %ymm1, %ymm2, %ymm0
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7]
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm11 = xmm4[1,1,1,1]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0],xmm3[1],xmm11[2,3]
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3]
-; AVX512DQ-NEXT: vpermt2d %xmm3, %xmm15, %xmm4
+; AVX512DQ-NEXT: vpermi2d %ymm3, %ymm4, %ymm8
+; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,2,2,2]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
-; AVX512DQ-NEXT: vmovdqa %xmm6, (%rsi)
-; AVX512DQ-NEXT: vmovdqa %xmm7, (%rdx)
-; AVX512DQ-NEXT: vmovdqa %xmm8, (%rcx)
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3]
+; AVX512DQ-NEXT: vmovdqa %xmm5, (%rsi)
+; AVX512DQ-NEXT: vmovdqa %xmm6, (%rdx)
+; AVX512DQ-NEXT: vmovdqa %xmm7, (%rcx)
; AVX512DQ-NEXT: vmovdqa %xmm9, (%r8)
; AVX512DQ-NEXT: vmovdqa %xmm0, (%r9)
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-NEXT: vmovdqa %xmm5, (%rax)
-; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT: vmovdqa %xmm10, (%rax)
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-NEXT: vmovdqa %xmm3, (%rax)
+; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT: vmovdqa %xmm1, (%rax)
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: load_i16_stride8_vf8:
; AVX512DQ-FCP: # %bb.0:
-; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm0
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,0,0,8]
+; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm3
; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm4
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm17
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm5
-; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm6
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,0,0,4]
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm11, %xmm0
-; AVX512DQ-FCP-NEXT: vpermt2d %xmm10, %xmm3, %xmm0
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm7
-; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm8
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm12
-; AVX512DQ-FCP-NEXT: vmovdqa 48(%rdi), %xmm13
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
+; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm8
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3]
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm14, %ymm0
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm12, %ymm2, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm6
+; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm9
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm10
+; AVX512DQ-FCP-NEXT: vmovdqa 48(%rdi), %xmm11
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3]
+; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm13[0],xmm1[0],xmm13[1],xmm1[1]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0,1],xmm0[2,3]
; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm16
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [1,5,0,0]
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm15, %xmm2
-; AVX512DQ-FCP-NEXT: vpermt2d %xmm14, %xmm9, %xmm2
-; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3]
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm11, %xmm0
-; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm11[2],xmm10[2],xmm11[3],xmm10[3]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,2,6]
-; AVX512DQ-FCP-NEXT: vpermt2d %xmm10, %xmm1, %xmm0
-; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm15[2],xmm14[2],xmm15[3],xmm14[3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [3,7,0,0]
-; AVX512DQ-FCP-NEXT: vpermt2d %xmm14, %xmm10, %xmm15
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm15[0,1],xmm11[2,3]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm17, %xmm14
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7]
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
-; AVX512DQ-FCP-NEXT: vpermi2d %xmm4, %xmm5, %xmm3
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
-; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3]
-; AVX512DQ-FCP-NEXT: vpermi2d %xmm6, %xmm7, %xmm9
-; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
-; AVX512DQ-FCP-NEXT: vpermi2d %xmm4, %xmm5, %xmm1
-; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm7[2],xmm6[2],xmm7[3],xmm6[3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3]
-; AVX512DQ-FCP-NEXT: vpermt2d %xmm6, %xmm10, %xmm7
-; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [1,9,0,0]
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm13, %ymm0
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm1, %ymm7, %ymm0
+; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm14[0],xmm12[0],xmm14[1],xmm12[1]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,3]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm17
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm14, %ymm15
+; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm14[2],xmm12[2],xmm14[3],xmm12[3]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm14 = [0,0,2,10]
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm12, %ymm14, %ymm15
+; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm12 = xmm13[2],xmm1[2],xmm13[3],xmm1[3]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm15[2,3]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm15 = [3,11,3,3]
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm1, %ymm15, %ymm13
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],xmm0[2,3]
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7]
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7]
+; AVX512DQ-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm2
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7]
+; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
+; AVX512DQ-FCP-NEXT: vpermi2d %ymm3, %ymm4, %ymm7
+; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3]
+; AVX512DQ-FCP-NEXT: vpermi2d %ymm3, %ymm4, %ymm15
+; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512DQ-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm14
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm14[2,3]
+; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3]
; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm16, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rcx)
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm11, (%r8)
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, (%r9)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm17, (%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm12, (%rcx)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm13, (%r8)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, (%r9)
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm8, (%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, (%rax)
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, (%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, (%rax)
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, (%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rax)
+; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
; AVX512BW-LABEL: load_i16_stride8_vf8:
@@ -2477,6 +2483,7 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
;
; AVX512-LABEL: load_i16_stride8_vf16:
; AVX512: # %bb.0:
+; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm7 = [0,0,0,8]
; AVX512-NEXT: vmovdqa 112(%rdi), %xmm0
; AVX512-NEXT: vmovdqa 96(%rdi), %xmm1
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
@@ -2487,164 +2494,164 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-NEXT: vmovdqa64 %xmm1, %xmm26
; AVX512-NEXT: vmovdqa64 %xmm0, %xmm27
-; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm7 = [0,0,0,4]
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm29 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[2,2,2,2]
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3]
; AVX512-NEXT: vmovdqa64 %xmm0, %xmm30
-; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm6[2],xmm5[2],xmm6[3],xmm5[3]
-; AVX512-NEXT: vpermt2d %xmm5, %xmm7, %xmm6
-; AVX512-NEXT: vmovdqa (%rdi), %xmm9
-; AVX512-NEXT: vmovdqa 16(%rdi), %xmm10
+; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; AVX512-NEXT: vpermt2d %ymm5, %ymm7, %ymm6
+; AVX512-NEXT: vmovdqa (%rdi), %xmm10
+; AVX512-NEXT: vmovdqa 16(%rdi), %xmm11
; AVX512-NEXT: vmovdqa 32(%rdi), %xmm12
; AVX512-NEXT: vmovdqa 48(%rdi), %xmm13
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm14[0],xmm4[0],xmm14[1],xmm4[1]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm0[0],xmm8[0],xmm0[1],xmm8[1]
; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3]
-; AVX512-NEXT: vmovdqa64 224(%rdi), %ymm18
-; AVX512-NEXT: vpermq {{.*#+}} ymm17 = ymm18[0,1,0,2]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm17[0,1,2,0,4,5,6,4]
-; AVX512-NEXT: vmovdqa64 192(%rdi), %ymm20
-; AVX512-NEXT: vpermq {{.*#+}} ymm19 = ymm20[0,1,0,2]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm15 = ymm6[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm19[0,1,2,0,4,5,6,4]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm1[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm15[7]
-; AVX512-NEXT: vmovdqa64 160(%rdi), %ymm23
-; AVX512-NEXT: vpermq {{.*#+}} ymm21 = ymm23[0,1,0,2]
+; AVX512-NEXT: vmovdqa64 224(%rdi), %ymm20
+; AVX512-NEXT: vpermq {{.*#+}} ymm18 = ymm20[0,1,0,2]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm18[0,1,2,0,4,5,6,4]
+; AVX512-NEXT: vmovdqa64 192(%rdi), %ymm22
+; AVX512-NEXT: vpermq {{.*#+}} ymm19 = ymm22[0,1,0,2]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm19[0,1,2,0,4,5,6,4]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm14[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm1[7]
+; AVX512-NEXT: vmovdqa64 160(%rdi), %ymm17
+; AVX512-NEXT: vpermq {{.*#+}} ymm23 = ymm17[0,1,0,2]
; AVX512-NEXT: vmovdqa64 128(%rdi), %ymm16
-; AVX512-NEXT: vpermq {{.*#+}} ymm22 = ymm16[0,1,0,2]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm15 = ymm21[0,2,2,3,4,6,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm15[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm22[0,2,2,3,4,6,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm8 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4],ymm2[5],ymm8[6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-NEXT: vmovdqa64 %ymm2, %ymm28
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
+; AVX512-NEXT: vpermq {{.*#+}} ymm21 = ymm16[0,1,0,2]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm15 = ymm23[0,2,2,3,4,6,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm1 = ymm15[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm21[0,2,2,3,4,6,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm9 = ymm4[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4],ymm1[5],ymm9[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-NEXT: vmovdqa64 %ymm1, %ymm28
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm14[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[1,1,1,1]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3]
-; AVX512-NEXT: vmovdqa64 %xmm29, %xmm2
-; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT: vmovdqa64 %ymm0, %ymm29
-; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm17[0,1,1,3,4,5,5,7]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm19[0,1,1,3,4,5,5,7]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm21[3,1,2,3,7,5,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm5 = ymm3[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm8 = ymm22[3,1,2,3,7,5,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm15 = ymm8[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4],ymm5[5],ymm15[6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm14[2],xmm4[2],xmm14[3],xmm4[3]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3]
+; AVX512-NEXT: vmovdqa64 %xmm29, %xmm4
+; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-NEXT: vmovdqa64 %ymm1, %ymm29
+; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[0,1,1,3,4,5,5,7]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm19[0,1,1,3,4,5,5,7]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm5 = ymm4[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5,6],ymm2[7]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm23[3,1,2,3,7,5,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm5 = ymm14[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm15 = ymm21[3,1,2,3,7,5,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm9 = ymm15[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4],ymm5[5],ymm9[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7]
+; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm8[2],xmm0[3],xmm8[3]
; AVX512-NEXT: vmovdqa64 %xmm30, %xmm6
; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-NEXT: vmovdqa64 %ymm1, %ymm19
-; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm17 = [3,7,0,0]
-; AVX512-NEXT: vpermt2d %xmm4, %xmm17, %xmm14
-; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm11[2,3]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm8[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-NEXT: vmovdqa64 %ymm2, %ymm19
+; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm9 = [3,11,0,0]
+; AVX512-NEXT: vpermt2d %ymm8, %ymm9, %ymm0
+; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm4[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm14[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm15[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512-NEXT: vmovdqa64 %xmm24, %xmm0
; AVX512-NEXT: vmovdqa64 %xmm25, %xmm1
; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX512-NEXT: vmovdqa64 %xmm26, %xmm0
; AVX512-NEXT: vmovdqa64 %xmm27, %xmm2
; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; AVX512-NEXT: vpermi2d %xmm1, %xmm2, %xmm7
-; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
-; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7]
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; AVX512-NEXT: vpermi2d %ymm1, %ymm2, %ymm7
+; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
+; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0,1],xmm7[2,3]
-; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm18[0,1,1,3]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm9[0,1,2,0,4,5,6,4]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm10 = ymm7[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
-; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm20[0,1,1,3]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm11[0,1,2,0,4,5,6,4]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm13 = ymm12[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3,4,5,6],ymm10[7]
-; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm23[0,1,1,3]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm13[0,2,2,3,4,6,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm15 = ymm14[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512-NEXT: vpermq {{.*#+}} ymm18 = ymm20[0,1,1,3]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm18[0,1,2,0,4,5,6,4]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm11 = ymm7[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
+; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm22[0,1,1,3]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm13 = ymm12[0,1,2,0,4,5,6,4]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm14 = ymm13[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3,4,5,6],ymm11[7]
+; AVX512-NEXT: vpermq {{.*#+}} ymm14 = ymm17[0,1,1,3]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm15 = ymm14[0,2,2,3,4,6,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm15[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
; AVX512-NEXT: vpermq {{.*#+}} ymm16 = ymm16[0,1,1,3]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm16[0,2,2,3,4,6,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm6 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm15[5],ymm6[6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm10[6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2,3],ymm6[4,5,6,7]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm16[0,2,2,3,4,6,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm6 = ymm10[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5],ymm6[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
; AVX512-NEXT: vpshufhw {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm6 = ymm12[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm6 = ymm13[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm6 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7]
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,1,1]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm9[0,1,1,3,4,5,5,7]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm5 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm11[0,1,1,3,4,5,5,7]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm9 = ymm6[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5,6],ymm5[7]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm13[3,1,2,3,7,5,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm11 = ymm9[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm16[3,1,2,3,7,5,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm13 = ymm12[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4],ymm11[5],ymm13[6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4,5],ymm5[6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[2,2,2,2]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm11 = xmm2[0,1,2],xmm11[3]
-; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm13 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm5[4,5,6,7]
-; AVX512-NEXT: vpermt2d %xmm4, %xmm17, %xmm3
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm6 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm7 = ymm10[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5],ymm7[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[1,1,1,1]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0],xmm3[1],xmm7[2,3]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm18[0,1,1,3,4,5,5,7]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm6 = ymm5[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm12[0,1,1,3,4,5,5,7]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm11 = ymm10[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5,6],ymm6[7]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm11 = ymm14[3,1,2,3,7,5,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm12 = ymm11[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm13 = ymm16[3,1,2,3,7,5,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm14 = ymm13[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3,4,5],ymm6[6,7]
+; AVX512-NEXT: vpermi2d %ymm3, %ymm4, %ymm9
+; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,2,2,2]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm6[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm9[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm12[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm10[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm5 = ymm13[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX512-NEXT: vmovdqa64 %ymm28, (%rsi)
; AVX512-NEXT: vmovdqa64 %ymm29, (%rdx)
; AVX512-NEXT: vmovdqa64 %ymm19, (%rcx)
; AVX512-NEXT: vmovdqa %ymm8, (%r8)
-; AVX512-NEXT: vmovdqa %ymm10, (%r9)
+; AVX512-NEXT: vmovdqa %ymm0, (%r9)
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: vmovdqa %ymm7, (%rax)
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: vmovdqa %ymm5, (%rax)
+; AVX512-NEXT: vmovdqa %ymm3, (%rax)
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: vmovdqa %ymm0, (%rax)
+; AVX512-NEXT: vmovdqa %ymm1, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: load_i16_stride8_vf16:
; AVX512-FCP: # %bb.0:
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [0,0,0,8]
; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm0
; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm1
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
@@ -2652,12 +2659,11 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm27
; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm0
; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm1
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm28
; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm29
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [0,0,0,4]
-; AVX512-FCP-NEXT: vmovdqa %xmm12, %xmm13
-; AVX512-FCP-NEXT: vpermt2d %xmm4, %xmm7, %xmm13
+; AVX512-FCP-NEXT: vmovdqa %ymm13, %ymm12
+; AVX512-FCP-NEXT: vpermt2d %ymm4, %ymm7, %ymm12
; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm9
@@ -2667,143 +2673,143 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm30
; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm31
; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm11[0],xmm5[0],xmm11[1],xmm5[1]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm15 = xmm14[0,1],xmm13[2,3]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
; AVX512-FCP-NEXT: vmovdqa64 224(%rdi), %ymm16
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm18 = ymm16[0,1,0,2]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm18[0,1,2,0,4,5,6,4]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm14 = ymm13[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm19 = ymm16[0,1,0,2]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm19[0,1,2,0,4,5,6,4]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm15 = ymm14[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm17
; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm20 = ymm17[0,1,0,2]
; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm20[0,1,2,0,4,5,6,4]
; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm14[7]
-; AVX512-FCP-NEXT: vmovdqa64 160(%rdi), %ymm19
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm21 = ymm19[0,1,0,2]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm15[7]
+; AVX512-FCP-NEXT: vmovdqa64 160(%rdi), %ymm18
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm21 = ymm18[0,1,0,2]
; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %ymm23
; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm22 = ymm23[0,1,0,2]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm21[0,2,2,3,4,6,6,7]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm14[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm22[0,2,2,3,4,6,6,7]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm6 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4],ymm1[5],ymm6[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm24
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm15 = [1,5,0,0]
-; AVX512-FCP-NEXT: vmovdqa %xmm11, %xmm1
-; AVX512-FCP-NEXT: vpermt2d %xmm5, %xmm15, %xmm1
-; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm12[0],xmm4[0],xmm12[1],xmm4[1]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm21[0,2,2,3,4,6,6,7]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm15[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm22[0,2,2,3,4,6,6,7]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm6 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5],ymm6[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm24
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm12 = [1,9,0,0]
+; AVX512-FCP-NEXT: vmovdqa %ymm11, %ymm0
+; AVX512-FCP-NEXT: vpermt2d %ymm5, %ymm12, %ymm0
+; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm13[0],xmm4[0],xmm13[1],xmm4[1]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm14[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm25
-; AVX512-FCP-NEXT: vmovdqa %xmm12, %xmm0
-; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm12[2],xmm4[2],xmm12[3],xmm4[3]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm12 = [0,0,2,6]
-; AVX512-FCP-NEXT: vpermt2d %xmm4, %xmm12, %xmm0
+; AVX512-FCP-NEXT: vmovdqa %ymm13, %ymm0
+; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm13[2],xmm4[2],xmm13[3],xmm4[3]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm14 = [0,0,2,10]
+; AVX512-FCP-NEXT: vpermt2d %ymm4, %ymm14, %ymm0
; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm11[2],xmm5[2],xmm11[3],xmm5[3]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm18[0,1,1,3,4,5,5,7]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm19[0,1,1,3,4,5,5,7]
; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm20[0,1,1,3,4,5,5,7]
; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm6 = ymm4[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,6],ymm1[7]
; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm21[3,1,2,3,7,5,6,7]
; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm13 = ymm6[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm22[3,1,2,3,7,5,6,7]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm14[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm22[3,1,2,3,7,5,6,7]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm15[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm13[5],ymm8[6,7]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm21
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm18 = [3,7,0,0]
-; AVX512-FCP-NEXT: vpermt2d %xmm5, %xmm18, %xmm11
+; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm20
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm13 = [3,11,3,3]
+; AVX512-FCP-NEXT: vpermt2d %ymm5, %ymm13, %ymm11
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm11[0,1],xmm2[2,3]
; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm14[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm15[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm22
+; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm21
; AVX512-FCP-NEXT: vmovdqa64 %xmm26, %xmm0
; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm1
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX512-FCP-NEXT: vmovdqa64 %xmm28, %xmm0
; AVX512-FCP-NEXT: vmovdqa64 %xmm29, %xmm1
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX512-FCP-NEXT: vpermi2d %xmm3, %xmm4, %xmm7
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7]
+; AVX512-FCP-NEXT: vpermi2d %ymm3, %ymm4, %ymm7
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7]
; AVX512-FCP-NEXT: vmovdqa64 %xmm30, %xmm0
; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm1
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm7[2,3]
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm20 = ymm16[0,1,1,3]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm20[0,1,2,0,4,5,6,4]
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm19 = ymm16[0,1,1,3]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm19[0,1,2,0,4,5,6,4]
; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm9 = ymm7[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm17[0,1,1,3]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm10[0,1,2,0,4,5,6,4]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm13 = ymm11[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3,4,5,6],ymm9[7]
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm19[0,1,1,3]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm13[0,2,2,3,4,6,6,7]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm14[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm17 = ymm17[0,1,1,3]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm17[0,1,2,0,4,5,6,4]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm15 = ymm11[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm15[0,1,2,3,4,5,6],ymm9[7]
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm18[0,1,1,3]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm15[0,2,2,3,4,6,6,7]
; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm16 = ymm23[0,1,1,3]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm16[0,2,2,3,4,6,6,7]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm8[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm9[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-FCP-NEXT: vpermi2d %xmm6, %xmm5, %xmm15
-; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm7 = ymm11[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,6],ymm1[7]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT: vpermi2d %xmm3, %xmm4, %xmm12
-; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm12[2,3]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm20[0,1,1,3,4,5,5,7]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm0[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm16[0,2,2,3,4,6,6,7]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm10[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5],ymm1[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm9[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT: vpermi2d %ymm5, %ymm6, %ymm12
+; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm8 = ymm11[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5,6],ymm7[7]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm10[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm0[5],ymm8[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT: vpermi2d %ymm3, %ymm4, %ymm14
+; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm14[2,3]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm19[0,1,1,3,4,5,5,7]
; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm10[0,1,1,3,4,5,5,7]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm17[0,1,1,3,4,5,5,7]
; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm10 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3,4,5,6],ymm8[7]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm13[3,1,2,3,7,5,6,7]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm11 = ymm10[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm16[3,1,2,3,7,5,6,7]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm13 = ymm12[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4],ymm11[5],ymm13[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3,4,5],ymm8[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7]
-; AVX512-FCP-NEXT: vpermt2d %xmm6, %xmm18, %xmm5
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm15[3,1,2,3,7,5,6,7]
+; AVX512-FCP-NEXT: vpermi2d %ymm5, %ymm6, %ymm13
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm5 = ymm10[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm16[3,1,2,3,7,5,6,7]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm11 = ymm6[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4],ymm5[5],ymm11[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm13[0,1],xmm3[2,3]
; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm9[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm10[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm5 = ymm12[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm5 = ymm6[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
; AVX512-FCP-NEXT: vmovdqa64 %ymm24, (%rsi)
; AVX512-FCP-NEXT: vmovdqa64 %ymm25, (%rdx)
-; AVX512-FCP-NEXT: vmovdqa64 %ymm21, (%rcx)
-; AVX512-FCP-NEXT: vmovdqa64 %ymm22, (%r8)
+; AVX512-FCP-NEXT: vmovdqa64 %ymm20, (%rcx)
+; AVX512-FCP-NEXT: vmovdqa64 %ymm21, (%r8)
; AVX512-FCP-NEXT: vmovdqa %ymm2, (%r9)
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT: vmovdqa %ymm7, (%rax)
@@ -2816,6 +2822,7 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
;
; AVX512DQ-LABEL: load_i16_stride8_vf16:
; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm7 = [0,0,0,8]
; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm0
; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm1
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
@@ -2826,164 +2833,164 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm26
; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm27
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm7 = [0,0,0,4]
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm29 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[2,2,2,2]
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3]
; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm30
-; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm6[2],xmm5[2],xmm6[3],xmm5[3]
-; AVX512DQ-NEXT: vpermt2d %xmm5, %xmm7, %xmm6
-; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm9
-; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm10
+; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; AVX512DQ-NEXT: vpermt2d %ymm5, %ymm7, %ymm6
+; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm10
+; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm11
; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm12
; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm13
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm14[0],xmm4[0],xmm14[1],xmm4[1]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm0[0],xmm8[0],xmm0[1],xmm8[1]
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3]
-; AVX512DQ-NEXT: vmovdqa64 224(%rdi), %ymm18
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm17 = ymm18[0,1,0,2]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm17[0,1,2,0,4,5,6,4]
-; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %ymm20
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm19 = ymm20[0,1,0,2]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm15 = ymm6[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm19[0,1,2,0,4,5,6,4]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm1[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm15[7]
-; AVX512DQ-NEXT: vmovdqa64 160(%rdi), %ymm23
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm21 = ymm23[0,1,0,2]
+; AVX512DQ-NEXT: vmovdqa64 224(%rdi), %ymm20
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm18 = ymm20[0,1,0,2]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm18[0,1,2,0,4,5,6,4]
+; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %ymm22
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm19 = ymm22[0,1,0,2]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm19[0,1,2,0,4,5,6,4]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm14[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-NEXT: vmovdqa64 160(%rdi), %ymm17
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm23 = ymm17[0,1,0,2]
; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %ymm16
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm22 = ymm16[0,1,0,2]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm15 = ymm21[0,2,2,3,4,6,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm15[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm22[0,2,2,3,4,6,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm8 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4],ymm2[5],ymm8[6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm28
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm21 = ymm16[0,1,0,2]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm15 = ymm23[0,2,2,3,4,6,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm1 = ymm15[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm21[0,2,2,3,4,6,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm9 = ymm4[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4],ymm1[5],ymm9[6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm28
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm14[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[1,1,1,1]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3]
-; AVX512DQ-NEXT: vmovdqa64 %xmm29, %xmm2
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm29
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm17[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm19[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm21[3,1,2,3,7,5,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm5 = ymm3[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm8 = ymm22[3,1,2,3,7,5,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm15 = ymm8[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4],ymm5[5],ymm15[6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm14[2],xmm4[2],xmm14[3],xmm4[3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3]
+; AVX512DQ-NEXT: vmovdqa64 %xmm29, %xmm4
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm29
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm19[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm5 = ymm4[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5,6],ymm2[7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm23[3,1,2,3,7,5,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm5 = ymm14[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm15 = ymm21[3,1,2,3,7,5,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm9 = ymm15[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4],ymm5[5],ymm9[6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7]
+; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm8[2],xmm0[3],xmm8[3]
; AVX512DQ-NEXT: vmovdqa64 %xmm30, %xmm6
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm19
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm17 = [3,7,0,0]
-; AVX512DQ-NEXT: vpermt2d %xmm4, %xmm17, %xmm14
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm11[2,3]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm8[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm19
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm9 = [3,11,0,0]
+; AVX512DQ-NEXT: vpermt2d %ymm8, %ymm9, %ymm0
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm4[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm14[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm15[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm0
; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm1
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm0
; AVX512DQ-NEXT: vmovdqa64 %xmm27, %xmm2
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; AVX512DQ-NEXT: vpermi2d %xmm1, %xmm2, %xmm7
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7]
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; AVX512DQ-NEXT: vpermi2d %ymm1, %ymm2, %ymm7
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7]
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0,1],xmm7[2,3]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm18[0,1,1,3]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm9[0,1,2,0,4,5,6,4]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm10 = ymm7[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm20[0,1,1,3]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm11[0,1,2,0,4,5,6,4]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm13 = ymm12[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3,4,5,6],ymm10[7]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm13 = ymm23[0,1,1,3]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm13[0,2,2,3,4,6,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm15 = ymm14[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm18 = ymm20[0,1,1,3]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm18[0,1,2,0,4,5,6,4]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm11 = ymm7[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm22[0,1,1,3]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm13 = ymm12[0,1,2,0,4,5,6,4]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm14 = ymm13[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3,4,5,6],ymm11[7]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm14 = ymm17[0,1,1,3]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm15 = ymm14[0,2,2,3,4,6,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm15[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm16 = ymm16[0,1,1,3]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm16[0,2,2,3,4,6,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm6 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm15[5],ymm6[6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm10[6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm16[0,2,2,3,4,6,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm6 = ymm10[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5],ymm6[6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm6 = ymm12[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm6 = ymm13[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm6 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7]
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,1,1]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm9[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm5 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm11[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm9 = ymm6[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5,6],ymm5[7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm13[3,1,2,3,7,5,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm11 = ymm9[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm16[3,1,2,3,7,5,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm13 = ymm12[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4],ymm11[5],ymm13[6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4,5],ymm5[6,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[2,2,2,2]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm11 = xmm2[0,1,2],xmm11[3]
-; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm13 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm5[4,5,6,7]
-; AVX512DQ-NEXT: vpermt2d %xmm4, %xmm17, %xmm3
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm6 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm7 = ymm10[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5],ymm7[6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[1,1,1,1]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0],xmm3[1],xmm7[2,3]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm18[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm6 = ymm5[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm12[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm11 = ymm10[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5,6],ymm6[7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm11 = ymm14[3,1,2,3,7,5,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm12 = ymm11[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm13 = ymm16[3,1,2,3,7,5,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm14 = ymm13[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3,4,5],ymm6[6,7]
+; AVX512DQ-NEXT: vpermi2d %ymm3, %ymm4, %ymm9
+; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,2,2,2]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm6[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm9[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm12[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm10[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm5 = ymm13[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX512DQ-NEXT: vmovdqa64 %ymm28, (%rsi)
; AVX512DQ-NEXT: vmovdqa64 %ymm29, (%rdx)
; AVX512DQ-NEXT: vmovdqa64 %ymm19, (%rcx)
; AVX512DQ-NEXT: vmovdqa %ymm8, (%r8)
-; AVX512DQ-NEXT: vmovdqa %ymm10, (%r9)
+; AVX512DQ-NEXT: vmovdqa %ymm0, (%r9)
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT: vmovdqa %ymm7, (%rax)
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-NEXT: vmovdqa %ymm5, (%rax)
+; AVX512DQ-NEXT: vmovdqa %ymm3, (%rax)
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-NEXT: vmovdqa %ymm0, (%rax)
+; AVX512DQ-NEXT: vmovdqa %ymm1, (%rax)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: load_i16_stride8_vf16:
; AVX512DQ-FCP: # %bb.0:
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [0,0,0,8]
; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm0
; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm1
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
@@ -2991,12 +2998,11 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm27
; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm0
; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm1
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm28
; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm29
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [0,0,0,4]
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm12, %xmm13
-; AVX512DQ-FCP-NEXT: vpermt2d %xmm4, %xmm7, %xmm13
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm13, %ymm12
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm4, %ymm7, %ymm12
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm9
@@ -3006,143 +3012,143 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm30
; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm31
; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm11[0],xmm5[0],xmm11[1],xmm5[1]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm15 = xmm14[0,1],xmm13[2,3]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
; AVX512DQ-FCP-NEXT: vmovdqa64 224(%rdi), %ymm16
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm18 = ymm16[0,1,0,2]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm18[0,1,2,0,4,5,6,4]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm14 = ymm13[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm19 = ymm16[0,1,0,2]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm19[0,1,2,0,4,5,6,4]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm15 = ymm14[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm17
; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm20 = ymm17[0,1,0,2]
; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm20[0,1,2,0,4,5,6,4]
; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm14[7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 160(%rdi), %ymm19
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm21 = ymm19[0,1,0,2]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm15[7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 160(%rdi), %ymm18
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm21 = ymm18[0,1,0,2]
; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %ymm23
; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm22 = ymm23[0,1,0,2]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm21[0,2,2,3,4,6,6,7]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm14[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm22[0,2,2,3,4,6,6,7]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm6 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4],ymm1[5],ymm6[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm24
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm15 = [1,5,0,0]
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm11, %xmm1
-; AVX512DQ-FCP-NEXT: vpermt2d %xmm5, %xmm15, %xmm1
-; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm12[0],xmm4[0],xmm12[1],xmm4[1]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm21[0,2,2,3,4,6,6,7]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm15[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm22[0,2,2,3,4,6,6,7]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm6 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5],ymm6[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm24
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm12 = [1,9,0,0]
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm11, %ymm0
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm5, %ymm12, %ymm0
+; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm13[0],xmm4[0],xmm13[1],xmm4[1]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm14[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm25
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm12, %xmm0
-; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm12[2],xmm4[2],xmm12[3],xmm4[3]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm12 = [0,0,2,6]
-; AVX512DQ-FCP-NEXT: vpermt2d %xmm4, %xmm12, %xmm0
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm13, %ymm0
+; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm13[2],xmm4[2],xmm13[3],xmm4[3]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm14 = [0,0,2,10]
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm4, %ymm14, %ymm0
; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm11[2],xmm5[2],xmm11[3],xmm5[3]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm18[0,1,1,3,4,5,5,7]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm19[0,1,1,3,4,5,5,7]
; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm20[0,1,1,3,4,5,5,7]
; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm6 = ymm4[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,6],ymm1[7]
; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm21[3,1,2,3,7,5,6,7]
; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm13 = ymm6[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm22[3,1,2,3,7,5,6,7]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm14[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm22[3,1,2,3,7,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm15[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm13[5],ymm8[6,7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm21
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm18 = [3,7,0,0]
-; AVX512DQ-FCP-NEXT: vpermt2d %xmm5, %xmm18, %xmm11
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm20
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm13 = [3,11,3,3]
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm5, %ymm13, %ymm11
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm11[0,1],xmm2[2,3]
; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm14[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm15[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm22
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm21
; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm26, %xmm0
; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm1
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm28, %xmm0
; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm29, %xmm1
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX512DQ-FCP-NEXT: vpermi2d %xmm3, %xmm4, %xmm7
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7]
+; AVX512DQ-FCP-NEXT: vpermi2d %ymm3, %ymm4, %ymm7
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7]
; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm30, %xmm0
; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm1
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm7[2,3]
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm20 = ymm16[0,1,1,3]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm20[0,1,2,0,4,5,6,4]
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm19 = ymm16[0,1,1,3]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm19[0,1,2,0,4,5,6,4]
; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm9 = ymm7[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm17[0,1,1,3]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm10[0,1,2,0,4,5,6,4]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm13 = ymm11[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3,4,5,6],ymm9[7]
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm19[0,1,1,3]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm13[0,2,2,3,4,6,6,7]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm14[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm17 = ymm17[0,1,1,3]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm17[0,1,2,0,4,5,6,4]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm15 = ymm11[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm15[0,1,2,3,4,5,6],ymm9[7]
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm18[0,1,1,3]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm15[0,2,2,3,4,6,6,7]
; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm16 = ymm23[0,1,1,3]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm16[0,2,2,3,4,6,6,7]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm8[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm9[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpermi2d %xmm6, %xmm5, %xmm15
-; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm7 = ymm11[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpermi2d %xmm3, %xmm4, %xmm12
-; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm12[2,3]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm20[0,1,1,3,4,5,5,7]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm0[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm16[0,2,2,3,4,6,6,7]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm10[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5],ymm1[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm9[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpermi2d %ymm5, %ymm6, %ymm12
+; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm8 = ymm11[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5,6],ymm7[7]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm10[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm0[5],ymm8[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpermi2d %ymm3, %ymm4, %ymm14
+; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm14[2,3]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm19[0,1,1,3,4,5,5,7]
; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm10[0,1,1,3,4,5,5,7]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm17[0,1,1,3,4,5,5,7]
; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm10 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3,4,5,6],ymm8[7]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm13[3,1,2,3,7,5,6,7]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm11 = ymm10[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm16[3,1,2,3,7,5,6,7]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm13 = ymm12[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4],ymm11[5],ymm13[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3,4,5],ymm8[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpermt2d %xmm6, %xmm18, %xmm5
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm15[3,1,2,3,7,5,6,7]
+; AVX512DQ-FCP-NEXT: vpermi2d %ymm5, %ymm6, %ymm13
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm5 = ymm10[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm16[3,1,2,3,7,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm11 = ymm6[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4],ymm5[5],ymm11[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm13[0,1],xmm3[2,3]
; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm9[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm10[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm5 = ymm12[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm5 = ymm6[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, (%rsi)
; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, (%rcx)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, (%r8)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, (%rcx)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, (%r8)
; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, (%r9)
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, (%rax)
@@ -5719,80 +5725,80 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-LABEL: load_i16_stride8_vf32:
; AVX512: # %bb.0:
; AVX512-NEXT: subq $616, %rsp # imm = 0x268
+; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,0,0,8]
; AVX512-NEXT: vmovdqa 368(%rdi), %xmm0
; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vmovdqa 352(%rdi), %xmm1
; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-NEXT: vmovdqa 336(%rdi), %xmm0
; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vmovdqa 320(%rdi), %xmm1
-; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,0,4]
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm27 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,2,2,2]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3]
-; AVX512-NEXT: vmovdqa64 %xmm0, %xmm26
-; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm25 = xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX512-NEXT: vmovdqa %xmm5, %xmm0
-; AVX512-NEXT: vpermt2d %xmm3, %xmm1, %xmm0
-; AVX512-NEXT: vmovdqa 304(%rdi), %xmm2
+; AVX512-NEXT: vmovdqa 320(%rdi), %xmm2
; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vmovdqa 288(%rdi), %xmm3
-; AVX512-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512-NEXT: vmovdqa 272(%rdi), %xmm2
-; AVX512-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill
-; AVX512-NEXT: vmovdqa 256(%rdi), %xmm3
-; AVX512-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm7[0],xmm11[0],xmm7[1],xmm11[1]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm26 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,2,2,2]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
+; AVX512-NEXT: vmovdqa64 %xmm0, %xmm25
+; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm24 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512-NEXT: vpermt2d %ymm1, %ymm3, %ymm2
+; AVX512-NEXT: vmovdqa 304(%rdi), %xmm0
+; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vmovdqa 288(%rdi), %xmm1
+; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-NEXT: vmovdqa 272(%rdi), %xmm0
+; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vmovdqa 256(%rdi), %xmm1
+; AVX512-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm7[0],xmm12[0],xmm7[1],xmm12[1]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm16
; AVX512-NEXT: vmovdqa 480(%rdi), %ymm0
; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT: vpermq {{.*#+}} ymm18 = ymm0[0,1,0,2]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm18[0,1,2,0,4,5,6,4]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
-; AVX512-NEXT: vmovdqa64 %ymm2, %ymm21
-; AVX512-NEXT: vmovdqa 448(%rdi), %ymm2
-; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vpermq {{.*#+}} ymm20 = ymm2[0,1,0,2]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm20[0,1,2,0,4,5,6,4]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm12[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[0,1,2,0,4,5,6,4]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
+; AVX512-NEXT: vmovdqa64 %ymm1, %ymm20
+; AVX512-NEXT: vmovdqa 448(%rdi), %ymm1
+; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vpermq {{.*#+}} ymm21 = ymm1[0,1,0,2]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm21[0,1,2,0,4,5,6,4]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm1[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512-NEXT: vmovdqa64 %ymm1, %ymm19
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
-; AVX512-NEXT: vmovdqa 416(%rdi), %ymm2
-; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vpermq {{.*#+}} ymm22 = ymm2[0,1,0,2]
-; AVX512-NEXT: vmovdqa 384(%rdi), %ymm2
-; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vpermq {{.*#+}} ymm29 = ymm2[0,1,0,2]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm22[0,2,2,3,4,6,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm10[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm29[0,2,2,3,4,6,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm8 = ymm9[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512-NEXT: vmovdqa 416(%rdi), %ymm1
+; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vpermq {{.*#+}} ymm28 = ymm1[0,1,0,2]
+; AVX512-NEXT: vmovdqa 384(%rdi), %ymm1
+; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vpermq {{.*#+}} ymm29 = ymm1[0,1,0,2]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm11 = ymm28[0,2,2,3,4,6,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm11[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm29[0,2,2,3,4,6,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm8 = ymm10[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4],ymm2[5],ymm8[6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
; AVX512-NEXT: movb $-64, %al
; AVX512-NEXT: kmovw %eax, %k1
; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm16 {%k1}
-; AVX512-NEXT: vmovdqa 112(%rdi), %xmm2
-; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vmovdqa 96(%rdi), %xmm0
+; AVX512-NEXT: vmovdqa 112(%rdi), %xmm0
; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512-NEXT: vmovdqa 80(%rdi), %xmm3
-; AVX512-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vmovdqa 96(%rdi), %xmm1
+; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-NEXT: vmovdqa 80(%rdi), %xmm1
+; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vmovdqa 64(%rdi), %xmm2
; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,2,2,2]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0,1,2],xmm8[3]
-; AVX512-NEXT: vmovdqa64 %xmm3, %xmm23
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm19 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm24 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; AVX512-NEXT: vpermt2d %xmm0, %xmm1, %xmm2
+; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm8[3]
+; AVX512-NEXT: vmovdqa64 %xmm1, %xmm22
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm17 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm23 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; AVX512-NEXT: vpermt2d %ymm0, %ymm3, %ymm2
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
@@ -5803,27 +5809,27 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm3[0],xmm8[0],xmm3[1],xmm8[1]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm15[0,1],xmm2[2,3]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm3[0],xmm8[0],xmm3[1],xmm8[1]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm14[0,1],xmm2[2,3]
; AVX512-NEXT: vmovdqa 224(%rdi), %ymm0
; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT: vpermq {{.*#+}} ymm31 = ymm0[0,1,0,2]
; AVX512-NEXT: vmovdqa 192(%rdi), %ymm0
; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT: vpermq {{.*#+}} ymm30 = ymm0[0,1,0,2]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm15 = ymm31[0,1,2,0,4,5,6,4]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm31[0,1,2,0,4,5,6,4]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm30[0,1,2,0,4,5,6,4]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm14 = ymm5[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5,6],ymm0[7]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm15 = ymm5[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1,2,3,4,5,6],ymm0[7]
; AVX512-NEXT: vmovdqa 160(%rdi), %ymm0
; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vpermq {{.*#+}} ymm14 = ymm0[0,1,0,2]
-; AVX512-NEXT: vmovdqa64 128(%rdi), %ymm28
-; AVX512-NEXT: vpermq {{.*#+}} ymm17 = ymm28[0,1,0,2]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[0,2,2,3,4,6,6,7]
+; AVX512-NEXT: vpermq {{.*#+}} ymm15 = ymm0[0,1,0,2]
+; AVX512-NEXT: vmovdqa64 128(%rdi), %ymm27
+; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm27[0,1,0,2]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[0,2,2,3,4,6,6,7]
; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm17[0,2,2,3,4,6,6,7]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm9[0,2,2,3,4,6,6,7]
; AVX512-NEXT: vpshuflw {{.*#+}} ymm13 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4],ymm0[5],ymm13[6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
@@ -5831,20 +5837,21 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm16, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[1,1,1,1]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm11[1],xmm0[2,3]
-; AVX512-NEXT: vmovdqa64 %xmm27, %xmm4
+; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3]
+; AVX512-NEXT: vmovdqa64 %xmm26, %xmm4
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3]
-; AVX512-NEXT: vmovdqa64 %ymm21, %ymm4
+; AVX512-NEXT: vmovdqa64 %ymm20, %ymm4
; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm6 = ymm12[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512-NEXT: vmovdqa64 %ymm19, %ymm6
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5,6],ymm4[7]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm6 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5],ymm9[6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm6 = ymm11[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4],ymm6[5],ymm10[6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7]
; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 {%k1}
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm15[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
; AVX512-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7]
; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
@@ -5853,66 +5860,67 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[1,1,1,1]
; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3]
-; AVX512-NEXT: vmovdqa64 %xmm19, %xmm4
+; AVX512-NEXT: vmovdqa64 %xmm17, %xmm4
; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm7[2],xmm11[2],xmm7[3],xmm11[3]
-; AVX512-NEXT: vmovdqa64 %xmm26, %xmm1
+; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm7[2],xmm12[2],xmm7[3],xmm12[3]
+; AVX512-NEXT: vmovdqa64 %xmm25, %xmm1
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[0,1,1,3,4,5,5,7]
; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm20[0,1,1,3,4,5,5,7]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm5 = ymm4[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5,6],ymm2[7]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm22[3,1,2,3,7,5,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm6 = ymm5[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm29[3,1,2,3,7,5,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm10 = ymm9[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4],ymm6[5],ymm10[6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm21[0,1,1,3,4,5,5,7]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm5[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm28[3,1,2,3,7,5,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm4 = ymm6[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm29[3,1,2,3,7,5,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm11 = ymm10[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3,4],ymm4[5],ymm11[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 {%k1}
; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm31[0,1,1,3,4,5,5,7]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm6 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm30[0,1,1,3,4,5,5,7]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm12 = ymm10[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3,4,5,6],ymm6[7]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[3,1,2,3,7,5,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm13 = ymm12[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm17[3,1,2,3,7,5,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm15 = ymm14[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5],ymm15[6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3,4,5],ymm6[6,7]
-; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm13 = xmm3[2],xmm8[2],xmm3[3],xmm8[3]
-; AVX512-NEXT: vmovdqa64 %xmm23, %xmm15
-; AVX512-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7]
-; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm11 = ymm30[0,1,1,3,4,5,5,7]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm13 = ymm11[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3,4,5,6],ymm4[7]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm13 = ymm15[3,1,2,3,7,5,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm14 = ymm13[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[3,1,2,3,7,5,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm15 = ymm9[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5],ymm4[6,7]
+; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm14 = xmm3[2],xmm8[2],xmm3[3],xmm8[3]
+; AVX512-NEXT: vmovdqa64 %xmm22, %xmm15
+; AVX512-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0,1],xmm15[2,3]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm4[4,5,6,7]
+; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm6 = [3,7,0,0]
-; AVX512-NEXT: vpermt2d %xmm11, %xmm6, %xmm7
-; AVX512-NEXT: vmovdqa64 %xmm25, %xmm0
+; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [3,11,0,0]
+; AVX512-NEXT: vpermt2d %ymm12, %ymm0, %ymm7
+; AVX512-NEXT: vmovdqa64 %ymm0, %ymm30
+; AVX512-NEXT: vmovdqa64 %xmm24, %xmm0
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0,1],xmm0[2,3]
; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm4 = ymm5[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm5 = ymm9[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm5 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm6 = ymm10[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7]
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
-; AVX512-NEXT: vpermt2d %xmm8, %xmm6, %xmm3
-; AVX512-NEXT: vmovdqa64 %xmm24, %xmm1
+; AVX512-NEXT: vpermt2d %ymm8, %ymm30, %ymm3
+; AVX512-NEXT: vmovdqa64 %xmm23, %xmm1
; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm10[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm11[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm12[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm4 = ymm14[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm13[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm5 = ymm9[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
@@ -5925,167 +5933,166 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,2,2]
; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3]
-; AVX512-NEXT: vmovdqa64 %xmm2, %xmm30
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm22 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm31 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-NEXT: vmovdqa %xmm1, %xmm2
-; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,0,4]
-; AVX512-NEXT: vpermt2d %xmm0, %xmm1, %xmm2
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm5 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512-NEXT: vmovdqa64 %xmm2, %xmm31
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm18 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm21 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,0,0,8]
+; AVX512-NEXT: vpermt2d %ymm0, %ymm3, %ymm1
; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-NEXT: vpunpckhwd (%rsp), %xmm0, %xmm4 # 16-byte Folded Reload
+; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
; AVX512-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; AVX512-NEXT: vmovdqa64 %xmm5, %xmm20
-; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
-; AVX512-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm12
-; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload
-; AVX512-NEXT: # ymm19 = mem[0,1,1,3]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm19[0,1,2,0,4,5,6,4]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
-; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload
-; AVX512-NEXT: # ymm21 = mem[0,1,1,3]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm11 = ymm21[0,1,2,0,4,5,6,4]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm11[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
+; AVX512-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
+; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm6 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
+; AVX512-NEXT: vmovdqa64 %ymm4, %ymm23
+; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX512-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm11
+; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm28 # 32-byte Folded Reload
+; AVX512-NEXT: # ymm28 = mem[0,1,1,3]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm28[0,1,2,0,4,5,6,4]
; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm29 # 32-byte Folded Reload
; AVX512-NEXT: # ymm29 = mem[0,1,1,3]
-; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload
-; AVX512-NEXT: # ymm23 = mem[0,1,1,3]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm29[0,2,2,3,4,6,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm13 = ymm23[0,2,2,3,4,6,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm5 = ymm13[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
-; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm12 {%k1}
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm8 = ymm29[0,1,2,0,4,5,6,4]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm7 = ymm8[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,6],ymm1[7]
+; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Folded Reload
+; AVX512-NEXT: # ymm20 = mem[0,1,1,3]
+; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload
+; AVX512-NEXT: # ymm19 = mem[0,1,1,3]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm13 = ymm20[0,2,2,3,4,6,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm7 = ymm13[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm19[0,2,2,3,4,6,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm9 = ymm14[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5],ymm9[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7]
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm11 {%k1}
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
; AVX512-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512-NEXT: vmovdqa %xmm1, %xmm0
-; AVX512-NEXT: vpermi2d %xmm2, %xmm3, %xmm0
-; AVX512-NEXT: vmovdqa64 %xmm3, %xmm16
-; AVX512-NEXT: vmovdqa64 %xmm2, %xmm18
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm8 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm8 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm5 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm5[0],xmm8[0],xmm5[1],xmm8[1]
+; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm9 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512-NEXT: vmovdqa %ymm3, %ymm0
+; AVX512-NEXT: vpermi2d %ymm1, %ymm2, %ymm0
+; AVX512-NEXT: vmovdqa64 %ymm2, %ymm17
+; AVX512-NEXT: vmovdqa64 %ymm1, %ymm16
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm10 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm10 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
; AVX512-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0,1],xmm0[2,3]
-; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload
-; AVX512-NEXT: # ymm17 = mem[0,1,1,3]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm17[0,1,2,0,4,5,6,4]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload
; AVX512-NEXT: # ymm24 = mem[0,1,1,3]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm24[0,1,2,0,4,5,6,4]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm9 = ymm3[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5,6],ymm0[7]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm24[0,1,2,0,4,5,6,4]
; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload
; AVX512-NEXT: # ymm25 = mem[0,1,1,3]
-; AVX512-NEXT: vpermq {{.*#+}} ymm26 = ymm28[0,1,1,3]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm25[0,2,2,3,4,6,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm9[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm26[0,2,2,3,4,6,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm6 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5],ymm6[6,7]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm25[0,1,2,0,4,5,6,4]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm5 = ymm3[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5,6],ymm0[7]
+; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload
+; AVX512-NEXT: # ymm26 = mem[0,1,1,3]
+; AVX512-NEXT: vpermq {{.*#+}} ymm27 = ymm27[0,1,1,3]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm26[0,2,2,3,4,6,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm5[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm27[0,2,2,3,4,6,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm7 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm0[5],ymm7[6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm27
-; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[1,1,1,1]
-; AVX512-NEXT: vmovdqa64 %xmm20, %xmm12
-; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3]
-; AVX512-NEXT: vmovdqa64 %xmm22, %xmm2
+; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm22
+; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[1,1,1,1]
+; AVX512-NEXT: vmovdqa64 %ymm23, %ymm11
+; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm11[1],xmm0[2,3]
+; AVX512-NEXT: vmovdqa64 %xmm18, %xmm2
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm6 = ymm11[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,6],ymm2[7]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm6 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm7 = ymm13[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5],ymm7[6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm12[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm7 = ymm8[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5,6],ymm2[7]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm7 = ymm13[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm8 = ymm14[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5],ymm2[6,7]
; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 {%k1}
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm5[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
; AVX512-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; AVX512-NEXT: vmovdqa64 %xmm16, %xmm9
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm16[0],xmm18[0],xmm16[1],xmm18[1]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[1,1,1,1]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2,3]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm17[0],xmm16[0],xmm17[1],xmm16[1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[1,1,1,1]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm9[1],xmm3[2,3]
; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm20
-; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm4[2],xmm12[2],xmm4[3],xmm12[3]
-; AVX512-NEXT: vmovdqa64 %xmm12, %xmm16
-; AVX512-NEXT: vmovdqa64 %xmm30, %xmm1
+; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm23
+; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm6[2],xmm11[2],xmm6[3],xmm11[3]
+; AVX512-NEXT: vmovdqa %ymm11, %ymm12
+; AVX512-NEXT: vmovdqa64 %xmm31, %xmm1
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm1
-; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm19[0,1,1,3,4,5,5,7]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm21[0,1,1,3,4,5,5,7]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm6 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5,6],ymm0[7]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm29[3,1,2,3,7,5,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm7 = ymm6[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm23[3,1,2,3,7,5,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm11 = ymm10[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3,4],ymm7[5],ymm11[6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm28[0,1,1,3,4,5,5,7]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm11 = ymm29[0,1,1,3,4,5,5,7]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm11[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm20[3,1,2,3,7,5,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm14[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm19[3,1,2,3,7,5,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm4 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm1 {%k1}
-; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm17[0,1,1,3,4,5,5,7]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm11 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm24[0,1,1,3,4,5,5,7]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm13 = ymm7[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4,5,6],ymm11[7]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm13 = ymm25[3,1,2,3,7,5,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm14 = ymm13[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm8 = ymm24[0,1,1,3,4,5,5,7]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm13 = ymm25[0,1,1,3,4,5,5,7]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm13[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm0[7]
; AVX512-NEXT: vpshufd {{.*#+}} ymm15 = ymm26[3,1,2,3,7,5,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm12 = ymm15[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm14[5],ymm12[6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm12 = xmm18[2,2,2,2]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm12 = xmm9[0,1,2],xmm12[3]
-; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm14 = xmm5[2],xmm8[2],xmm5[3],xmm8[3]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7]
-; AVX512-NEXT: vinserti64x4 $0, %ymm11, %zmm1, %zmm1
-; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm12 = [3,7,0,0]
-; AVX512-NEXT: vpermt2d %xmm16, %xmm12, %xmm4
-; AVX512-NEXT: vmovdqa64 %xmm31, %xmm11
-; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm11[2,3]
-; AVX512-NEXT: vinserti32x4 $2, %xmm4, %zmm0, %zmm4
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm6 = ymm10[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5],ymm6[6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
-; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm4 {%k1}
-; AVX512-NEXT: vpermt2d %xmm8, %xmm12, %xmm5
-; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm9[2],xmm18[2],xmm9[3],xmm18[3]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm7[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm13[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm5 = ymm15[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm4 = ymm15[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm27[3,1,2,3,7,5,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm7 = ymm0[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4],ymm4[5],ymm7[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
+; AVX512-NEXT: vpermt2d %ymm12, %ymm30, %ymm6
+; AVX512-NEXT: vpermi2d %ymm9, %ymm10, %ymm30
+; AVX512-NEXT: vmovdqa64 %ymm30, %ymm12
+; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm10[2],xmm9[2],xmm10[3],xmm9[3]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm16[2,2,2,2]
+; AVX512-NEXT: vmovdqa64 %ymm17, %ymm9
+; AVX512-NEXT: vpblendd {{.*#+}} xmm7 = xmm9[0,1,2],xmm7[3]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm7[2,3]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm1
+; AVX512-NEXT: vmovdqa64 %xmm21, %xmm3
+; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm6[0,1],xmm3[2,3]
+; AVX512-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm3
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm5 = ymm11[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm5 = ymm14[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5],ymm2[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
+; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm3 {%k1}
+; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm9[2],xmm16[2],xmm9[3],xmm16[3]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm12[0,1],xmm2[2,3]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm5 = ymm13[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm5 = ymm15[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512-NEXT: vmovaps %zmm2, (%rsi)
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
@@ -6094,9 +6101,9 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vmovaps %zmm2, (%rcx)
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512-NEXT: vmovaps %zmm2, (%r8)
-; AVX512-NEXT: vmovdqa64 %zmm27, (%r9)
+; AVX512-NEXT: vmovdqa64 %zmm22, (%r9)
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: vmovdqa64 %zmm20, (%rax)
+; AVX512-NEXT: vmovdqa64 %zmm23, (%rax)
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: vmovdqa64 %zmm1, (%rax)
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
@@ -6107,42 +6114,42 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
;
; AVX512-FCP-LABEL: load_i16_stride8_vf32:
; AVX512-FCP: # %bb.0:
-; AVX512-FCP-NEXT: subq $552, %rsp # imm = 0x228
+; AVX512-FCP-NEXT: subq $424, %rsp # imm = 0x1A8
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,0,8]
; AVX512-FCP-NEXT: vmovdqa 368(%rdi), %xmm0
; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vmovdqa 352(%rdi), %xmm1
-; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-FCP-NEXT: vmovdqa 352(%rdi), %xmm2
+; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; AVX512-FCP-NEXT: vmovdqa 336(%rdi), %xmm0
; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vmovdqa 320(%rdi), %xmm1
-; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,0,4]
-; AVX512-FCP-NEXT: vmovdqa %xmm3, %xmm0
-; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm16
-; AVX512-FCP-NEXT: vpermt2d %xmm2, %xmm1, %xmm0
-; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm22
-; AVX512-FCP-NEXT: vmovdqa %xmm1, %xmm2
+; AVX512-FCP-NEXT: vmovdqa 320(%rdi), %xmm2
+; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; AVX512-FCP-NEXT: vmovdqa %ymm2, %ymm0
+; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm23
+; AVX512-FCP-NEXT: vpermt2d %ymm3, %ymm1, %ymm0
+; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm22
+; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm3
; AVX512-FCP-NEXT: vmovdqa 304(%rdi), %xmm1
; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %xmm3
-; AVX512-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %xmm2
+; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX512-FCP-NEXT: vmovdqa 272(%rdi), %xmm1
; AVX512-FCP-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill
-; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %xmm3
-; AVX512-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm4[0],xmm9[1],xmm4[1]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm20
+; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %xmm2
+; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm11[0],xmm4[0],xmm11[1],xmm4[1]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm19
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm11
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm8
; AVX512-FCP-NEXT: vmovdqa 480(%rdi), %ymm0
; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm21 = ymm0[0,1,0,2]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm21[0,1,2,0,4,5,6,4]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm21[0,1,2,0,4,5,6,4]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
; AVX512-FCP-NEXT: vmovdqa 448(%rdi), %ymm1
; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm27 = ymm1[0,1,0,2]
@@ -6155,28 +6162,28 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-FCP-NEXT: vmovdqa 384(%rdi), %ymm1
; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm29 = ymm1[0,1,0,2]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm28[0,2,2,3,4,6,6,7]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm8[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm29[0,2,2,3,4,6,6,7]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5],ymm3[6,7]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm28[0,2,2,3,4,6,6,7]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm9[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm29[0,2,2,3,4,6,6,7]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm12[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX512-FCP-NEXT: movb $-64, %al
; AVX512-FCP-NEXT: kmovw %eax, %k1
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm11 {%k1}
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm8 {%k1}
; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm0
; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm1
; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm0
; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm1
; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-FCP-NEXT: vmovdqa %xmm13, %xmm0
-; AVX512-FCP-NEXT: vpermt2d %xmm3, %xmm2, %xmm0
-; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm25
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-FCP-NEXT: vmovdqa %ymm14, %ymm0
+; AVX512-FCP-NEXT: vpermt2d %ymm2, %ymm3, %ymm0
+; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm20
; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm1
; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm2
@@ -6186,384 +6193,384 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-FCP-NEXT: vmovdqa 48(%rdi), %xmm4
; AVX512-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm17
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0,1],xmm0[2,3]
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm3[0],xmm7[1],xmm3[1]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm16
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0,1],xmm0[2,3]
; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %ymm0
; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm30 = ymm0[0,1,0,2]
-; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm23
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm31 = ymm23[0,1,0,2]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm30[0,1,2,0,4,5,6,4]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm31[0,1,2,0,4,5,6,4]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm14 = ymm3[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,6],ymm0[7]
-; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm0
-; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm19 = ymm0[0,1,0,2]
-; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm0
-; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm18 = ymm0[0,1,0,2]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm19[0,2,2,3,4,6,6,7]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm14[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm18[0,2,2,3,4,6,6,7]
+; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm24
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm31 = ymm24[0,1,0,2]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm30[0,1,2,0,4,5,6,4]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm31[0,1,2,0,4,5,6,4]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm0[7]
+; AVX512-FCP-NEXT: vmovdqa64 160(%rdi), %ymm25
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm18 = ymm25[0,1,0,2]
+; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %ymm26
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm17 = ymm26[0,1,0,2]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm18[0,2,2,3,4,6,6,7]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm3[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm17[0,2,2,3,4,6,6,7]
; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm11 = [1,5,0,0]
-; AVX512-FCP-NEXT: vmovdqa %xmm9, %xmm0
-; AVX512-FCP-NEXT: vpermt2d %xmm20, %xmm11, %xmm0
-; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm16[0],xmm22[0],xmm16[1],xmm22[1]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [1,9,0,0]
+; AVX512-FCP-NEXT: vmovdqa %ymm11, %ymm0
+; AVX512-FCP-NEXT: vpermt2d %ymm19, %ymm1, %ymm0
+; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm15
+; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm23[0],xmm22[0],xmm23[1],xmm22[1]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm5 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5],ymm7[6,7]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm6 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,6],ymm1[7]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm6 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm12[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5],ymm8[6,7]
; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
-; AVX512-FCP-NEXT: vmovdqa %xmm6, %xmm1
-; AVX512-FCP-NEXT: vpermt2d %xmm17, %xmm11, %xmm1
-; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm13[0],xmm25[0],xmm13[1],xmm25[1]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-FCP-NEXT: vmovdqa %ymm7, %ymm1
+; AVX512-FCP-NEXT: vpermt2d %ymm16, %ymm15, %ymm1
+; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm14[0],xmm20[0],xmm14[1],xmm20[1]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %xmm16, %xmm2
-; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm16 = xmm16[2],xmm22[2],xmm16[3],xmm22[3]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,0,2,6]
-; AVX512-FCP-NEXT: vpermt2d %xmm22, %xmm0, %xmm2
-; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm9[2],xmm20[2],xmm9[3],xmm20[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm3
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm21[0,1,1,3,4,5,5,7]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm27[0,1,1,3,4,5,5,7]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm7 = ymm4[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm5[7]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm28[3,1,2,3,7,5,6,7]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm10 = ymm5[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm29[3,1,2,3,7,5,6,7]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm12 = ymm8[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4],ymm10[5],ymm12[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm3 {%k1}
-; AVX512-FCP-NEXT: vmovdqa %xmm13, %xmm7
-; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm13[2],xmm25[2],xmm13[3],xmm25[3]
-; AVX512-FCP-NEXT: vpermt2d %xmm25, %xmm0, %xmm7
-; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm12 = xmm6[2],xmm17[2],xmm6[3],xmm17[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm12[0,1],xmm7[2,3]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm30[0,1,1,3,4,5,5,7]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm13 = ymm12[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm31[0,1,1,3,4,5,5,7]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm15 = ymm14[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm19[3,1,2,3,7,5,6,7]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm15[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[3,1,2,3,7,5,6,7]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm11 = ymm1[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm0[5],ymm11[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm13[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
-; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm27 = [3,7,0,0]
-; AVX512-FCP-NEXT: vpermt2d %xmm20, %xmm27, %xmm9
-; AVX512-FCP-NEXT: vmovdqa64 %xmm16, %xmm0
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1],xmm0[2,3]
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm5[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm8[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm0
+; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm2
+; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm22 = xmm23[2],xmm2[2],xmm23[3],xmm2[3]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,2,10]
+; AVX512-FCP-NEXT: vpermt2d %ymm2, %ymm1, %ymm0
+; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm9
+; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm11[2],xmm19[2],xmm11[3],xmm19[3]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm2
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm21[0,1,1,3,4,5,5,7]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm27[0,1,1,3,4,5,5,7]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,6],ymm3[7]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm28[3,1,2,3,7,5,6,7]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm5 = ymm6[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm29[3,1,2,3,7,5,6,7]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm10[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4],ymm5[5],ymm8[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm2 {%k1}
+; AVX512-FCP-NEXT: vmovdqa %ymm14, %ymm3
+; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm14[2],xmm20[2],xmm14[3],xmm20[3]
+; AVX512-FCP-NEXT: vpermt2d %ymm20, %ymm9, %ymm3
+; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm7[2],xmm16[2],xmm7[3],xmm16[3]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm30[0,1,1,3,4,5,5,7]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm31[0,1,1,3,4,5,5,7]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm14 = ymm13[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3,4,5,6],ymm8[7]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm18[3,1,2,3,7,5,6,7]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm14[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm17[3,1,2,3,7,5,6,7]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm12 = ymm0[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm15[5],ymm12[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5],ymm8[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm31
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [3,11,3,3]
+; AVX512-FCP-NEXT: vpermt2d %ymm19, %ymm2, %ymm11
+; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm30
+; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm2
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm11[0,1],xmm2[2,3]
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm10[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 {%k1}
-; AVX512-FCP-NEXT: vpermt2d %xmm17, %xmm27, %xmm6
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1],xmm10[2,3]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm12[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm14[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1}
+; AVX512-FCP-NEXT: vpermt2d %ymm16, %ymm30, %ymm7
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0,1],xmm5[2,3]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm13[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm15[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm28
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm14[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm27
+; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
+; AVX512-FCP-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload
+; AVX512-FCP-NEXT: # xmm8 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
; AVX512-FCP-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload
-; AVX512-FCP-NEXT: # xmm12 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512-FCP-NEXT: vpunpckhwd (%rsp), %xmm0, %xmm6 # 16-byte Folded Reload
+; AVX512-FCP-NEXT: # xmm6 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512-FCP-NEXT: vmovdqa %ymm8, %ymm0
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,0,0,8]
+; AVX512-FCP-NEXT: vpermt2d %ymm1, %ymm2, %ymm0
+; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm18
+; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm17
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm29
+; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: # ymm23 = mem[0,1,1,3]
+; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: # ymm22 = mem[0,1,1,3]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm23[0,1,2,0,4,5,6,4]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm22[0,1,2,0,4,5,6,4]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm13[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
+; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: # ymm21 = mem[0,1,1,3]
+; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: # ymm20 = mem[0,1,1,3]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm21[0,2,2,3,4,6,6,7]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm14[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm20[0,2,2,3,4,6,6,7]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm5 = ymm15[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4],ymm1[5],ymm5[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm29 {%k1}
; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
-; AVX512-FCP-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
+; AVX512-FCP-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-FCP-NEXT: vpunpckhwd (%rsp), %xmm0, %xmm5 # 16-byte Folded Reload
+; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload
; AVX512-FCP-NEXT: # xmm5 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512-FCP-NEXT: vmovdqa %xmm12, %xmm1
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,0,0,4]
-; AVX512-FCP-NEXT: vpermt2d %xmm3, %xmm0, %xmm1
-; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm24
-; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm18
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm31
-; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm29 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm29 = mem[0,1,1,3]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm29[0,1,2,0,4,5,6,4]
-; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm26 = mem[0,1,1,3]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm26[0,1,2,0,4,5,6,4]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm6[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload
+; AVX512-FCP-NEXT: # xmm10 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload
+; AVX512-FCP-NEXT: # xmm11 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512-FCP-NEXT: vmovdqa %ymm2, %ymm1
+; AVX512-FCP-NEXT: vpermi2d %ymm3, %ymm5, %ymm1
+; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm16
+; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm19
+; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],xmm1[2,3]
+; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm28 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: # ymm28 = mem[0,1,1,3]
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm24 = ymm24[0,1,1,3]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm28[0,1,2,0,4,5,6,4]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm24[0,1,2,0,4,5,6,4]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm7 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5,6],ymm0[7]
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm25 = ymm25[0,1,1,3]
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm26 = ymm26[0,1,1,3]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm25[0,2,2,3,4,6,6,7]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm12 = ymm7[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm26[0,2,2,3,4,6,6,7]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm9 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm12[5],ymm9[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm29
+; AVX512-FCP-NEXT: vmovdqa %ymm6, %ymm0
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm12 = [1,9,0,0]
+; AVX512-FCP-NEXT: vpermt2d %ymm17, %ymm12, %ymm0
+; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm8[0],xmm18[0],xmm8[1],xmm18[1]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm13[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
-; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm30 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm30 = mem[0,1,1,3]
-; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm25 = mem[0,1,1,3]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm30[0,2,2,3,4,6,6,7]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm8[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm25[0,2,2,3,4,6,6,7]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm13[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4],ymm4[5],ymm7[6,7]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm9 = ymm15[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4],ymm4[5],ymm9[6,7]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm31 {%k1}
-; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload
-; AVX512-FCP-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
-; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload
-; AVX512-FCP-NEXT: # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
-; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload
-; AVX512-FCP-NEXT: # xmm4 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
-; AVX512-FCP-NEXT: vpermi2d %xmm2, %xmm3, %xmm0
-; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm17
-; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm22
-; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm7 # 16-byte Folded Reload
-; AVX512-FCP-NEXT: # xmm7 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
-; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm16
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm10[0,1],xmm0[2,3]
-; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm20 = mem[0,1,1,3]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm20[0,1,2,0,4,5,6,4]
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm23 = ymm23[0,1,1,3]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm23[0,1,2,0,4,5,6,4]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm15 = ymm14[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7]
-; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm19 = mem[0,1,1,3]
-; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm21 = mem[0,1,1,3]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm19[0,2,2,3,4,6,6,7]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm15[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm21[0,2,2,3,4,6,6,7]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5],ymm3[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm31, %zmm31
-; AVX512-FCP-NEXT: vmovdqa %xmm5, %xmm0
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [1,5,0,0]
-; AVX512-FCP-NEXT: vpermt2d %xmm18, %xmm4, %xmm0
-; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm12[0],xmm24[0],xmm12[1],xmm24[1]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm6[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm6 = ymm13[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5],ymm6[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
-; AVX512-FCP-NEXT: vmovdqa %xmm4, %xmm3
-; AVX512-FCP-NEXT: vpermi2d %xmm16, %xmm7, %xmm3
-; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm17[0],xmm22[0],xmm17[1],xmm22[1]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm3[0,1],xmm1[2,3]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm14[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-FCP-NEXT: vmovdqa %xmm12, %xmm8
-; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm2
-; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm12[2],xmm24[2],xmm12[3],xmm24[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm9[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm24
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,0,2,6]
-; AVX512-FCP-NEXT: vpermt2d %xmm2, %xmm0, %xmm8
-; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm5[2],xmm18[2],xmm5[3],xmm18[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3]
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm29[0,1,1,3,4,5,5,7]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm26[0,1,1,3,4,5,5,7]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm8 = ymm6[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5,6],ymm3[7]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm30[3,1,2,3,7,5,6,7]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm9 = ymm8[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm25[3,1,2,3,7,5,6,7]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm12 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4],ymm9[5],ymm12[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm3[6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm1 {%k1}
-; AVX512-FCP-NEXT: vpermi2d %xmm22, %xmm17, %xmm0
-; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm7[2],xmm16[2],xmm7[3],xmm16[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm20[0,1,1,3,4,5,5,7]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm12 = ymm9[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm23[0,1,1,3,4,5,5,7]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm14 = ymm13[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5,6],ymm12[7]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm19[3,1,2,3,7,5,6,7]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm14[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm21[3,1,2,3,7,5,6,7]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm0[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm15[5],ymm4[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm12[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm1
-; AVX512-FCP-NEXT: vpermt2d %xmm18, %xmm27, %xmm5
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm10[2,3]
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm4
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm4 {%k1}
+; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm3
+; AVX512-FCP-NEXT: vpermi2d %ymm10, %ymm11, %ymm3
+; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm16[0],xmm19[0],xmm16[1],xmm19[1]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1,2,3,4,5],ymm2[6,7]
+; AVX512-FCP-NEXT: vmovdqa %ymm8, %ymm13
+; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm1
+; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm8[2],xmm1[2],xmm8[3],xmm1[3]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm18
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm14 = [0,0,2,10]
+; AVX512-FCP-NEXT: vpermt2d %ymm1, %ymm14, %ymm13
+; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm6[2],xmm17[2],xmm6[3],xmm17[3]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],xmm13[2,3]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm23[0,1,1,3,4,5,5,7]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm22[0,1,1,3,4,5,5,7]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm0[7]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm21[3,1,2,3,7,5,6,7]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm5[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm20[3,1,2,3,7,5,6,7]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm12 = ymm0[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3,4],ymm8[5],ymm12[6,7]
; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm3
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm3 {%k1}
+; AVX512-FCP-NEXT: vmovdqa %ymm14, %ymm8
+; AVX512-FCP-NEXT: vpermi2d %ymm19, %ymm16, %ymm8
+; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm11[2],xmm10[2],xmm11[3],xmm10[3]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm4[0,1],xmm8[2,3]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm28[0,1,1,3,4,5,5,7]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm12[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm24[0,1,1,3,4,5,5,7]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm14 = ymm13[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6],ymm4[7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm1
+; AVX512-FCP-NEXT: vpermt2d %ymm17, %ymm30, %ymm6
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm25[3,1,2,3,7,5,6,7]
+; AVX512-FCP-NEXT: vpermi2d %ymm10, %ymm11, %ymm1
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm10 = ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm26[3,1,2,3,7,5,6,7]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3,4],ymm10[5],ymm15[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm14[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm3, %zmm3
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3]
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm6
; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm6[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm8[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm7 = ymm9[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5,6],ymm2[7]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm6 {%k1}
+; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm16[2],xmm19[2],xmm16[3],xmm19[3]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm12[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm13[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5,6],ymm2[7]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm5 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm3 {%k1}
-; AVX512-FCP-NEXT: vpermt2d %xmm16, %xmm27, %xmm7
-; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm17[2],xmm22[2],xmm17[3],xmm22[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm7[0,1],xmm2[2,3]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm13[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm5 = ymm14[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0
; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512-FCP-NEXT: vmovaps %zmm2, (%rsi)
; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512-FCP-NEXT: vmovaps %zmm2, (%rdx)
-; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovaps %zmm2, (%rcx)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm28, (%r8)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm31, (%r9)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm31, (%rcx)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm27, (%r8)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm29, (%r9)
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT: vmovdqa64 %zmm24, (%rax)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm18, (%rax)
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%rax)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm3, (%rax)
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
-; AVX512-FCP-NEXT: addq $552, %rsp # imm = 0x228
+; AVX512-FCP-NEXT: addq $424, %rsp # imm = 0x1A8
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: load_i16_stride8_vf32:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: subq $616, %rsp # imm = 0x268
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,0,0,8]
; AVX512DQ-NEXT: vmovdqa 368(%rdi), %xmm0
; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-NEXT: vmovdqa 352(%rdi), %xmm1
; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512DQ-NEXT: vmovdqa 336(%rdi), %xmm0
; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT: vmovdqa 320(%rdi), %xmm1
-; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,0,4]
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm27 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,2,2,2]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3]
-; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm26
-; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm25 = xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX512DQ-NEXT: vmovdqa %xmm5, %xmm0
-; AVX512DQ-NEXT: vpermt2d %xmm3, %xmm1, %xmm0
-; AVX512DQ-NEXT: vmovdqa 304(%rdi), %xmm2
+; AVX512DQ-NEXT: vmovdqa 320(%rdi), %xmm2
; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT: vmovdqa 288(%rdi), %xmm3
-; AVX512DQ-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512DQ-NEXT: vmovdqa 272(%rdi), %xmm2
-; AVX512DQ-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill
-; AVX512DQ-NEXT: vmovdqa 256(%rdi), %xmm3
-; AVX512DQ-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm7[0],xmm11[0],xmm7[1],xmm11[1]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm26 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[2,2,2,2]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
+; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm25
+; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm24 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512DQ-NEXT: vpermt2d %ymm1, %ymm3, %ymm2
+; AVX512DQ-NEXT: vmovdqa 304(%rdi), %xmm0
+; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT: vmovdqa 288(%rdi), %xmm1
+; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512DQ-NEXT: vmovdqa 272(%rdi), %xmm0
+; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT: vmovdqa 256(%rdi), %xmm1
+; AVX512DQ-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm7[0],xmm12[0],xmm7[1],xmm12[1]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm16
; AVX512DQ-NEXT: vmovdqa 480(%rdi), %ymm0
; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm18 = ymm0[0,1,0,2]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm18[0,1,2,0,4,5,6,4]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
-; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm21
-; AVX512DQ-NEXT: vmovdqa 448(%rdi), %ymm2
-; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm20 = ymm2[0,1,0,2]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm20[0,1,2,0,4,5,6,4]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm12[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[0,1,2,0,4,5,6,4]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
+; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm20
+; AVX512DQ-NEXT: vmovdqa 448(%rdi), %ymm1
+; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm21 = ymm1[0,1,0,2]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm21[0,1,2,0,4,5,6,4]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm1[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm19
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-NEXT: vmovdqa 416(%rdi), %ymm2
-; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm22 = ymm2[0,1,0,2]
-; AVX512DQ-NEXT: vmovdqa 384(%rdi), %ymm2
-; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm29 = ymm2[0,1,0,2]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm22[0,2,2,3,4,6,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm10[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm29[0,2,2,3,4,6,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm8 = ymm9[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vmovdqa 416(%rdi), %ymm1
+; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm28 = ymm1[0,1,0,2]
+; AVX512DQ-NEXT: vmovdqa 384(%rdi), %ymm1
+; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm29 = ymm1[0,1,0,2]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm11 = ymm28[0,2,2,3,4,6,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm11[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm29[0,2,2,3,4,6,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm8 = ymm10[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4],ymm2[5],ymm8[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
; AVX512DQ-NEXT: movb $-64, %al
; AVX512DQ-NEXT: kmovw %eax, %k1
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm16 {%k1}
-; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm2
-; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm0
+; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm0
; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm3
-; AVX512DQ-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm1
+; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm1
+; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm2
; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,2,2,2]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0,1,2],xmm8[3]
-; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm23
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm19 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm24 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; AVX512DQ-NEXT: vpermt2d %xmm0, %xmm1, %xmm2
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm8[3]
+; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm22
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm17 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
+; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm23 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; AVX512DQ-NEXT: vpermt2d %ymm0, %ymm3, %ymm2
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1
@@ -6574,27 +6581,27 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm3[0],xmm8[0],xmm3[1],xmm8[1]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm6 = xmm15[0,1],xmm2[2,3]
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm3[0],xmm8[0],xmm3[1],xmm8[1]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm6 = xmm14[0,1],xmm2[2,3]
; AVX512DQ-NEXT: vmovdqa 224(%rdi), %ymm0
; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm31 = ymm0[0,1,0,2]
; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm0
; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm30 = ymm0[0,1,0,2]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm15 = ymm31[0,1,2,0,4,5,6,4]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm31[0,1,2,0,4,5,6,4]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm30[0,1,2,0,4,5,6,4]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm14 = ymm5[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5,6],ymm0[7]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm15 = ymm5[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1,2,3,4,5,6],ymm0[7]
; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm0
; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm14 = ymm0[0,1,0,2]
-; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %ymm28
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm17 = ymm28[0,1,0,2]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[0,2,2,3,4,6,6,7]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm0[0,1,0,2]
+; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %ymm27
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm27[0,1,0,2]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[0,2,2,3,4,6,6,7]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm17[0,2,2,3,4,6,6,7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm9[0,2,2,3,4,6,6,7]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm13 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4],ymm0[5],ymm13[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
@@ -6602,20 +6609,21 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm16, %zmm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[1,1,1,1]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm11[1],xmm0[2,3]
-; AVX512DQ-NEXT: vmovdqa64 %xmm27, %xmm4
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3]
+; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm4
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3]
-; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm4
+; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm4
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm6 = ymm12[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm6
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5,6],ymm4[7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm6 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5],ymm9[6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm6 = ymm11[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4],ymm6[5],ymm10[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7]
; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 {%k1}
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm15[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
@@ -6624,66 +6632,67 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[1,1,1,1]
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3]
-; AVX512DQ-NEXT: vmovdqa64 %xmm19, %xmm4
+; AVX512DQ-NEXT: vmovdqa64 %xmm17, %xmm4
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm7[2],xmm11[2],xmm7[3],xmm11[3]
-; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm1
+; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm7[2],xmm12[2],xmm7[3],xmm12[3]
+; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm1
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[0,1,1,3,4,5,5,7]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm20[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm5 = ymm4[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5,6],ymm2[7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm22[3,1,2,3,7,5,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm6 = ymm5[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm29[3,1,2,3,7,5,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm10 = ymm9[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4],ymm6[5],ymm10[6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm21[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm5[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm28[3,1,2,3,7,5,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm4 = ymm6[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm29[3,1,2,3,7,5,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm11 = ymm10[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3,4],ymm4[5],ymm11[6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 {%k1}
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm31[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm6 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm30[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm12 = ymm10[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3,4,5,6],ymm6[7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[3,1,2,3,7,5,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm13 = ymm12[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm17[3,1,2,3,7,5,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm15 = ymm14[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5],ymm15[6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3,4,5],ymm6[6,7]
-; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm13 = xmm3[2],xmm8[2],xmm3[3],xmm8[3]
-; AVX512DQ-NEXT: vmovdqa64 %xmm23, %xmm15
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm11 = ymm30[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm13 = ymm11[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3,4,5,6],ymm4[7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm13 = ymm15[3,1,2,3,7,5,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm14 = ymm13[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[3,1,2,3,7,5,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm15 = ymm9[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm14 = xmm3[2],xmm8[2],xmm3[3],xmm8[3]
+; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm15
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0,1],xmm15[2,3]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm6 = [3,7,0,0]
-; AVX512DQ-NEXT: vpermt2d %xmm11, %xmm6, %xmm7
-; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm0
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [3,11,0,0]
+; AVX512DQ-NEXT: vpermt2d %ymm12, %ymm0, %ymm7
+; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm30
+; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm0
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0,1],xmm0[2,3]
; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm4 = ymm5[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm5 = ymm9[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm5 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm6 = ymm10[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7]
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
-; AVX512DQ-NEXT: vpermt2d %xmm8, %xmm6, %xmm3
-; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm1
+; AVX512DQ-NEXT: vpermt2d %ymm8, %ymm30, %ymm3
+; AVX512DQ-NEXT: vmovdqa64 %xmm23, %xmm1
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm10[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm11[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm12[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm4 = ymm14[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm13[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm5 = ymm9[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
@@ -6696,167 +6705,166 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,2,2]
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3]
-; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm30
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm22 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm31 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512DQ-NEXT: vmovdqa %xmm1, %xmm2
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,0,4]
-; AVX512DQ-NEXT: vpermt2d %xmm0, %xmm1, %xmm2
-; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload
-; AVX512DQ-NEXT: # xmm5 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm31
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm18 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm21 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,0,0,8]
+; AVX512DQ-NEXT: vpermt2d %ymm0, %ymm3, %ymm1
; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-NEXT: vpunpckhwd (%rsp), %xmm0, %xmm4 # 16-byte Folded Reload
+; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
; AVX512DQ-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; AVX512DQ-NEXT: vmovdqa64 %xmm5, %xmm20
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
-; AVX512DQ-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm12
-; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload
-; AVX512DQ-NEXT: # ymm19 = mem[0,1,1,3]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm19[0,1,2,0,4,5,6,4]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
-; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload
-; AVX512DQ-NEXT: # ymm21 = mem[0,1,1,3]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm11 = ymm21[0,1,2,0,4,5,6,4]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm11[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
+; AVX512DQ-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
+; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
+; AVX512DQ-NEXT: # xmm6 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
+; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm23
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX512DQ-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm11
+; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm28 # 32-byte Folded Reload
+; AVX512DQ-NEXT: # ymm28 = mem[0,1,1,3]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm28[0,1,2,0,4,5,6,4]
; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm29 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm29 = mem[0,1,1,3]
-; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload
-; AVX512DQ-NEXT: # ymm23 = mem[0,1,1,3]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm29[0,2,2,3,4,6,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm13 = ymm23[0,2,2,3,4,6,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm5 = ymm13[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm12 {%k1}
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm8 = ymm29[0,1,2,0,4,5,6,4]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm7 = ymm8[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Folded Reload
+; AVX512DQ-NEXT: # ymm20 = mem[0,1,1,3]
+; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload
+; AVX512DQ-NEXT: # ymm19 = mem[0,1,1,3]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm13 = ymm20[0,2,2,3,4,6,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm7 = ymm13[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm19[0,2,2,3,4,6,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm9 = ymm14[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5],ymm9[6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm11 {%k1}
+; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
+; AVX512DQ-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
; AVX512DQ-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
-; AVX512DQ-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512DQ-NEXT: vmovdqa %xmm1, %xmm0
-; AVX512DQ-NEXT: vpermi2d %xmm2, %xmm3, %xmm0
-; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm16
-; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm18
-; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm8 # 16-byte Folded Reload
-; AVX512DQ-NEXT: # xmm8 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
-; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload
-; AVX512DQ-NEXT: # xmm5 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm5[0],xmm8[0],xmm5[1],xmm8[1]
+; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload
+; AVX512DQ-NEXT: # xmm9 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512DQ-NEXT: vmovdqa %ymm3, %ymm0
+; AVX512DQ-NEXT: vpermi2d %ymm1, %ymm2, %ymm0
+; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm17
+; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm16
+; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm10 # 16-byte Folded Reload
+; AVX512DQ-NEXT: # xmm10 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0,1],xmm0[2,3]
-; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload
-; AVX512DQ-NEXT: # ymm17 = mem[0,1,1,3]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm17[0,1,2,0,4,5,6,4]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm24 = mem[0,1,1,3]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm24[0,1,2,0,4,5,6,4]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm9 = ymm3[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5,6],ymm0[7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm24[0,1,2,0,4,5,6,4]
; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm25 = mem[0,1,1,3]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm26 = ymm28[0,1,1,3]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm25[0,2,2,3,4,6,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm9[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm26[0,2,2,3,4,6,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm6 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5],ymm6[6,7]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm25[0,1,2,0,4,5,6,4]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm5 = ymm3[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5,6],ymm0[7]
+; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload
+; AVX512DQ-NEXT: # ymm26 = mem[0,1,1,3]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm27 = ymm27[0,1,1,3]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm26[0,2,2,3,4,6,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm5[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm27[0,2,2,3,4,6,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm7 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm0[5],ymm7[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm27
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[1,1,1,1]
-; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm12
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3]
-; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm2
+; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm22
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[1,1,1,1]
+; AVX512DQ-NEXT: vmovdqa64 %ymm23, %ymm11
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm11[1],xmm0[2,3]
+; AVX512DQ-NEXT: vmovdqa64 %xmm18, %xmm2
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm6 = ymm11[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,6],ymm2[7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm6 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm7 = ymm13[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5],ymm7[6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm12[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm7 = ymm8[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5,6],ymm2[7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm7 = ymm13[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm8 = ymm14[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5],ymm2[6,7]
; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 {%k1}
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm5[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-NEXT: vmovdqa64 %xmm16, %xmm9
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm16[0],xmm18[0],xmm16[1],xmm18[1]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[1,1,1,1]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2,3]
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm17[0],xmm16[0],xmm17[1],xmm16[1]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[1,1,1,1]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm9[1],xmm3[2,3]
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm20
-; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm4[2],xmm12[2],xmm4[3],xmm12[3]
-; AVX512DQ-NEXT: vmovdqa64 %xmm12, %xmm16
-; AVX512DQ-NEXT: vmovdqa64 %xmm30, %xmm1
+; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm23
+; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm6[2],xmm11[2],xmm6[3],xmm11[3]
+; AVX512DQ-NEXT: vmovdqa %ymm11, %ymm12
+; AVX512DQ-NEXT: vmovdqa64 %xmm31, %xmm1
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm1
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm19[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm21[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm6 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm29[3,1,2,3,7,5,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm7 = ymm6[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm23[3,1,2,3,7,5,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm11 = ymm10[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3,4],ymm7[5],ymm11[6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm28[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm11 = ymm29[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm11[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm20[3,1,2,3,7,5,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm14[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm19[3,1,2,3,7,5,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm4 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm1 {%k1}
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm17[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm11 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm24[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm13 = ymm7[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4,5,6],ymm11[7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm13 = ymm25[3,1,2,3,7,5,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm14 = ymm13[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm8 = ymm24[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm13 = ymm25[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm13[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm0[7]
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm15 = ymm26[3,1,2,3,7,5,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm12 = ymm15[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm14[5],ymm12[6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm18[2,2,2,2]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm12 = xmm9[0,1,2],xmm12[3]
-; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm14 = xmm5[2],xmm8[2],xmm5[3],xmm8[3]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7]
-; AVX512DQ-NEXT: vinserti64x4 $0, %ymm11, %zmm1, %zmm1
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm12 = [3,7,0,0]
-; AVX512DQ-NEXT: vpermt2d %xmm16, %xmm12, %xmm4
-; AVX512DQ-NEXT: vmovdqa64 %xmm31, %xmm11
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm11[2,3]
-; AVX512DQ-NEXT: vinserti32x4 $2, %xmm4, %zmm0, %zmm4
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm6 = ymm10[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5],ymm6[6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm4 {%k1}
-; AVX512DQ-NEXT: vpermt2d %xmm8, %xmm12, %xmm5
-; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm9[2],xmm18[2],xmm9[3],xmm18[3]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm7[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm13[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm5 = ymm15[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm4 = ymm15[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm27[3,1,2,3,7,5,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm7 = ymm0[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4],ymm4[5],ymm7[6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
+; AVX512DQ-NEXT: vpermt2d %ymm12, %ymm30, %ymm6
+; AVX512DQ-NEXT: vpermi2d %ymm9, %ymm10, %ymm30
+; AVX512DQ-NEXT: vmovdqa64 %ymm30, %ymm12
+; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm10[2],xmm9[2],xmm10[3],xmm9[3]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm16[2,2,2,2]
+; AVX512DQ-NEXT: vmovdqa64 %ymm17, %ymm9
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm7 = xmm9[0,1,2],xmm7[3]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm7[2,3]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm1
+; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm3
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm6[0,1],xmm3[2,3]
+; AVX512DQ-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm3
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm5 = ymm11[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm5 = ymm14[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5],ymm2[6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm3 {%k1}
+; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm9[2],xmm16[2],xmm9[3],xmm16[3]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm12[0,1],xmm2[2,3]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm5 = ymm13[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm5 = ymm15[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512DQ-NEXT: vmovaps %zmm2, (%rsi)
; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
@@ -6865,9 +6873,9 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vmovaps %zmm2, (%rcx)
; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512DQ-NEXT: vmovaps %zmm2, (%r8)
-; AVX512DQ-NEXT: vmovdqa64 %zmm27, (%r9)
+; AVX512DQ-NEXT: vmovdqa64 %zmm22, (%r9)
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-NEXT: vmovdqa64 %zmm20, (%rax)
+; AVX512DQ-NEXT: vmovdqa64 %zmm23, (%rax)
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rax)
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
@@ -6878,42 +6886,42 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
;
; AVX512DQ-FCP-LABEL: load_i16_stride8_vf32:
; AVX512DQ-FCP: # %bb.0:
-; AVX512DQ-FCP-NEXT: subq $552, %rsp # imm = 0x228
+; AVX512DQ-FCP-NEXT: subq $424, %rsp # imm = 0x1A8
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,0,8]
; AVX512DQ-FCP-NEXT: vmovdqa 368(%rdi), %xmm0
; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa 352(%rdi), %xmm1
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512DQ-FCP-NEXT: vmovdqa 352(%rdi), %xmm2
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; AVX512DQ-FCP-NEXT: vmovdqa 336(%rdi), %xmm0
; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa 320(%rdi), %xmm1
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,0,4]
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, %xmm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm16
-; AVX512DQ-FCP-NEXT: vpermt2d %xmm2, %xmm1, %xmm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm22
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, %xmm2
+; AVX512DQ-FCP-NEXT: vmovdqa 320(%rdi), %xmm2
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm23
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm3, %ymm1, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm22
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, %ymm3
; AVX512DQ-FCP-NEXT: vmovdqa 304(%rdi), %xmm1
; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa 288(%rdi), %xmm3
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; AVX512DQ-FCP-NEXT: vmovdqa 288(%rdi), %xmm2
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX512DQ-FCP-NEXT: vmovdqa 272(%rdi), %xmm1
; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %xmm3
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm4[0],xmm9[1],xmm4[1]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm20
+; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %xmm2
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm11[0],xmm4[0],xmm11[1],xmm4[1]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm19
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm11
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm8
; AVX512DQ-FCP-NEXT: vmovdqa 480(%rdi), %ymm0
; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm21 = ymm0[0,1,0,2]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm21[0,1,2,0,4,5,6,4]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm21[0,1,2,0,4,5,6,4]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
; AVX512DQ-FCP-NEXT: vmovdqa 448(%rdi), %ymm1
; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm27 = ymm1[0,1,0,2]
@@ -6926,28 +6934,28 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FCP-NEXT: vmovdqa 384(%rdi), %ymm1
; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm29 = ymm1[0,1,0,2]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm28[0,2,2,3,4,6,6,7]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm8[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm29[0,2,2,3,4,6,6,7]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5],ymm3[6,7]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm28[0,2,2,3,4,6,6,7]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm9[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm29[0,2,2,3,4,6,6,7]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm12[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX512DQ-FCP-NEXT: movb $-64, %al
; AVX512DQ-FCP-NEXT: kmovw %eax, %k1
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm11 {%k1}
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm8 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm0
; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm1
; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm0
; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm1
; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm13, %xmm0
-; AVX512DQ-FCP-NEXT: vpermt2d %xmm3, %xmm2, %xmm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm25
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm14, %ymm0
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm2, %ymm3, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm20
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm1
; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm2
@@ -6957,304 +6965,304 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FCP-NEXT: vmovdqa 48(%rdi), %xmm4
; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm17
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0,1],xmm0[2,3]
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm3[0],xmm7[1],xmm3[1]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm16
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0,1],xmm0[2,3]
; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %ymm0
; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm30 = ymm0[0,1,0,2]
-; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm23
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm31 = ymm23[0,1,0,2]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm30[0,1,2,0,4,5,6,4]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm31[0,1,2,0,4,5,6,4]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm14 = ymm3[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm19 = ymm0[0,1,0,2]
-; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm18 = ymm0[0,1,0,2]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm19[0,2,2,3,4,6,6,7]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm14[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm18[0,2,2,3,4,6,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm24
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm31 = ymm24[0,1,0,2]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm30[0,1,2,0,4,5,6,4]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm31[0,1,2,0,4,5,6,4]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm0[7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 160(%rdi), %ymm25
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm18 = ymm25[0,1,0,2]
+; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %ymm26
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm17 = ymm26[0,1,0,2]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm18[0,2,2,3,4,6,6,7]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm3[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm17[0,2,2,3,4,6,6,7]
; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm11 = [1,5,0,0]
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm9, %xmm0
-; AVX512DQ-FCP-NEXT: vpermt2d %xmm20, %xmm11, %xmm0
-; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm16[0],xmm22[0],xmm16[1],xmm22[1]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [1,9,0,0]
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm11, %ymm0
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm19, %ymm1, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, %ymm15
+; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm23[0],xmm22[0],xmm23[1],xmm22[1]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm5 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5],ymm7[6,7]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm6 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm6 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm12[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4],ymm6[5],ymm8[6,7]
; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, %xmm1
-; AVX512DQ-FCP-NEXT: vpermt2d %xmm17, %xmm11, %xmm1
-; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm13[0],xmm25[0],xmm13[1],xmm25[1]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, %ymm1
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm16, %ymm15, %ymm1
+; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm14[0],xmm20[0],xmm14[1],xmm20[1]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm16, %xmm2
-; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm16 = xmm16[2],xmm22[2],xmm16[3],xmm22[3]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,0,2,6]
-; AVX512DQ-FCP-NEXT: vpermt2d %xmm22, %xmm0, %xmm2
-; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm9[2],xmm20[2],xmm9[3],xmm20[3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm3
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm21[0,1,1,3,4,5,5,7]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm27[0,1,1,3,4,5,5,7]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm7 = ymm4[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm5[7]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm28[3,1,2,3,7,5,6,7]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm10 = ymm5[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm29[3,1,2,3,7,5,6,7]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm12 = ymm8[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4],ymm10[5],ymm12[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm3 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm13, %xmm7
-; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm13[2],xmm25[2],xmm13[3],xmm25[3]
-; AVX512DQ-FCP-NEXT: vpermt2d %xmm25, %xmm0, %xmm7
-; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm12 = xmm6[2],xmm17[2],xmm6[3],xmm17[3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm12[0,1],xmm7[2,3]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm30[0,1,1,3,4,5,5,7]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm13 = ymm12[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm31[0,1,1,3,4,5,5,7]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm15 = ymm14[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm19[3,1,2,3,7,5,6,7]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm15[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[3,1,2,3,7,5,6,7]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm11 = ymm1[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm0[5],ymm11[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm13[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm27 = [3,7,0,0]
-; AVX512DQ-FCP-NEXT: vpermt2d %xmm20, %xmm27, %xmm9
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm16, %xmm0
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1],xmm0[2,3]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm5[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm8[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm2
+; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm22 = xmm23[2],xmm2[2],xmm23[3],xmm2[3]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,2,10]
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm2, %ymm1, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, %ymm9
+; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm11[2],xmm19[2],xmm11[3],xmm19[3]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm2
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm21[0,1,1,3,4,5,5,7]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm27[0,1,1,3,4,5,5,7]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,6],ymm3[7]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm28[3,1,2,3,7,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm5 = ymm6[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm29[3,1,2,3,7,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm10[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4],ymm5[5],ymm8[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm2 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm14, %ymm3
+; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm14[2],xmm20[2],xmm14[3],xmm20[3]
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm20, %ymm9, %ymm3
+; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm7[2],xmm16[2],xmm7[3],xmm16[3]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm30[0,1,1,3,4,5,5,7]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm31[0,1,1,3,4,5,5,7]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm14 = ymm13[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3,4,5,6],ymm8[7]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm18[3,1,2,3,7,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm14[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm17[3,1,2,3,7,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm12 = ymm0[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm15[5],ymm12[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5],ymm8[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm31
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [3,11,3,3]
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm19, %ymm2, %ymm11
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm30
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm2
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm11[0,1],xmm2[2,3]
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm10[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 {%k1}
-; AVX512DQ-FCP-NEXT: vpermt2d %xmm17, %xmm27, %xmm6
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1],xmm10[2,3]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm12[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm14[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1}
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm16, %ymm30, %ymm7
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0,1],xmm5[2,3]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm13[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm15[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm28
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm14[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm27
+; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # xmm8 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
; AVX512DQ-FCP-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # xmm12 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512DQ-FCP-NEXT: vpunpckhwd (%rsp), %xmm0, %xmm6 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # xmm6 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, %ymm0
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,0,0,8]
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm1, %ymm2, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm18
+; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm17
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm29
+; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # ymm23 = mem[0,1,1,3]
+; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # ymm22 = mem[0,1,1,3]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm23[0,1,2,0,4,5,6,4]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm22[0,1,2,0,4,5,6,4]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm13[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
+; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # ymm21 = mem[0,1,1,3]
+; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # ymm20 = mem[0,1,1,3]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm21[0,2,2,3,4,6,6,7]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm14[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm20[0,2,2,3,4,6,6,7]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm5 = ymm15[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4],ymm1[5],ymm5[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm29 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-FCP-NEXT: vpunpckhwd (%rsp), %xmm0, %xmm5 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload
; AVX512DQ-FCP-NEXT: # xmm5 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm12, %xmm1
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,0,0,4]
-; AVX512DQ-FCP-NEXT: vpermt2d %xmm3, %xmm0, %xmm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm24
-; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm18
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm31
-; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm29 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm29 = mem[0,1,1,3]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm29[0,1,2,0,4,5,6,4]
-; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm26 = mem[0,1,1,3]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm26[0,1,2,0,4,5,6,4]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm6[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # xmm10 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # xmm11 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, %ymm1
+; AVX512DQ-FCP-NEXT: vpermi2d %ymm3, %ymm5, %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm16
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm19
+; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],xmm1[2,3]
+; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm28 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # ymm28 = mem[0,1,1,3]
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm24 = ymm24[0,1,1,3]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm28[0,1,2,0,4,5,6,4]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm24[0,1,2,0,4,5,6,4]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm7 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5,6],ymm0[7]
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm25 = ymm25[0,1,1,3]
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm26 = ymm26[0,1,1,3]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm25[0,2,2,3,4,6,6,7]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm12 = ymm7[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm26[0,2,2,3,4,6,6,7]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm9 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm12[5],ymm9[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm29
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm0
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm12 = [1,9,0,0]
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm17, %ymm12, %ymm0
+; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm8[0],xmm18[0],xmm8[1],xmm18[1]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm13[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
-; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm30 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm30 = mem[0,1,1,3]
-; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm25 = mem[0,1,1,3]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm30[0,2,2,3,4,6,6,7]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm8[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm25[0,2,2,3,4,6,6,7]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm13[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4],ymm4[5],ymm7[6,7]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm9 = ymm15[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4],ymm4[5],ymm9[6,7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm31 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # xmm4 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
-; AVX512DQ-FCP-NEXT: vpermi2d %xmm2, %xmm3, %xmm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm17
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm22
-; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm7 # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # xmm7 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
-; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm16
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm10[0,1],xmm0[2,3]
-; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm20 = mem[0,1,1,3]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm20[0,1,2,0,4,5,6,4]
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm23 = ymm23[0,1,1,3]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm23[0,1,2,0,4,5,6,4]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm15 = ymm14[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm19 = mem[0,1,1,3]
-; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm21 = mem[0,1,1,3]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm19[0,2,2,3,4,6,6,7]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm15[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm21[0,2,2,3,4,6,6,7]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5],ymm3[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm31, %zmm31
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, %xmm0
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [1,5,0,0]
-; AVX512DQ-FCP-NEXT: vpermt2d %xmm18, %xmm4, %xmm0
-; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm12[0],xmm24[0],xmm12[1],xmm24[1]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm6[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm6 = ymm13[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5],ymm6[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, %xmm3
-; AVX512DQ-FCP-NEXT: vpermi2d %xmm16, %xmm7, %xmm3
-; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm17[0],xmm22[0],xmm17[1],xmm22[1]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm3[0,1],xmm1[2,3]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm14[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm12, %xmm8
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm24, %xmm2
-; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm12[2],xmm24[2],xmm12[3],xmm24[3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm9[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm24
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,0,2,6]
-; AVX512DQ-FCP-NEXT: vpermt2d %xmm2, %xmm0, %xmm8
-; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm5[2],xmm18[2],xmm5[3],xmm18[3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm29[0,1,1,3,4,5,5,7]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm26[0,1,1,3,4,5,5,7]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm8 = ymm6[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5,6],ymm3[7]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm30[3,1,2,3,7,5,6,7]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm9 = ymm8[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm25[3,1,2,3,7,5,6,7]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm12 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4],ymm9[5],ymm12[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm3[6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm1 {%k1}
-; AVX512DQ-FCP-NEXT: vpermi2d %xmm22, %xmm17, %xmm0
-; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm7[2],xmm16[2],xmm7[3],xmm16[3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm20[0,1,1,3,4,5,5,7]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm12 = ymm9[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm23[0,1,1,3,4,5,5,7]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm14 = ymm13[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5,6],ymm12[7]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm19[3,1,2,3,7,5,6,7]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm14[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm21[3,1,2,3,7,5,6,7]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm0[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm15[5],ymm4[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm12[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm1
-; AVX512DQ-FCP-NEXT: vpermt2d %xmm18, %xmm27, %xmm5
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm10[2,3]
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm4
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm4 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm3
+; AVX512DQ-FCP-NEXT: vpermi2d %ymm10, %ymm11, %ymm3
+; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm16[0],xmm19[0],xmm16[1],xmm19[1]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1,2,3,4,5],ymm2[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, %ymm13
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm1
+; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm8[2],xmm1[2],xmm8[3],xmm1[3]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm18
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm14 = [0,0,2,10]
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm1, %ymm14, %ymm13
+; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm6[2],xmm17[2],xmm6[3],xmm17[3]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],xmm13[2,3]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm23[0,1,1,3,4,5,5,7]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm22[0,1,1,3,4,5,5,7]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm0[7]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm21[3,1,2,3,7,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm5[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm20[3,1,2,3,7,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm12 = ymm0[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3,4],ymm8[5],ymm12[6,7]
; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm3
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm3 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm14, %ymm8
+; AVX512DQ-FCP-NEXT: vpermi2d %ymm19, %ymm16, %ymm8
+; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm11[2],xmm10[2],xmm11[3],xmm10[3]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm4[0,1],xmm8[2,3]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm28[0,1,1,3,4,5,5,7]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm12[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm24[0,1,1,3,4,5,5,7]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm14 = ymm13[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6],ymm4[7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm1
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm17, %ymm30, %ymm6
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm25[3,1,2,3,7,5,6,7]
+; AVX512DQ-FCP-NEXT: vpermi2d %ymm10, %ymm11, %ymm1
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm10 = ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm26[3,1,2,3,7,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3,4],ymm10[5],ymm15[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm14[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm3, %zmm3
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3]
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm6
; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm6[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm8[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm7 = ymm9[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5,6],ymm2[7]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm6 {%k1}
+; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm16[2],xmm19[2],xmm16[3],xmm19[3]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm12[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm13[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5,6],ymm2[7]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm5 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm3 {%k1}
-; AVX512DQ-FCP-NEXT: vpermt2d %xmm16, %xmm27, %xmm7
-; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm17[2],xmm22[2],xmm17[3],xmm22[3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm7[0,1],xmm2[2,3]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm13[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm5 = ymm14[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm2, (%rsi)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm2, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovaps %zmm2, (%rcx)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, (%r8)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, (%r9)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, (%rcx)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, (%r8)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, (%r9)
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, (%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, (%rax)
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%rax)
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
-; AVX512DQ-FCP-NEXT: addq $552, %rsp # imm = 0x228
+; AVX512DQ-FCP-NEXT: addq $424, %rsp # imm = 0x1A8
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
@@ -12647,39 +12655,40 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-LABEL: load_i16_stride8_vf64:
; AVX512: # %bb.0:
; AVX512-NEXT: subq $2408, %rsp # imm = 0x968
-; AVX512-NEXT: vmovdqa 368(%rdi), %xmm1
-; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vmovdqa 352(%rdi), %xmm0
+; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,0,8]
+; AVX512-NEXT: vmovdqa 368(%rdi), %xmm0
; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512-NEXT: vmovdqa 336(%rdi), %xmm2
-; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vmovdqa 320(%rdi), %xmm1
-; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm6 = [0,0,0,4]
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,2,2]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3]
+; AVX512-NEXT: vmovdqa 352(%rdi), %xmm2
; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; AVX512-NEXT: vmovdqa 336(%rdi), %xmm0
+; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vmovdqa 320(%rdi), %xmm2
; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpermt2d %xmm0, %xmm6, %xmm1
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,2,2,2]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3]
+; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vmovdqa %ymm1, %ymm5
+; AVX512-NEXT: vpermt2d %ymm3, %ymm1, %ymm4
; AVX512-NEXT: vmovdqa 304(%rdi), %xmm0
; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vmovdqa 288(%rdi), %xmm2
-; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; AVX512-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vmovdqa 288(%rdi), %xmm1
+; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT: vmovdqa 272(%rdi), %xmm0
; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vmovdqa 256(%rdi), %xmm2
-; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX512-NEXT: vmovdqa64 %xmm2, %xmm31
-; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512-NEXT: vmovdqa 256(%rdi), %xmm1
+; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512-NEXT: vmovdqa64 %ymm1, %ymm31
+; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3]
; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
; AVX512-NEXT: vmovdqa 480(%rdi), %ymm1
; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -12702,12 +12711,12 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT: vmovdqa 384(%rdi), %ymm2
; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm2[0,1,0,2]
-; AVX512-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,1,0,2]
+; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[0,2,2,3,4,6,6,7]
; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[0,2,2,3,4,6,6,7]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[0,2,2,3,4,6,6,7]
; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
@@ -12732,7 +12741,8 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX512-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpermt2d %xmm1, %xmm6, %xmm2
+; AVX512-NEXT: vpermt2d %ymm1, %ymm5, %ymm2
+; AVX512-NEXT: vmovdqa %ymm5, %ymm6
; AVX512-NEXT: vmovdqa (%rdi), %xmm1
; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm3
@@ -12742,10 +12752,10 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vmovdqa 48(%rdi), %xmm5
; AVX512-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; AVX512-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; AVX512-NEXT: vmovdqa64 %xmm3, %xmm17
+; AVX512-NEXT: vmovdqa64 %ymm3, %ymm17
; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
; AVX512-NEXT: vmovdqa 224(%rdi), %ymm2
; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -12792,11 +12802,11 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3]
; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill
; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpermt2d %xmm0, %xmm6, %xmm1
-; AVX512-NEXT: vmovdqa %xmm6, %xmm10
+; AVX512-NEXT: vpermt2d %ymm0, %ymm6, %ymm1
+; AVX512-NEXT: vmovdqa %ymm6, %ymm10
; AVX512-NEXT: vmovdqa 816(%rdi), %xmm0
; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vmovdqa 800(%rdi), %xmm2
@@ -12808,8 +12818,8 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX512-NEXT: vmovdqa64 %xmm2, %xmm19
-; AVX512-NEXT: vmovdqa64 %xmm3, %xmm20
+; AVX512-NEXT: vmovdqa64 %ymm2, %ymm19
+; AVX512-NEXT: vmovdqa64 %ymm3, %ymm20
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
; AVX512-NEXT: vmovdqa 992(%rdi), %ymm1
@@ -12851,11 +12861,11 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[2,2,2,2]
; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0,1,2],xmm9[3]
-; AVX512-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm16 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX512-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpermt2d %xmm1, %xmm10, %xmm2
+; AVX512-NEXT: vpermt2d %ymm1, %ymm10, %ymm2
; AVX512-NEXT: vmovdqa 560(%rdi), %xmm1
; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vmovdqa 544(%rdi), %xmm5
@@ -12870,12 +12880,12 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vpblendd {{.*#+}} xmm9 = xmm1[0,1],xmm2[2,3]
; AVX512-NEXT: vmovdqa 736(%rdi), %ymm1
; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vpermq {{.*#+}} ymm29 = ymm1[0,1,0,2]
+; AVX512-NEXT: vpermq {{.*#+}} ymm30 = ymm1[0,1,0,2]
; AVX512-NEXT: vmovdqa 704(%rdi), %ymm1
; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2]
; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm29[0,1,2,0,4,5,6,4]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm30[0,1,2,0,4,5,6,4]
; AVX512-NEXT: vpshufhw {{.*#+}} ymm10 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX512-NEXT: vpshufhw {{.*#+}} ymm11 = ymm1[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
@@ -12885,10 +12895,10 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vpermq {{.*#+}} ymm28 = ymm5[0,1,0,2]
; AVX512-NEXT: vmovdqa 640(%rdi), %ymm5
; AVX512-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vpermq {{.*#+}} ymm30 = ymm5[0,1,0,2]
+; AVX512-NEXT: vpermq {{.*#+}} ymm29 = ymm5[0,1,0,2]
; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm28[0,2,2,3,4,6,6,7]
; AVX512-NEXT: vpshuflw {{.*#+}} ymm14 = ymm10[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm11 = ymm30[0,2,2,3,4,6,6,7]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm11 = ymm29[0,2,2,3,4,6,6,7]
; AVX512-NEXT: vpshuflw {{.*#+}} ymm5 = ymm11[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm14[5],ymm5[6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm13[6,7]
@@ -12896,7 +12906,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm31[1,1,1,1]
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm14[1],xmm0[2,3]
; AVX512-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX512-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
@@ -12924,7 +12934,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3,4],ymm9[5],ymm13[6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5],ymm5[6,7]
; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm17[1,1,1,1]
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
; AVX512-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0],xmm13[1],xmm9[2,3]
; AVX512-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
; AVX512-NEXT: # xmm9 = xmm9[0,1],mem[2,3]
@@ -12932,9 +12942,9 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm19[1,1,1,1]
-; AVX512-NEXT: vmovdqa64 %xmm20, %xmm9
+; AVX512-NEXT: vmovdqa64 %ymm20, %ymm9
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm9[1],xmm0[2,3]
-; AVX512-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: vpblendd $12, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX512-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
; AVX512-NEXT: vpshufhw {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
; AVX512-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
@@ -12946,8 +12956,8 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 {%k1}
; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[1,1,1,1]
-; AVX512-NEXT: vmovdqa %xmm8, %xmm5
-; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2,3]
+; AVX512-NEXT: vmovdqa %ymm8, %ymm5
+; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm5[1],xmm3[2,3]
; AVX512-NEXT: vmovdqa64 %xmm16, %xmm4
; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
@@ -12961,7 +12971,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm31[2],xmm14[2],xmm31[3],xmm14[3]
-; AVX512-NEXT: vmovdqa64 %xmm14, %xmm16
+; AVX512-NEXT: vmovdqa64 %ymm14, %ymm16
; AVX512-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX512-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
@@ -13000,14 +13010,14 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm17[2],xmm13[2],xmm17[3],xmm13[3]
-; AVX512-NEXT: vmovdqa64 %xmm13, %xmm22
+; AVX512-NEXT: vmovdqa64 %ymm13, %ymm22
; AVX512-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
; AVX512-NEXT: # xmm2 = xmm2[0,1],mem[2,3]
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm19[2],xmm20[2],xmm19[3],xmm20[3]
-; AVX512-NEXT: vmovdqa64 %xmm20, %xmm24
+; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm19[2],xmm9[2],xmm19[3],xmm9[3]
+; AVX512-NEXT: vmovdqa64 %ymm20, %ymm24
; AVX512-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX512-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm20
@@ -13027,10 +13037,10 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm20 {%k1}
; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm15[2],xmm5[2],xmm15[3],xmm5[3]
-; AVX512-NEXT: vmovdqa64 %xmm5, %xmm18
-; AVX512-NEXT: vpblendd $12, (%rsp), %xmm0, %xmm3 # 16-byte Folded Reload
+; AVX512-NEXT: vmovdqa64 %ymm5, %ymm18
+; AVX512-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
; AVX512-NEXT: # xmm3 = xmm0[0,1],mem[2,3]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm29[0,1,1,3,4,5,5,7]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm30[0,1,1,3,4,5,5,7]
; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
; AVX512-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7]
@@ -13038,17 +13048,17 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm28[3,1,2,3,7,5,6,7]
; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm30[3,1,2,3,7,5,6,7]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm29[3,1,2,3,7,5,6,7]
; AVX512-NEXT: vpshuflw {{.*#+}} ymm13 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4],ymm0[5],ymm13[6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [3,7,0,0]
-; AVX512-NEXT: vmovdqa64 %xmm31, %xmm1
-; AVX512-NEXT: vpermt2d %xmm16, %xmm0, %xmm1
-; AVX512-NEXT: vmovdqa64 %xmm0, %xmm16
+; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [3,11,0,0]
+; AVX512-NEXT: vmovdqa64 %ymm31, %ymm1
+; AVX512-NEXT: vpermt2d %ymm16, %ymm0, %ymm1
+; AVX512-NEXT: vmovdqa64 %ymm0, %ymm16
; AVX512-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; AVX512-NEXT: # xmm0 = xmm1[0,1],mem[2,3]
; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
@@ -13064,8 +13074,8 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4],ymm3[5],ymm13[6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
-; AVX512-NEXT: vmovdqa64 %xmm17, %xmm1
-; AVX512-NEXT: vpermt2d %xmm22, %xmm16, %xmm1
+; AVX512-NEXT: vmovdqa64 %ymm17, %ymm1
+; AVX512-NEXT: vpermt2d %ymm22, %ymm16, %ymm1
; AVX512-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX512-NEXT: # xmm1 = xmm1[0,1],mem[2,3]
; AVX512-NEXT: vmovdqa64 %ymm25, %ymm3
@@ -13079,8 +13089,8 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa64 %xmm19, %xmm0
-; AVX512-NEXT: vpermt2d %xmm24, %xmm16, %xmm0
+; AVX512-NEXT: vmovdqa64 %ymm19, %ymm0
+; AVX512-NEXT: vpermt2d %ymm24, %ymm16, %ymm0
; AVX512-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX512-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
@@ -13092,7 +13102,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5],ymm6[6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
-; AVX512-NEXT: vpermt2d %xmm18, %xmm16, %xmm15
+; AVX512-NEXT: vpermt2d %ymm18, %ymm16, %ymm15
; AVX512-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm1 # 16-byte Folded Reload
; AVX512-NEXT: # xmm1 = xmm15[0,1],mem[2,3]
; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
@@ -13109,27 +13119,26 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload
; AVX512-NEXT: # xmm5 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm6 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[2,2,2,2]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3]
-; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
-; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm6[2],xmm5[2],xmm6[3],xmm5[3]
-; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm10 = [0,0,0,4]
-; AVX512-NEXT: vpermt2d %xmm5, %xmm10, %xmm6
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[2,2,2,2]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; AVX512-NEXT: vmovdqa64 %xmm2, %xmm28
-; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
+; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm5[2],xmm0[3],xmm5[3]
+; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm6 = [0,0,0,8]
+; AVX512-NEXT: vpermt2d %ymm5, %ymm6, %ymm0
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
+; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm11 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm11 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm11[0],xmm3[0],xmm11[1],xmm3[1]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
; AVX512-NEXT: # ymm1 = mem[0,1,1,3]
@@ -13172,16 +13181,16 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX512-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpermt2d %xmm1, %xmm10, %xmm2
+; AVX512-NEXT: vpermt2d %ymm1, %ymm6, %ymm2
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm4 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload
; AVX512-NEXT: # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
-; AVX512-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
-; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; AVX512-NEXT: vmovdqa64 %ymm4, %ymm19
+; AVX512-NEXT: vmovdqa64 %ymm3, %ymm22
; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX512-NEXT: # ymm2 = mem[0,1,1,3]
@@ -13192,22 +13201,25 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,1,2,0,4,5,6,4]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
-; AVX512-NEXT: vmovdqa64 %ymm4, %ymm17
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm30 # 32-byte Folded Reload
-; AVX512-NEXT: # ymm30 = mem[0,1,1,3]
-; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
-; AVX512-NEXT: # ymm4 = mem[0,1,1,3]
-; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm30[0,2,2,3,4,6,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm9[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm4[0,2,2,3,4,6,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm4 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4]
+; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
+; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
+; AVX512-NEXT: # ymm3 = mem[0,1,1,3]
+; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
+; AVX512-NEXT: # ymm2 = mem[0,1,1,3]
+; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm5 = ymm3[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm8 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512-NEXT: vmovdqa64 %ymm2, %ymm17
+; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4],ymm5[5],ymm8[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
@@ -13216,307 +13228,307 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX512-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,2,2]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,2,2,2]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1,2],xmm4[3]
+; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm23 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpermt2d %xmm0, %xmm10, %xmm1
+; AVX512-NEXT: vpermt2d %ymm0, %ymm6, %ymm1
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm8 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
; AVX512-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; AVX512-NEXT: vmovdqa64 %xmm3, %xmm16
-; AVX512-NEXT: vmovdqa64 %xmm2, %xmm20
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm8[0],xmm2[1],xmm8[1]
+; AVX512-NEXT: vmovdqa64 %ymm2, %ymm20
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm12
-; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
-; AVX512-NEXT: # ymm0 = mem[0,1,1,3]
-; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX512-NEXT: # ymm1 = mem[0,1,1,3]
-; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm0[0,1,2,0,4,5,6,4]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm1[0,1,2,0,4,5,6,4]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm5[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
-; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Folded Reload
-; AVX512-NEXT: # ymm22 = mem[0,1,1,3]
+; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm5
+; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm28 # 32-byte Folded Reload
+; AVX512-NEXT: # ymm28 = mem[0,1,1,3]
; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm29 # 32-byte Folded Reload
; AVX512-NEXT: # ymm29 = mem[0,1,1,3]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm22[0,2,2,3,4,6,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm1 = ymm4[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm29[0,2,2,3,4,6,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm8 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4],ymm1[5],ymm8[6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm12 {%k1}
+; AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm28[0,1,2,0,4,5,6,4]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm29[0,1,2,0,4,5,6,4]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm9[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload
+; AVX512-NEXT: # ymm27 = mem[0,1,1,3]
+; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload
+; AVX512-NEXT: # ymm25 = mem[0,1,1,3]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm27[0,2,2,3,4,6,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm10[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm13 = ymm25[0,2,2,3,4,6,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm4 = ymm13[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5 {%k1}
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
; AVX512-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm8 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm7 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm11 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm13 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm13 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512-NEXT: vmovdqa %xmm10, %xmm1
-; AVX512-NEXT: vpermi2d %xmm2, %xmm8, %xmm1
-; AVX512-NEXT: vmovdqa64 %xmm8, %xmm19
-; AVX512-NEXT: vmovdqa64 %xmm2, %xmm24
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm13[0],xmm11[0],xmm13[1],xmm11[1]
-; AVX512-NEXT: vmovdqa64 %xmm13, %xmm31
-; AVX512-NEXT: vmovdqa64 %xmm11, %xmm21
-; AVX512-NEXT: vpblendd {{.*#+}} xmm14 = xmm0[0,1],xmm1[2,3]
+; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512-NEXT: vmovdqa %ymm6, %ymm0
+; AVX512-NEXT: vpermi2d %ymm1, %ymm2, %ymm0
+; AVX512-NEXT: vmovdqa64 %ymm2, %ymm16
+; AVX512-NEXT: vmovdqa64 %ymm1, %ymm24
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm7[0],xmm4[1],xmm7[1]
+; AVX512-NEXT: vmovdqa64 %ymm4, %ymm31
+; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1],xmm0[2,3]
; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
; AVX512-NEXT: # ymm1 = mem[0,1,1,3]
; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; AVX512-NEXT: # ymm0 = mem[0,1,1,3]
; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[0,1,2,0,4,5,6,4]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm8 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,1,2,0,4,5,6,4]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm11 = ymm1[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm15 = ymm11[0,1,2,3,4,5,6],ymm8[7]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm1[0,1,2,0,4,5,6,4]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm15 = ymm0[0,1,2,0,4,5,6,4]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm15[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm2[7]
; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; AVX512-NEXT: # ymm0 = mem[0,1,1,3]
; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
-; AVX512-NEXT: # ymm10 = mem[0,1,1,3]
-; AVX512-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[0,2,2,3,4,6,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm8[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm11 = ymm10[0,2,2,3,4,6,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm13 = ymm11[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4],ymm0[5],ymm13[6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0
+; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
+; AVX512-NEXT: # ymm2 = mem[0,1,1,3]
+; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,2,2,3,4,6,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm3[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm6 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5],ymm6[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm28[1,1,1,1]
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[1,1,1,1]
+; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3]
; AVX512-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX512-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
-; AVX512-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
-; AVX512-NEXT: # ymm12 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
-; AVX512-NEXT: # ymm13 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5,6],ymm12[7]
-; AVX512-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
-; AVX512-NEXT: # ymm13 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
-; AVX512-NEXT: # ymm14 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5],ymm14[6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7]
; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
-; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm0 {%k1}
-; AVX512-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
-; AVX512-NEXT: # ymm12 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512-NEXT: vmovdqa64 %ymm17, %ymm13
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5,6],ymm12[7]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm9[5],ymm7[6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm12[6,7]
-; AVX512-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm17 # 16-byte Reload
-; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm17[1,1,1,1]
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; AVX512-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0],xmm15[1],xmm9[2,3]
-; AVX512-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm9 = xmm9[0,1],mem[2,3]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
-; AVX512-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm0
+; AVX512-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
+; AVX512-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
+; AVX512-NEXT: # ymm4 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
+; AVX512-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
+; AVX512-NEXT: # ymm4 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
+; AVX512-NEXT: # ymm5 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
+; AVX512-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
+; AVX512-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
+; AVX512-NEXT: # ymm4 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
+; AVX512-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
+; AVX512-NEXT: # ymm4 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-NEXT: vmovdqa64 %ymm17, %ymm5
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm22[1,1,1,1]
+; AVX512-NEXT: vmovdqa64 %ymm22, %ymm17
+; AVX512-NEXT: vmovdqa64 %ymm19, %ymm5
+; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3]
+; AVX512-NEXT: vmovdqa64 %ymm19, %ymm22
+; AVX512-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm4 = xmm4[0,1],mem[2,3]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm16[1,1,1,1]
-; AVX512-NEXT: vmovdqa64 %xmm20, %xmm7
-; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3]
-; AVX512-NEXT: vmovdqa64 %xmm23, %xmm9
-; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm9[2,3]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm20[1,1,1,1]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3]
+; AVX512-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm9[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm4 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm5 = ymm13[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
-; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
-; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 {%k1}
-; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm31[1,1,1,1]
-; AVX512-NEXT: vmovdqa64 %xmm21, %xmm6
-; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3]
-; AVX512-NEXT: vmovdqa64 %xmm19, %xmm5
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm19[0],xmm24[0],xmm19[1],xmm24[1]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm31[1,1,1,1]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm7[1],xmm1[2,3]
+; AVX512-NEXT: vmovdqa %ymm7, %ymm12
+; AVX512-NEXT: vmovdqa64 %ymm16, %ymm7
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm24[0],xmm7[1],xmm24[1]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm5 = ymm15[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm28[2],xmm10[2],xmm28[3],xmm10[3]
+; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm11[2],xmm6[2],xmm11[3],xmm6[3]
+; AVX512-NEXT: vmovdqa %ymm6, %ymm14
+; AVX512-NEXT: vmovdqa %ymm11, %ymm6
; AVX512-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX512-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
-; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
-; AVX512-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512-NEXT: vmovdqa64 %ymm2, %ymm27
+; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
+; AVX512-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7]
+; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX512-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7]
; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512-NEXT: vmovdqa64 %ymm3, %ymm26
+; AVX512-NEXT: vmovdqa64 %ymm3, %ymm30
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
; AVX512-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX512-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7]
; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512-NEXT: vmovdqa64 %ymm3, %ymm25
-; AVX512-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
-; AVX512-NEXT: # ymm13 = mem[3,1,2,3,7,5,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm13[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512-NEXT: vmovdqa64 %ymm3, %ymm26
+; AVX512-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
+; AVX512-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512-NEXT: vmovdqa64 %ymm4, %ymm21
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX512-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7]
; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512-NEXT: vmovdqa64 %ymm2, %ymm21
-; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
-; AVX512-NEXT: # ymm14 = mem[0,1,1,3,4,5,5,7]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm14[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512-NEXT: vmovdqa64 %ymm2, %ymm23
+; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
+; AVX512-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512-NEXT: vmovdqa64 %ymm3, %ymm19
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm30[3,1,2,3,7,5,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm12[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
-; AVX512-NEXT: # ymm11 = mem[3,1,2,3,7,5,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
+; AVX512-NEXT: # ymm13 = mem[3,1,2,3,7,5,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm13[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
+; AVX512-NEXT: # ymm10 = mem[3,1,2,3,7,5,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm17[2],xmm15[2],xmm17[3],xmm15[3]
-; AVX512-NEXT: vmovdqa64 %xmm15, %xmm23
+; AVX512-NEXT: vmovdqa64 %ymm17, %ymm11
+; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm11[2],xmm22[2],xmm11[3],xmm22[3]
; AVX512-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
; AVX512-NEXT: # xmm2 = xmm2[0,1],mem[2,3]
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
-; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm16[2],xmm20[2],xmm16[3],xmm20[3]
-; AVX512-NEXT: vmovdqa64 %xmm16, %xmm19
-; AVX512-NEXT: vmovdqa64 %xmm20, %xmm30
+; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm17
+; AVX512-NEXT: vmovdqa64 %ymm20, %ymm16
+; AVX512-NEXT: vmovdqa64 %ymm8, %ymm18
+; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm16[2],xmm18[2],xmm16[3],xmm18[3]
; AVX512-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX512-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
-; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm20
-; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
-; AVX512-NEXT: # ymm10 = mem[0,1,1,3,4,5,5,7]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
-; AVX512-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm8 = ymm22[3,1,2,3,7,5,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm1 = ymm8[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm29[3,1,2,3,7,5,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm7[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5],ymm3[6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm20 {%k1}
-; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm24[2,2,2,2]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3]
-; AVX512-NEXT: vmovdqa64 %xmm5, %xmm22
-; AVX512-NEXT: vmovdqa64 %xmm31, %xmm4
-; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm31[2],xmm6[2],xmm31[3],xmm6[3]
-; AVX512-NEXT: vmovdqa64 %xmm6, %xmm29
-; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3]
-; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
-; AVX512-NEXT: # ymm0 = mem[0,1,1,3,4,5,5,7]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512-NEXT: vmovdqa64 %ymm0, %ymm18
+; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm28[0,1,1,3,4,5,5,7]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm8 = ymm29[0,1,1,3,4,5,5,7]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm8[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm27[3,1,2,3,7,5,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm5 = ymm1[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512-NEXT: vmovdqa64 %ymm1, %ymm20
+; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm25[3,1,2,3,7,5,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm15 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4],ymm5[5],ymm15[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
+; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm15
+; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm15 {%k1}
+; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm2 = [3,11,0,0]
+; AVX512-NEXT: vpermt2d %ymm14, %ymm2, %ymm6
+; AVX512-NEXT: vpermt2d %ymm22, %ymm2, %ymm11
+; AVX512-NEXT: vmovdqa64 %ymm11, %ymm25
+; AVX512-NEXT: vpermt2d %ymm18, %ymm2, %ymm16
+; AVX512-NEXT: vmovdqa64 %ymm16, %ymm22
+; AVX512-NEXT: vpermi2d %ymm12, %ymm31, %ymm2
+; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm31[2],xmm12[2],xmm31[3],xmm12[3]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm24[2,2,2,2]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3]
+; AVX512-NEXT: vmovdqa64 %ymm7, %ymm16
+; AVX512-NEXT: vpblendd {{.*#+}} xmm7 = xmm0[0,1],xmm4[2,3]
+; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
+; AVX512-NEXT: # ymm4 = mem[0,1,1,3,4,5,5,7]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
; AVX512-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
-; AVX512-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
-; AVX512-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm2[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512-NEXT: vmovdqa64 %ymm2, %ymm31
-; AVX512-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
-; AVX512-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm15 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm12 = ymm5[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,6],ymm0[7]
+; AVX512-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
+; AVX512-NEXT: # ymm12 = mem[3,1,2,3,7,5,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm12[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
+; AVX512-NEXT: # ymm11 = mem[3,1,2,3,7,5,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm14 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4],ymm0[5],ymm14[6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0
-; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm16 = [3,7,0,0]
-; AVX512-NEXT: vmovdqa64 %xmm28, %xmm1
-; AVX512-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %xmm16, %xmm1 # 16-byte Folded Reload
-; AVX512-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm1 = xmm1[0,1],mem[2,3]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm15, %zmm0
+; AVX512-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm1 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm1 = xmm6[0,1],mem[2,3]
; AVX512-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1
-; AVX512-NEXT: vmovdqa64 %ymm27, %ymm3
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
+; AVX512-NEXT: # ymm7 = mem[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512-NEXT: vmovdqa64 %ymm30, %ymm6
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm14 = ymm6[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5,6],ymm7[7]
; AVX512-NEXT: vmovdqa64 %ymm26, %ymm6
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm15 = ymm6[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3,4,5,6],ymm3[7]
-; AVX512-NEXT: vmovdqa64 %ymm25, %ymm6
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm15 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm13 = ymm13[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5],ymm3[6,7]
-; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm1 {%k1}
-; AVX512-NEXT: vmovdqa64 %xmm17, %xmm3
-; AVX512-NEXT: vpermt2d %xmm23, %xmm16, %xmm3
-; AVX512-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm3 = xmm3[0,1],mem[2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm14 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
; AVX512-NEXT: vmovdqa64 %ymm21, %ymm6
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm13 = ymm6[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm12[5],ymm11[6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm13[6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7]
-; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm1
-; AVX512-NEXT: vmovdqa64 %xmm19, %xmm3
-; AVX512-NEXT: vpermt2d %xmm30, %xmm16, %xmm3
-; AVX512-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm3 = xmm3[0,1],mem[2,3]
-; AVX512-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm3
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm10[7]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm6 = ymm7[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm8[5],ymm6[6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm9[6,7]
-; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm3 {%k1}
-; AVX512-NEXT: vpermt2d %xmm29, %xmm16, %xmm4
-; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm22[2],xmm24[2],xmm22[3],xmm24[3]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0,1],xmm6[2,3]
-; AVX512-NEXT: vmovdqa64 %ymm18, %ymm7
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm15 = ymm6[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5],ymm7[6,7]
+; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm1 {%k1}
+; AVX512-NEXT: vmovdqa64 %ymm25, %ymm6
+; AVX512-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm7 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm7 = xmm6[0,1],mem[2,3]
+; AVX512-NEXT: vmovdqa64 %ymm23, %ymm6
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm14 = ymm6[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512-NEXT: vmovdqa64 %ymm19, %ymm6
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm15 = ymm6[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm13 = ymm13[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm13[5],ymm10[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm14[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
+; AVX512-NEXT: vinserti64x4 $0, %ymm7, %zmm1, %zmm1
+; AVX512-NEXT: vmovdqa64 %ymm22, %ymm6
+; AVX512-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm7 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm7 = xmm6[0,1],mem[2,3]
+; AVX512-NEXT: vinserti32x4 $2, %xmm7, %zmm0, %zmm7
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7]
+; AVX512-NEXT: vmovdqa64 %ymm20, %ymm6
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5],ymm3[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm8[6,7]
+; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm7 {%k1}
+; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm16[2],xmm24[2],xmm16[3],xmm24[3]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
; AVX512-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7]
-; AVX512-NEXT: vmovdqa64 %ymm31, %ymm4
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm5 = ymm12[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5],ymm2[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512-NEXT: vmovaps %zmm3, 64(%rsi)
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
@@ -13544,8 +13556,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vmovaps %zmm3, (%rax)
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: vmovdqa64 %zmm0, 64(%rax)
-; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT: vmovaps %zmm0, (%rax)
+; AVX512-NEXT: vmovdqa64 %zmm17, (%rax)
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: vmovdqa64 %zmm2, 64(%rax)
; AVX512-NEXT: vmovdqa64 %zmm1, (%rax)
@@ -13555,21 +13566,22 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
;
; AVX512-FCP-LABEL: load_i16_stride8_vf64:
; AVX512-FCP: # %bb.0:
-; AVX512-FCP-NEXT: subq $2312, %rsp # imm = 0x908
+; AVX512-FCP-NEXT: subq $2440, %rsp # imm = 0x988
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,0,8]
; AVX512-FCP-NEXT: vmovdqa 368(%rdi), %xmm0
; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vmovdqa 352(%rdi), %xmm1
-; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-FCP-NEXT: vmovdqa 352(%rdi), %xmm2
+; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; AVX512-FCP-NEXT: vmovdqa 336(%rdi), %xmm0
; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vmovdqa 320(%rdi), %xmm1
-; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,0,0,4]
-; AVX512-FCP-NEXT: vmovdqa %xmm14, %xmm0
-; AVX512-FCP-NEXT: vpermt2d %xmm2, %xmm5, %xmm0
-; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm24
+; AVX512-FCP-NEXT: vmovdqa 320(%rdi), %xmm2
+; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpermt2d %ymm3, %ymm1, %ymm0
+; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm24
+; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm4
; AVX512-FCP-NEXT: vmovdqa 304(%rdi), %xmm1
; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %xmm2
@@ -13581,9 +13593,9 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm30
-; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm17
+; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm18
+; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm29
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
; AVX512-FCP-NEXT: vmovdqa 480(%rdi), %ymm1
@@ -13607,12 +13619,12 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-FCP-NEXT: vmovdqa 384(%rdi), %ymm2
; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,1,0,2]
-; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm2[0,1,0,2]
+; AVX512-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[0,2,2,3,4,6,6,7]
; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[0,2,2,3,4,6,6,7]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[0,2,2,3,4,6,6,7]
; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
@@ -13630,24 +13642,25 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm2
; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512-FCP-NEXT: vmovdqa %xmm2, %xmm1
-; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm20
-; AVX512-FCP-NEXT: vpermt2d %xmm3, %xmm5, %xmm1
-; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm19
-; AVX512-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT: vmovdqa %ymm2, %ymm1
+; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm25
+; AVX512-FCP-NEXT: vpermt2d %ymm3, %ymm4, %ymm1
+; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm27
+; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm9
; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm2
; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX512-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm4
; AVX512-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vmovdqa 48(%rdi), %xmm6
-; AVX512-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
-; AVX512-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm31
+; AVX512-FCP-NEXT: vmovdqa 48(%rdi), %xmm5
+; AVX512-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %ymm2
; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -13659,22 +13672,22 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4]
; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm28
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm4[0,1,2,0,4,5,6,4]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm10[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm30
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm4[0,1,2,0,4,5,6,4]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm8[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm3
; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[0,1,0,2]
-; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill
; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm3
; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm3[0,1,0,2]
-; AVX512-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm4[0,2,2,3,4,6,6,7]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm8[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm6[0,2,2,3,4,6,6,7]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm3[0,1,0,2]
+; AVX512-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm4[0,2,2,3,4,6,6,7]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm7[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm5[0,2,2,3,4,6,6,7]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm6[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
@@ -13684,211 +13697,216 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-FCP-NEXT: vmovdqa 864(%rdi), %xmm1
; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-FCP-NEXT: vmovdqa 848(%rdi), %xmm0
; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-FCP-NEXT: vmovdqa 832(%rdi), %xmm1
; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-FCP-NEXT: vmovdqa %xmm1, %xmm0
-; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm26
-; AVX512-FCP-NEXT: vmovdqa %xmm5, %xmm2
-; AVX512-FCP-NEXT: vpermt2d %xmm3, %xmm5, %xmm0
-; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm27
+; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm28
+; AVX512-FCP-NEXT: vpermt2d %ymm2, %ymm9, %ymm0
+; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm26
; AVX512-FCP-NEXT: vmovdqa 816(%rdi), %xmm1
; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vmovdqa 800(%rdi), %xmm3
-; AVX512-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; AVX512-FCP-NEXT: vmovdqa 800(%rdi), %xmm2
+; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX512-FCP-NEXT: vmovdqa 784(%rdi), %xmm1
; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vmovdqa 768(%rdi), %xmm3
-; AVX512-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm23
-; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm25
+; AVX512-FCP-NEXT: vmovdqa 768(%rdi), %xmm2
+; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm21
+; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm23
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm9
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm12
; AVX512-FCP-NEXT: vmovdqa 992(%rdi), %ymm0
; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,2]
-; AVX512-FCP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill
+; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-FCP-NEXT: vmovdqa 960(%rdi), %ymm0
; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,1,0,2]
-; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm1[0,1,2,0,4,5,6,4]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm3[0,1,2,0,4,5,6,4]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm5[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,2]
+; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm1[0,1,2,0,4,5,6,4]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm2[0,1,2,0,4,5,6,4]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
; AVX512-FCP-NEXT: vmovdqa 928(%rdi), %ymm1
; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,1,0,2]
-; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,1,0,2]
+; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-FCP-NEXT: vmovdqa 896(%rdi), %ymm1
; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm1[0,1,0,2]
-; AVX512-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,2,2,3,4,6,6,7]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm4[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm11[0,2,2,3,4,6,6,7]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm12 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4],ymm1[5],ymm12[6,7]
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm1[0,1,0,2]
+; AVX512-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm2[0,2,2,3,4,6,6,7]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm3[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[0,2,2,3,4,6,6,7]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm10 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4],ymm1[5],ymm10[6,7]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm9 {%k1}
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm12 {%k1}
; AVX512-FCP-NEXT: vmovdqa 624(%rdi), %xmm0
; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-FCP-NEXT: vmovdqa 608(%rdi), %xmm1
; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-FCP-NEXT: vmovdqa 592(%rdi), %xmm0
; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-FCP-NEXT: vmovdqa 576(%rdi), %xmm1
; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-FCP-NEXT: vmovdqa %xmm1, %xmm0
-; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm29
-; AVX512-FCP-NEXT: vpermt2d %xmm11, %xmm2, %xmm0
-; AVX512-FCP-NEXT: vmovdqa64 %xmm11, %xmm16
+; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm31
+; AVX512-FCP-NEXT: vpermt2d %ymm10, %ymm9, %ymm0
+; AVX512-FCP-NEXT: vmovdqa64 %ymm10, %ymm17
; AVX512-FCP-NEXT: vmovdqa 560(%rdi), %xmm1
; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vmovdqa 544(%rdi), %xmm2
-; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512-FCP-NEXT: vmovdqa 544(%rdi), %xmm9
+; AVX512-FCP-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3]
; AVX512-FCP-NEXT: vmovdqa 528(%rdi), %xmm1
; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vmovdqa 512(%rdi), %xmm2
-; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm11[0],xmm2[1],xmm11[1]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm22
-; AVX512-FCP-NEXT: vmovdqa64 %xmm11, %xmm21
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm1[0,1],xmm0[2,3]
+; AVX512-FCP-NEXT: vmovdqa 512(%rdi), %xmm9
+; AVX512-FCP-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3]
+; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm10[0],xmm9[1],xmm10[1]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm9, %ymm22
+; AVX512-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vmovdqa64 %ymm10, %ymm16
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm1[0,1],xmm0[2,3]
; AVX512-FCP-NEXT: vmovdqa 736(%rdi), %ymm0
; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,2]
; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-FCP-NEXT: vmovdqa 704(%rdi), %ymm0
; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,2]
-; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm0[0,1,0,2]
+; AVX512-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm18
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm13 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm0[7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm20
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm9[0,1,2,0,4,5,6,4]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm11 = ymm1[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm11[0,1,2,3,4,5,6],ymm0[7]
; AVX512-FCP-NEXT: vmovdqa 672(%rdi), %ymm0
; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,2]
-; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm0[0,1,0,2]
+; AVX512-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-FCP-NEXT: vmovdqa 640(%rdi), %ymm0
; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm0[0,1,0,2]
-; AVX512-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm1[0,2,2,3,4,6,6,7]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm15[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[0,2,2,3,4,6,6,7]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm11 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm0[5],ymm11[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm13[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0
-; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm13 = [1,5,0,0]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm30, %xmm0
-; AVX512-FCP-NEXT: vpermt2d %xmm17, %xmm13, %xmm0
-; AVX512-FCP-NEXT: vmovdqa64 %xmm17, %xmm30
-; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm14[0],xmm24[0],xmm14[1],xmm24[1]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm9[2,3]
-; AVX512-FCP-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm9 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512-FCP-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm11 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7]
-; AVX512-FCP-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm11 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm12 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7]
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm0 {%k1}
-; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm12
-; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm9
-; AVX512-FCP-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm17 # 16-byte Reload
-; AVX512-FCP-NEXT: vpermt2d %xmm17, %xmm13, %xmm9
-; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm20[0],xmm19[0],xmm20[1],xmm19[1]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1],xmm11[2,3]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm11
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5],ymm7[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm10[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm0
-; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %xmm23, %xmm9
-; AVX512-FCP-NEXT: vmovdqa64 %xmm23, %xmm0
-; AVX512-FCP-NEXT: vmovdqa64 %xmm25, %xmm11
-; AVX512-FCP-NEXT: vpermt2d %xmm25, %xmm13, %xmm0
-; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm8
-; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm26[0],xmm27[0],xmm26[1],xmm27[1]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm26, %xmm19
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7]
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 {%k1}
-; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm3
-; AVX512-FCP-NEXT: vpermt2d %xmm21, %xmm13, %xmm3
-; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm29[0],xmm16[0],xmm29[1],xmm16[1]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm29, %xmm6
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm4
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,2]
+; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm9[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm9, %ymm19
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm0[0,2,2,3,4,6,6,7]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm9 = ymm14[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm15[5],ymm9[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm13[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm9
+; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [1,9,0,0]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm9
+; AVX512-FCP-NEXT: vpermt2d %ymm29, %ymm10, %ymm9
+; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm15
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm24[0],xmm11[1],xmm24[1]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1],xmm10[2,3]
+; AVX512-FCP-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: # ymm10 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512-FCP-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: # ymm12 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5,6],ymm10[7]
+; AVX512-FCP-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: # ymm12 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: # ymm13 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4],ymm12[5],ymm13[6,7]
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm9, %zmm0, %zmm9
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5],ymm10[6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm9 {%k1}
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX512-FCP-NEXT: vmovdqa %ymm13, %ymm10
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Reload
+; AVX512-FCP-NEXT: vpermt2d %ymm18, %ymm15, %ymm10
+; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm25[0],xmm27[0],xmm25[1],xmm27[1]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],xmm12[2,3]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm0
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm12 = ymm0[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm12[7]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm8[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm9, %zmm6
+; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm9
+; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm6
+; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm21
+; AVX512-FCP-NEXT: vpermt2d %ymm23, %ymm15, %ymm6
+; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm8
+; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm28[0],xmm8[0],xmm28[1],xmm8[1]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm12
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm3
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm3 {%k1}
+; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm2
+; AVX512-FCP-NEXT: vpermt2d %ymm16, %ymm15, %ymm2
+; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm22
+; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm31[0],xmm17[0],xmm31[1],xmm17[1]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm31, %ymm6
+; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm15
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm0
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm0
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm14[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa %xmm14, %xmm0
-; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm14[2],xmm24[2],xmm14[3],xmm24[3]
+; AVX512-FCP-NEXT: vmovdqa %ymm11, %ymm0
+; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm11[2],xmm24[2],xmm11[3],xmm24[3]
; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,0,2,6]
-; AVX512-FCP-NEXT: vpermt2d %xmm24, %xmm5, %xmm0
-; AVX512-FCP-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm31 # 16-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %xmm30, %xmm28
-; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm31[2],xmm30[2],xmm31[3],xmm30[3]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,2,10]
+; AVX512-FCP-NEXT: vpermt2d %ymm24, %ymm1, %ymm0
+; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm5
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload
+; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm16[2],xmm29[2],xmm16[3],xmm29[3]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
-; AVX512-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7]
-; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
+; AVX512-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm31
; AVX512-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX512-FCP-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7]
; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm29
+; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm30
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
; AVX512-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX512-FCP-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7]
; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm30
+; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm28
; AVX512-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
; AVX512-FCP-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7]
; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
@@ -13896,25 +13914,25 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
-; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm1
-; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm26 = xmm20[2],xmm2[2],xmm20[3],xmm2[3]
-; AVX512-FCP-NEXT: vpermt2d %xmm2, %xmm5, %xmm1
-; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm12[2],xmm17[2],xmm12[3],xmm17[3]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm12, %xmm25
-; AVX512-FCP-NEXT: vmovdqa64 %xmm17, %xmm23
+; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm1
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm26 = xmm25[2],xmm2[2],xmm25[3],xmm2[3]
+; AVX512-FCP-NEXT: vpermt2d %ymm2, %ymm5, %ymm1
+; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm13[2],xmm18[2],xmm13[3],xmm18[3]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm25
+; AVX512-FCP-NEXT: vmovdqa64 %ymm13, %ymm23
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX512-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX512-FCP-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7]
; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm18
-; AVX512-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm13 = mem[0,1,1,3,4,5,5,7]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm13[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: # ymm11 = mem[0,1,1,3,4,5,5,7]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm11[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm12 = mem[3,1,2,3,7,5,6,7]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm12[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufd $231, (%rsp), %ymm14 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: # ymm14 = mem[3,1,2,3,7,5,6,7]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm14[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
; AVX512-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
; AVX512-FCP-NEXT: # ymm10 = mem[3,1,2,3,7,5,6,7]
; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm10[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
@@ -13923,20 +13941,21 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm0
-; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm17 = xmm19[2],xmm8[2],xmm19[3],xmm8[3]
-; AVX512-FCP-NEXT: vpermt2d %xmm8, %xmm5, %xmm0
-; AVX512-FCP-NEXT: vmovdqa64 %xmm9, %xmm19
-; AVX512-FCP-NEXT: vmovdqa64 %xmm11, %xmm20
-; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm9[2],xmm11[2],xmm9[3],xmm11[3]
+; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm2
+; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm0
+; AVX512-FCP-NEXT: vmovdqa %ymm8, %ymm1
+; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm17 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512-FCP-NEXT: vpermt2d %ymm8, %ymm5, %ymm0
+; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm9[2],xmm21[2],xmm9[3],xmm21[3]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm9, %ymm19
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm24
-; AVX512-FCP-NEXT: vpshufd $212, (%rsp), %ymm9 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
; AVX512-FCP-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7]
; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm11 = mem[0,1,1,3,4,5,5,7]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm11[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: # ymm12 = mem[0,1,1,3,4,5,5,7]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
; AVX512-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
; AVX512-FCP-NEXT: # ymm8 = mem[3,1,2,3,7,5,6,7]
@@ -13947,10 +13966,12 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm24 {%k1}
-; AVX512-FCP-NEXT: vmovdqa %xmm6, %xmm0
-; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm14 = xmm6[2],xmm16[2],xmm6[3],xmm16[3]
-; AVX512-FCP-NEXT: vpermt2d %xmm16, %xmm5, %xmm0
-; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm22[2],xmm21[2],xmm22[3],xmm21[3]
+; AVX512-FCP-NEXT: vmovdqa %ymm6, %ymm0
+; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm13 = xmm6[2],xmm15[2],xmm6[3],xmm15[3]
+; AVX512-FCP-NEXT: vpermt2d %ymm15, %ymm5, %ymm0
+; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm20
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Reload
+; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm22[2],xmm20[2],xmm22[3],xmm20[3]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3]
; AVX512-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
; AVX512-FCP-NEXT: # ymm6 = mem[0,1,1,3,4,5,5,7]
@@ -13970,55 +13991,56 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm16 = [3,7,0,0]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm1
-; AVX512-FCP-NEXT: vpermt2d %xmm28, %xmm16, %xmm1
-; AVX512-FCP-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
-; AVX512-FCP-NEXT: # xmm0 = xmm1[0,1],mem[2,3]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [3,11,3,3]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm0
+; AVX512-FCP-NEXT: vpermt2d %ymm29, %ymm1, %ymm0
+; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm16
+; AVX512-FCP-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512-FCP-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
-; AVX512-FCP-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm1 = mem[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm29, %ymm3
+; AVX512-FCP-NEXT: vmovdqa64 %ymm31, %ymm1
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm3
; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm3
+; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm3
; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm15
; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3,4],ymm3[5],ymm15[6,7]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
-; AVX512-FCP-NEXT: vmovdqa64 %xmm25, %xmm1
-; AVX512-FCP-NEXT: vpermt2d %xmm23, %xmm16, %xmm1
+; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm1
+; AVX512-FCP-NEXT: vpermt2d %ymm25, %ymm16, %ymm1
; AVX512-FCP-NEXT: vmovdqa64 %xmm26, %xmm3
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm3
; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5,6],ymm3[7]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2,3,4,5,6],ymm3[7]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm11 = ymm14[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5],ymm10[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm11[5],ymm10[6,7]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1,2,3,4,5],ymm3[6,7]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm0
-; AVX512-FCP-NEXT: vpermt2d %xmm20, %xmm16, %xmm0
+; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm0
+; AVX512-FCP-NEXT: vpermt2d %ymm21, %ymm16, %ymm0
; AVX512-FCP-NEXT: vmovdqa64 %xmm17, %xmm1
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm11[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm12[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm8[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4],ymm3[5],ymm7[6,7]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
-; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm1
-; AVX512-FCP-NEXT: vpermt2d %xmm21, %xmm16, %xmm1
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm1
+; AVX512-FCP-NEXT: vpermt2d %ymm20, %ymm16, %ymm1
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3]
; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,6],ymm3[7]
@@ -14033,21 +14055,22 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
; AVX512-FCP-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
+; AVX512-FCP-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload
; AVX512-FCP-NEXT: # xmm9 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
; AVX512-FCP-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
-; AVX512-FCP-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512-FCP-NEXT: vmovdqa %xmm9, %xmm0
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,0,0,4]
-; AVX512-FCP-NEXT: vpermt2d %xmm4, %xmm5, %xmm0
-; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm29
-; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm23
-; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm19
+; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,0,0,8]
+; AVX512-FCP-NEXT: vpermt2d %ymm4, %ymm5, %ymm0
+; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm18
+; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm9[0],xmm2[1],xmm9[1]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm24
+; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
@@ -14070,7 +14093,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-FCP-NEXT: # ymm3 = mem[0,1,1,3]
; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill
; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -14085,18 +14108,19 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload
; AVX512-FCP-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload
-; AVX512-FCP-NEXT: # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
-; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload
; AVX512-FCP-NEXT: # xmm4 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
-; AVX512-FCP-NEXT: vmovdqa %xmm2, %xmm1
-; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm27
-; AVX512-FCP-NEXT: vpermt2d %xmm6, %xmm5, %xmm1
-; AVX512-FCP-NEXT: vmovdqa64 %xmm6, %xmm18
-; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm22
-; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm24
+; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload
+; AVX512-FCP-NEXT: # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
+; AVX512-FCP-NEXT: vmovdqa %ymm2, %ymm1
+; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm22
+; AVX512-FCP-NEXT: vpermt2d %ymm6, %ymm5, %ymm1
+; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm31
+; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm27
+; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm16
+; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX512-FCP-NEXT: # ymm2 = mem[0,1,1,3]
@@ -14104,12 +14128,11 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX512-FCP-NEXT: # ymm3 = mem[0,1,1,3]
; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
-; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,1,2,0,4,5,6,4]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm2[0,1,2,0,4,5,6,4]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm26
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm3[0,1,2,0,4,5,6,4]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm12[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX512-FCP-NEXT: # ymm3 = mem[0,1,1,3]
@@ -14133,29 +14156,29 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
; AVX512-FCP-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
-; AVX512-FCP-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
; AVX512-FCP-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512-FCP-NEXT: vmovdqa %xmm1, %xmm0
-; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm31
-; AVX512-FCP-NEXT: vpermt2d %xmm4, %xmm5, %xmm0
-; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm20
-; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm21
-; AVX512-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm25
+; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
+; AVX512-FCP-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm21
+; AVX512-FCP-NEXT: vpermt2d %ymm4, %ymm5, %ymm0
+; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm23
+; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm30
+; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm19
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm12
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm13
; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; AVX512-FCP-NEXT: # ymm0 = mem[0,1,1,3]
; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
; AVX512-FCP-NEXT: # ymm1 = mem[0,1,1,3]
; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm0[0,1,2,0,4,5,6,4]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[0,1,2,0,4,5,6,4]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm20
; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm1[0,1,2,0,4,5,6,4]
; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm8[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
@@ -14171,7 +14194,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm12 {%k1}
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm13 {%k1}
; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
; AVX512-FCP-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
@@ -14179,20 +14202,21 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
; AVX512-FCP-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm13 # 16-byte Folded Reload
-; AVX512-FCP-NEXT: # xmm13 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
; AVX512-FCP-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512-FCP-NEXT: vmovdqa %xmm5, %xmm1
-; AVX512-FCP-NEXT: vpermi2d %xmm2, %xmm3, %xmm1
-; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm17
-; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm16
-; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm13[0],xmm4[1],xmm13[1]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm13, %xmm28
-; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm30
-; AVX512-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm0[0,1],xmm1[2,3]
+; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload
+; AVX512-FCP-NEXT: # xmm14 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm1
+; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm3, %ymm1
+; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm28
+; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm29
+; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm4[0],xmm14[1],xmm4[1]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm14, %ymm25
+; AVX512-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm15 = xmm0[0,1],xmm1[2,3]
; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; AVX512-FCP-NEXT: # ymm0 = mem[0,1,1,3]
; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -14201,8 +14225,9 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[0,1,2,0,4,5,6,4]
; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[0,1,2,0,4,5,6,4]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[0,1,2,0,4,5,6,4]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm17
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm0[7]
; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; AVX512-FCP-NEXT: # ymm0 = mem[0,1,1,3]
@@ -14213,41 +14238,42 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,2,2,3,4,6,6,7]
; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm3[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm14 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4],ymm0[5],ymm14[6,7]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %xmm23, %xmm0
-; AVX512-FCP-NEXT: vmovdqa64 %xmm23, (%rsp) # 16-byte Spill
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm15 = [1,5,0,0]
-; AVX512-FCP-NEXT: vpermt2d %xmm19, %xmm15, %xmm0
-; AVX512-FCP-NEXT: vmovdqa64 %xmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm29[0],xmm9[1],xmm29[1]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm0
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm15 = [1,9,0,0]
+; AVX512-FCP-NEXT: vpermt2d %ymm9, %ymm15, %ymm0
+; AVX512-FCP-NEXT: vmovdqa64 %ymm9, %ymm24
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm18[0],xmm9[1],xmm18[1]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
; AVX512-FCP-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
; AVX512-FCP-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512-FCP-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm12 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,6],ymm1[7]
-; AVX512-FCP-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm12 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm13 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4],ymm12[5],ymm13[6,7]
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],ymm1[6,7]
+; AVX512-FCP-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: # ymm13 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5,6],ymm1[7]
+; AVX512-FCP-NEXT: vpshuflw $212, (%rsp), %ymm13 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: # ymm13 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: # ymm14 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5],ymm14[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm1[6,7]
; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
-; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm1
-; AVX512-FCP-NEXT: vpermt2d %xmm24, %xmm15, %xmm1
-; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm27[0],xmm18[0],xmm27[1],xmm18[1]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm12[2,3]
-; AVX512-FCP-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm12 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm1
+; AVX512-FCP-NEXT: vpermt2d %ymm27, %ymm15, %ymm1
+; AVX512-FCP-NEXT: vmovdqa %ymm15, %ymm14
+; AVX512-FCP-NEXT: vmovdqa64 %ymm31, %ymm15
+; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm22[0],xmm15[0],xmm22[1],xmm15[1]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3]
; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm13
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5,6],ymm12[7]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm13[7]
; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm11[5],ymm10[6,7]
@@ -14255,13 +14281,14 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %xmm21, %xmm0
-; AVX512-FCP-NEXT: vpermt2d %xmm25, %xmm15, %xmm0
-; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm13
-; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm31[0],xmm20[0],xmm31[1],xmm20[1]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm21
+; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm0
+; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm12
+; AVX512-FCP-NEXT: vpermt2d %ymm30, %ymm14, %ymm0
+; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm13
+; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm21[0],xmm13[0],xmm21[1],xmm13[1]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm1
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5,6],ymm1[7]
; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
@@ -14270,12 +14297,13 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
-; AVX512-FCP-NEXT: vmovdqa %xmm15, %xmm6
-; AVX512-FCP-NEXT: vpermi2d %xmm28, %xmm30, %xmm6
-; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm17[0],xmm16[0],xmm17[1],xmm16[1]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm16, %xmm7
+; AVX512-FCP-NEXT: vmovdqa %ymm14, %ymm6
+; AVX512-FCP-NEXT: vpermi2d %ymm4, %ymm25, %ymm6
+; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm20
+; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm28[0],xmm29[0],xmm28[1],xmm29[1]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3]
; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm4
; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7]
; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
@@ -14285,27 +14313,28 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa %xmm9, %xmm0
-; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm9[2],xmm29[2],xmm9[3],xmm29[3]
-; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,0,2,6]
-; AVX512-FCP-NEXT: vpermt2d %xmm29, %xmm5, %xmm0
-; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm23[2],xmm19[2],xmm23[3],xmm19[3]
+; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm0
+; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm28 = xmm9[2],xmm18[2],xmm9[3],xmm18[3]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,0,2,10]
+; AVX512-FCP-NEXT: vpermt2d %ymm18, %ymm5, %ymm0
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm6
+; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm11[2],xmm6[2],xmm11[3],xmm6[3]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
; AVX512-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX512-FCP-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7]
; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm31
+; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm25
; AVX512-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX512-FCP-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7]
; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm30
+; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm31
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
; AVX512-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX512-FCP-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7]
; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm26
+; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm24
; AVX512-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
; AVX512-FCP-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7]
; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
@@ -14313,135 +14342,141 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
-; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm1
-; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm20 = xmm27[2],xmm18[2],xmm27[3],xmm18[3]
-; AVX512-FCP-NEXT: vpermt2d %xmm18, %xmm5, %xmm1
-; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm22[2],xmm24[2],xmm22[3],xmm24[3]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm19
-; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm18
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
-; AVX512-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm12 = mem[0,1,1,3,4,5,5,7]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm12[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm1
+; AVX512-FCP-NEXT: vmovdqa %ymm15, %ymm2
+; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm26 = xmm22[2],xmm2[2],xmm22[3],xmm2[3]
+; AVX512-FCP-NEXT: vpermt2d %ymm15, %ymm5, %ymm1
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm16
+; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm9[2],xmm16[2],xmm9[3],xmm16[3]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX512-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm27
+; AVX512-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: # ymm4 = mem[0,1,1,3,4,5,5,7]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm22
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
+; AVX512-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm19
; AVX512-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
; AVX512-FCP-NEXT: # ymm10 = mem[3,1,2,3,7,5,6,7]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm11 = mem[3,1,2,3,7,5,6,7]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm10[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
-; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %xmm21, %xmm0
-; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm16 = xmm21[2],xmm13[2],xmm21[3],xmm13[3]
-; AVX512-FCP-NEXT: vpermt2d %xmm13, %xmm5, %xmm0
-; AVX512-FCP-NEXT: vmovdqa64 %xmm25, %xmm22
-; AVX512-FCP-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm24 # 16-byte Reload
-; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm24[2],xmm25[2],xmm24[3],xmm25[3]
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm29
+; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm0
+; AVX512-FCP-NEXT: vmovdqa %ymm13, %ymm1
+; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm18 = xmm21[2],xmm1[2],xmm21[3],xmm1[3]
+; AVX512-FCP-NEXT: vpermt2d %ymm13, %ymm5, %ymm0
+; AVX512-FCP-NEXT: vmovdqa64 %ymm12, %ymm21
+; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm21[2],xmm30[2],xmm21[3],xmm30[3]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm21
-; AVX512-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm14 = mem[0,1,1,3,4,5,5,7]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
; AVX512-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
; AVX512-FCP-NEXT: # ymm15 = mem[0,1,1,3,4,5,5,7]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm15[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
-; AVX512-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm6 = mem[3,1,2,3,7,5,6,7]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm6[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5],ymm3[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm21 {%k1}
-; AVX512-FCP-NEXT: vmovdqa %xmm5, %xmm1
-; AVX512-FCP-NEXT: vmovdqa64 %xmm7, %xmm25
-; AVX512-FCP-NEXT: vmovdqa64 %xmm17, %xmm27
-; AVX512-FCP-NEXT: vpermi2d %xmm7, %xmm17, %xmm1
-; AVX512-FCP-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm29 # 16-byte Reload
-; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm29[2],xmm28[2],xmm29[3],xmm28[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0,1],xmm1[2,3]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
; AVX512-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
; AVX512-FCP-NEXT: # ymm8 = mem[0,1,1,3,4,5,5,7]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm7 = mem[0,1,1,3,4,5,5,7]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm7[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
-; AVX512-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm5 = mem[3,1,2,3,7,5,6,7]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm5[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm13 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4],ymm0[5],ymm13[6,7]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm8[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX512-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: # ymm7 = mem[3,1,2,3,7,5,6,7]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm7[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: # ymm14 = mem[3,1,2,3,7,5,6,7]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm13 = ymm14[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3,4],ymm2[5],ymm13[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm17
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm17 {%k1}
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [3,11,3,3]
+; AVX512-FCP-NEXT: vpermt2d %ymm6, %ymm4, %ymm11
+; AVX512-FCP-NEXT: vmovdqa %ymm11, %ymm6
+; AVX512-FCP-NEXT: vpermt2d %ymm16, %ymm4, %ymm9
+; AVX512-FCP-NEXT: vpermt2d %ymm30, %ymm4, %ymm21
+; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm30
+; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm0
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512-FCP-NEXT: vpermi2d %ymm20, %ymm1, %ymm4
+; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm1
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Reload
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Reload
+; AVX512-FCP-NEXT: vpermi2d %ymm20, %ymm21, %ymm1
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],xmm1[2,3]
+; AVX512-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: # ymm13 = mem[0,1,1,3,4,5,5,7]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
+; AVX512-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX512-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: # ymm12 = mem[3,1,2,3,7,5,6,7]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm12[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm11 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm0[5],ymm11[6,7]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm17 = [3,7,0,0]
-; AVX512-FCP-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; AVX512-FCP-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %xmm17, %xmm0 # 16-byte Folded Reload
-; AVX512-FCP-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
-; AVX512-FCP-NEXT: # xmm1 = xmm0[0,1],mem[2,3]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm16
+; AVX512-FCP-NEXT: vmovdqa64 %xmm28, %xmm0
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm6[0,1],xmm0[2,3]
; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1
+; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm0
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
; AVX512-FCP-NEXT: vmovdqa64 %ymm31, %ymm0
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm0
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm13 = ymm0[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5,6],ymm2[7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm0
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm13 = ymm0[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm11 = ymm0[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2,3,4,5,6],ymm3[7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm0
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm11 = ymm0[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm0
; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5],ymm0[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5],ymm0[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm1 {%k1}
-; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm0
-; AVX512-FCP-NEXT: vpermt2d %xmm18, %xmm17, %xmm0
-; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm2
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm12[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5,6],ymm2[7]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm9 = ymm10[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm10 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm26, %xmm0
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1],xmm0[2,3]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm3
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm6
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm9 = ymm6[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5,6],ymm3[7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm6
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm9 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm2[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm3[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm1
-; AVX512-FCP-NEXT: vpermt2d %xmm22, %xmm17, %xmm1
-; AVX512-FCP-NEXT: vmovdqa64 %xmm16, %xmm2
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm18, %xmm1
+; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm3
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm9 = ymm15[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5,6],ymm2[7]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5],ymm4[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm1 {%k1}
-; AVX512-FCP-NEXT: vmovdqa64 %xmm29, %xmm4
-; AVX512-FCP-NEXT: vpermt2d %xmm28, %xmm17, %xmm4
-; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm27[2],xmm25[2],xmm27[3],xmm25[3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm6 = ymm7[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5,6],ymm4[7]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5],ymm3[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm15[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5,6],ymm3[7]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm6 = ymm14[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm1 {%k1}
+; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm21[2],xmm20[2],xmm21[3],xmm20[3]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm6 = ymm13[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm12[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1
; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512-FCP-NEXT: vmovaps %zmm2, 64(%rsi)
@@ -14469,52 +14504,52 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512-FCP-NEXT: vmovaps %zmm2, (%rax)
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT: vmovdqa64 %zmm21, 64(%rax)
-; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovaps %zmm2, (%rax)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 64(%rax)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm29, (%rax)
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
-; AVX512-FCP-NEXT: addq $2312, %rsp # imm = 0x908
+; AVX512-FCP-NEXT: addq $2440, %rsp # imm = 0x988
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: load_i16_stride8_vf64:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: subq $2408, %rsp # imm = 0x968
-; AVX512DQ-NEXT: vmovdqa 368(%rdi), %xmm1
-; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT: vmovdqa 352(%rdi), %xmm0
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,0,8]
+; AVX512DQ-NEXT: vmovdqa 368(%rdi), %xmm0
; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512DQ-NEXT: vmovdqa 336(%rdi), %xmm2
-; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT: vmovdqa 320(%rdi), %xmm1
-; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm6 = [0,0,0,4]
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX512DQ-NEXT: vmovdqa 352(%rdi), %xmm2
; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,2,2]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3]
-; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; AVX512DQ-NEXT: vmovdqa 336(%rdi), %xmm0
+; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT: vmovdqa 320(%rdi), %xmm2
; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT: vpermt2d %xmm0, %xmm6, %xmm1
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,2,2,2]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3]
+; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT: vmovdqa %ymm1, %ymm5
+; AVX512DQ-NEXT: vpermt2d %ymm3, %ymm1, %ymm4
; AVX512DQ-NEXT: vmovdqa 304(%rdi), %xmm0
; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT: vmovdqa 288(%rdi), %xmm2
-; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; AVX512DQ-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT: vmovdqa 288(%rdi), %xmm1
+; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-NEXT: vmovdqa 272(%rdi), %xmm0
; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT: vmovdqa 256(%rdi), %xmm2
-; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm31
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512DQ-NEXT: vmovdqa 256(%rdi), %xmm1
+; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm31
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3]
; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
; AVX512DQ-NEXT: vmovdqa 480(%rdi), %ymm1
; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -14537,12 +14572,12 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-NEXT: vmovdqa 384(%rdi), %ymm2
; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm2[0,1,0,2]
-; AVX512DQ-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,1,0,2]
+; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[0,2,2,3,4,6,6,7]
; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[0,2,2,3,4,6,6,7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[0,2,2,3,4,6,6,7]
; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
@@ -14567,7 +14602,8 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX512DQ-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT: vpermt2d %xmm1, %xmm6, %xmm2
+; AVX512DQ-NEXT: vpermt2d %ymm1, %ymm5, %ymm2
+; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm6
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1
; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm3
@@ -14577,10 +14613,10 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm5
; AVX512DQ-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; AVX512DQ-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm17
+; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm17
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
; AVX512DQ-NEXT: vmovdqa 224(%rdi), %ymm2
; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -14627,11 +14663,11 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3]
; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill
; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT: vpermt2d %xmm0, %xmm6, %xmm1
-; AVX512DQ-NEXT: vmovdqa %xmm6, %xmm10
+; AVX512DQ-NEXT: vpermt2d %ymm0, %ymm6, %ymm1
+; AVX512DQ-NEXT: vmovdqa %ymm6, %ymm10
; AVX512DQ-NEXT: vmovdqa 816(%rdi), %xmm0
; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-NEXT: vmovdqa 800(%rdi), %xmm2
@@ -14643,8 +14679,8 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm19
-; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm20
+; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm19
+; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm20
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
; AVX512DQ-NEXT: vmovdqa 992(%rdi), %ymm1
@@ -14686,11 +14722,11 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[2,2,2,2]
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0,1,2],xmm9[3]
-; AVX512DQ-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill
+; AVX512DQ-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm16 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX512DQ-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT: vpermt2d %xmm1, %xmm10, %xmm2
+; AVX512DQ-NEXT: vpermt2d %ymm1, %ymm10, %ymm2
; AVX512DQ-NEXT: vmovdqa 560(%rdi), %xmm1
; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-NEXT: vmovdqa 544(%rdi), %xmm5
@@ -14705,12 +14741,12 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm9 = xmm1[0,1],xmm2[2,3]
; AVX512DQ-NEXT: vmovdqa 736(%rdi), %ymm1
; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm29 = ymm1[0,1,0,2]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm30 = ymm1[0,1,0,2]
; AVX512DQ-NEXT: vmovdqa 704(%rdi), %ymm1
; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2]
; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm29[0,1,2,0,4,5,6,4]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm30[0,1,2,0,4,5,6,4]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm10 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm11 = ymm1[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
@@ -14720,10 +14756,10 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm28 = ymm5[0,1,0,2]
; AVX512DQ-NEXT: vmovdqa 640(%rdi), %ymm5
; AVX512DQ-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm30 = ymm5[0,1,0,2]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm29 = ymm5[0,1,0,2]
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm28[0,2,2,3,4,6,6,7]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm14 = ymm10[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm11 = ymm30[0,2,2,3,4,6,6,7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm11 = ymm29[0,2,2,3,4,6,6,7]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm5 = ymm11[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm14[5],ymm5[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm13[6,7]
@@ -14731,7 +14767,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm31[1,1,1,1]
-; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm14[1],xmm0[2,3]
; AVX512DQ-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX512DQ-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
@@ -14759,7 +14795,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3,4],ymm9[5],ymm13[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5],ymm5[6,7]
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm17[1,1,1,1]
-; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0],xmm13[1],xmm9[2,3]
; AVX512DQ-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
; AVX512DQ-NEXT: # xmm9 = xmm9[0,1],mem[2,3]
@@ -14767,9 +14803,9 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm19[1,1,1,1]
-; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm9
+; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm9
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm9[1],xmm0[2,3]
-; AVX512DQ-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512DQ-NEXT: vpblendd $12, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX512DQ-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
@@ -14781,8 +14817,8 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 {%k1}
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[1,1,1,1]
-; AVX512DQ-NEXT: vmovdqa %xmm8, %xmm5
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2,3]
+; AVX512DQ-NEXT: vmovdqa %ymm8, %ymm5
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm5[1],xmm3[2,3]
; AVX512DQ-NEXT: vmovdqa64 %xmm16, %xmm4
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
@@ -14796,7 +14832,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm31[2],xmm14[2],xmm31[3],xmm14[3]
-; AVX512DQ-NEXT: vmovdqa64 %xmm14, %xmm16
+; AVX512DQ-NEXT: vmovdqa64 %ymm14, %ymm16
; AVX512DQ-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX512DQ-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
@@ -14835,14 +14871,14 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm17[2],xmm13[2],xmm17[3],xmm13[3]
-; AVX512DQ-NEXT: vmovdqa64 %xmm13, %xmm22
+; AVX512DQ-NEXT: vmovdqa64 %ymm13, %ymm22
; AVX512DQ-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
; AVX512DQ-NEXT: # xmm2 = xmm2[0,1],mem[2,3]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm19[2],xmm20[2],xmm19[3],xmm20[3]
-; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm24
+; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm19[2],xmm9[2],xmm19[3],xmm9[3]
+; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm24
; AVX512DQ-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX512DQ-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm20
@@ -14862,10 +14898,10 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm20 {%k1}
; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm15[2],xmm5[2],xmm15[3],xmm5[3]
-; AVX512DQ-NEXT: vmovdqa64 %xmm5, %xmm18
-; AVX512DQ-NEXT: vpblendd $12, (%rsp), %xmm0, %xmm3 # 16-byte Folded Reload
+; AVX512DQ-NEXT: vmovdqa64 %ymm5, %ymm18
+; AVX512DQ-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
; AVX512DQ-NEXT: # xmm3 = xmm0[0,1],mem[2,3]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm29[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm30[0,1,1,3,4,5,5,7]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7]
@@ -14873,17 +14909,17 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm28[3,1,2,3,7,5,6,7]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm30[3,1,2,3,7,5,6,7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm29[3,1,2,3,7,5,6,7]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm13 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4],ymm0[5],ymm13[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [3,7,0,0]
-; AVX512DQ-NEXT: vmovdqa64 %xmm31, %xmm1
-; AVX512DQ-NEXT: vpermt2d %xmm16, %xmm0, %xmm1
-; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm16
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [3,11,0,0]
+; AVX512DQ-NEXT: vmovdqa64 %ymm31, %ymm1
+; AVX512DQ-NEXT: vpermt2d %ymm16, %ymm0, %ymm1
+; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm16
; AVX512DQ-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; AVX512DQ-NEXT: # xmm0 = xmm1[0,1],mem[2,3]
; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
@@ -14899,8 +14935,8 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4],ymm3[5],ymm13[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
-; AVX512DQ-NEXT: vmovdqa64 %xmm17, %xmm1
-; AVX512DQ-NEXT: vpermt2d %xmm22, %xmm16, %xmm1
+; AVX512DQ-NEXT: vmovdqa64 %ymm17, %ymm1
+; AVX512DQ-NEXT: vpermt2d %ymm22, %ymm16, %ymm1
; AVX512DQ-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX512DQ-NEXT: # xmm1 = xmm1[0,1],mem[2,3]
; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm3
@@ -14914,8 +14950,8 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vmovdqa64 %xmm19, %xmm0
-; AVX512DQ-NEXT: vpermt2d %xmm24, %xmm16, %xmm0
+; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm0
+; AVX512DQ-NEXT: vpermt2d %ymm24, %ymm16, %ymm0
; AVX512DQ-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX512DQ-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
@@ -14927,7 +14963,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5],ymm6[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
-; AVX512DQ-NEXT: vpermt2d %xmm18, %xmm16, %xmm15
+; AVX512DQ-NEXT: vpermt2d %ymm18, %ymm16, %ymm15
; AVX512DQ-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm1 # 16-byte Folded Reload
; AVX512DQ-NEXT: # xmm1 = xmm15[0,1],mem[2,3]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
@@ -14944,27 +14980,26 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload
; AVX512DQ-NEXT: # xmm5 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
-; AVX512DQ-NEXT: # xmm6 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[2,2,2,2]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3]
-; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
-; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm6[2],xmm5[2],xmm6[3],xmm5[3]
-; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm10 = [0,0,0,4]
-; AVX512DQ-NEXT: vpermt2d %xmm5, %xmm10, %xmm6
-; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
-; AVX512DQ-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512DQ-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[2,2,2,2]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
-; AVX512DQ-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm28
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3]
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
+; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm5[2],xmm0[3],xmm5[3]
+; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm6 = [0,0,0,8]
+; AVX512DQ-NEXT: vpermt2d %ymm5, %ymm6, %ymm0
+; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload
+; AVX512DQ-NEXT: # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
+; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm11 # 16-byte Folded Reload
+; AVX512DQ-NEXT: # xmm11 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm11[0],xmm3[0],xmm11[1],xmm3[1]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm1 = mem[0,1,1,3]
@@ -15007,16 +15042,16 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX512DQ-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT: vpermt2d %xmm1, %xmm10, %xmm2
+; AVX512DQ-NEXT: vpermt2d %ymm1, %ymm6, %ymm2
+; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload
+; AVX512DQ-NEXT: # xmm4 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload
; AVX512DQ-NEXT: # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
-; AVX512DQ-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; AVX512DQ-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
-; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm19
+; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm22
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm2 = mem[0,1,1,3]
@@ -15027,22 +15062,25 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,1,2,0,4,5,6,4]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
-; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm17
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm30 # 32-byte Folded Reload
-; AVX512DQ-NEXT: # ymm30 = mem[0,1,1,3]
-; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
-; AVX512DQ-NEXT: # ymm4 = mem[0,1,1,3]
-; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm30[0,2,2,3,4,6,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm9[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm4[0,2,2,3,4,6,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm4 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4]
+; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
+; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
+; AVX512DQ-NEXT: # ymm3 = mem[0,1,1,3]
+; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
+; AVX512DQ-NEXT: # ymm2 = mem[0,1,1,3]
+; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
+; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm5 = ymm3[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm8 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm17
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4],ymm5[5],ymm8[6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
@@ -15051,307 +15089,307 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX512DQ-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,2,2]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,2,2,2]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1,2],xmm4[3]
+; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm23 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT: vpermt2d %xmm0, %xmm10, %xmm1
+; AVX512DQ-NEXT: vpermt2d %ymm0, %ymm6, %ymm1
+; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload
+; AVX512DQ-NEXT: # xmm8 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
; AVX512DQ-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
-; AVX512DQ-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm16
-; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm20
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm8[0],xmm2[1],xmm8[1]
+; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm20
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm12
-; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
-; AVX512DQ-NEXT: # ymm0 = mem[0,1,1,3]
-; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX512DQ-NEXT: # ymm1 = mem[0,1,1,3]
-; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm0[0,1,2,0,4,5,6,4]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm1[0,1,2,0,4,5,6,4]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm5[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Folded Reload
-; AVX512DQ-NEXT: # ymm22 = mem[0,1,1,3]
+; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm5
+; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm28 # 32-byte Folded Reload
+; AVX512DQ-NEXT: # ymm28 = mem[0,1,1,3]
; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm29 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm29 = mem[0,1,1,3]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm22[0,2,2,3,4,6,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm1 = ymm4[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm29[0,2,2,3,4,6,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm8 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4],ymm1[5],ymm8[6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm12 {%k1}
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm28[0,1,2,0,4,5,6,4]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm29[0,1,2,0,4,5,6,4]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm9[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload
+; AVX512DQ-NEXT: # ymm27 = mem[0,1,1,3]
+; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload
+; AVX512DQ-NEXT: # ymm25 = mem[0,1,1,3]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm27[0,2,2,3,4,6,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm10[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm13 = ymm25[0,2,2,3,4,6,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm4 = ymm13[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm5 {%k1}
+; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
+; AVX512DQ-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
; AVX512DQ-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload
-; AVX512DQ-NEXT: # xmm8 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload
+; AVX512DQ-NEXT: # xmm7 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload
-; AVX512DQ-NEXT: # xmm11 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm13 # 16-byte Folded Reload
-; AVX512DQ-NEXT: # xmm13 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512DQ-NEXT: vmovdqa %xmm10, %xmm1
-; AVX512DQ-NEXT: vpermi2d %xmm2, %xmm8, %xmm1
-; AVX512DQ-NEXT: vmovdqa64 %xmm8, %xmm19
-; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm24
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm13[0],xmm11[0],xmm13[1],xmm11[1]
-; AVX512DQ-NEXT: vmovdqa64 %xmm13, %xmm31
-; AVX512DQ-NEXT: vmovdqa64 %xmm11, %xmm21
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm14 = xmm0[0,1],xmm1[2,3]
+; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
+; AVX512DQ-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512DQ-NEXT: vmovdqa %ymm6, %ymm0
+; AVX512DQ-NEXT: vpermi2d %ymm1, %ymm2, %ymm0
+; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm16
+; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm24
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm7[0],xmm4[1],xmm7[1]
+; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm31
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1],xmm0[2,3]
; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm1 = mem[0,1,1,3]
; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm0 = mem[0,1,1,3]
; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[0,1,2,0,4,5,6,4]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm8 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,1,2,0,4,5,6,4]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm11 = ymm1[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm15 = ymm11[0,1,2,3,4,5,6],ymm8[7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm1[0,1,2,0,4,5,6,4]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm15 = ymm0[0,1,2,0,4,5,6,4]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm15[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm2[7]
; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm0 = mem[0,1,1,3]
; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
-; AVX512DQ-NEXT: # ymm10 = mem[0,1,1,3]
-; AVX512DQ-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[0,2,2,3,4,6,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm8[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm11 = ymm10[0,2,2,3,4,6,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm13 = ymm11[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4],ymm0[5],ymm13[6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0
+; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
+; AVX512DQ-NEXT: # ymm2 = mem[0,1,1,3]
+; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,2,2,3,4,6,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm3[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm6 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5],ymm6[6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm5, %zmm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm28[1,1,1,1]
-; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[1,1,1,1]
+; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3]
; AVX512DQ-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX512DQ-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
-; AVX512DQ-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
-; AVX512DQ-NEXT: # ymm12 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512DQ-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
-; AVX512DQ-NEXT: # ymm13 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5,6],ymm12[7]
-; AVX512DQ-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
-; AVX512DQ-NEXT: # ymm13 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
-; AVX512DQ-NEXT: # ymm14 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5],ymm14[6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7]
; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm0 {%k1}
-; AVX512DQ-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
-; AVX512DQ-NEXT: # ymm12 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512DQ-NEXT: vmovdqa64 %ymm17, %ymm13
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5,6],ymm12[7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm9[5],ymm7[6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm12[6,7]
-; AVX512DQ-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm17 # 16-byte Reload
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm17[1,1,1,1]
-; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0],xmm15[1],xmm9[2,3]
-; AVX512DQ-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
-; AVX512DQ-NEXT: # xmm9 = xmm9[0,1],mem[2,3]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
-; AVX512DQ-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
+; AVX512DQ-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512DQ-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
+; AVX512DQ-NEXT: # ymm4 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
+; AVX512DQ-NEXT: # ymm4 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
+; AVX512DQ-NEXT: # ymm5 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
+; AVX512DQ-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
+; AVX512DQ-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512DQ-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
+; AVX512DQ-NEXT: # ymm4 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
+; AVX512DQ-NEXT: # ymm4 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-NEXT: vmovdqa64 %ymm17, %ymm5
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm22[1,1,1,1]
+; AVX512DQ-NEXT: vmovdqa64 %ymm22, %ymm17
+; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm5
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3]
+; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm22
+; AVX512DQ-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX512DQ-NEXT: # xmm4 = xmm4[0,1],mem[2,3]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm16[1,1,1,1]
-; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm7
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3]
-; AVX512DQ-NEXT: vmovdqa64 %xmm23, %xmm9
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm9[2,3]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm20[1,1,1,1]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3]
+; AVX512DQ-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512DQ-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm9[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm4 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm5 = ymm13[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 {%k1}
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm31[1,1,1,1]
-; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm6
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3]
-; AVX512DQ-NEXT: vmovdqa64 %xmm19, %xmm5
-; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm19[0],xmm24[0],xmm19[1],xmm24[1]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm31[1,1,1,1]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm7[1],xmm1[2,3]
+; AVX512DQ-NEXT: vmovdqa %ymm7, %ymm12
+; AVX512DQ-NEXT: vmovdqa64 %ymm16, %ymm7
+; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm24[0],xmm7[1],xmm24[1]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm5 = ymm15[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm28[2],xmm10[2],xmm28[3],xmm10[3]
+; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm11[2],xmm6[2],xmm11[3],xmm6[3]
+; AVX512DQ-NEXT: vmovdqa %ymm6, %ymm14
+; AVX512DQ-NEXT: vmovdqa %ymm11, %ymm6
; AVX512DQ-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX512DQ-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
-; AVX512DQ-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm27
+; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
+; AVX512DQ-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm26
+; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm30
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
; AVX512DQ-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm25
-; AVX512DQ-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
-; AVX512DQ-NEXT: # ymm13 = mem[3,1,2,3,7,5,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm13[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm26
+; AVX512DQ-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
+; AVX512DQ-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm21
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm21
-; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
-; AVX512DQ-NEXT: # ymm14 = mem[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm14[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm23
+; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
+; AVX512DQ-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm19
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm30[3,1,2,3,7,5,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm12[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512DQ-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
-; AVX512DQ-NEXT: # ymm11 = mem[3,1,2,3,7,5,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
+; AVX512DQ-NEXT: # ymm13 = mem[3,1,2,3,7,5,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm13[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512DQ-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
+; AVX512DQ-NEXT: # ymm10 = mem[3,1,2,3,7,5,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm17[2],xmm15[2],xmm17[3],xmm15[3]
-; AVX512DQ-NEXT: vmovdqa64 %xmm15, %xmm23
-; AVX512DQ-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
-; AVX512DQ-NEXT: # xmm2 = xmm2[0,1],mem[2,3]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm16[2],xmm20[2],xmm16[3],xmm20[3]
-; AVX512DQ-NEXT: vmovdqa64 %xmm16, %xmm19
-; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm30
-; AVX512DQ-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512DQ-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
-; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm20
-; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
-; AVX512DQ-NEXT: # ymm10 = mem[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
-; AVX512DQ-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm8 = ymm22[3,1,2,3,7,5,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm1 = ymm8[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm29[3,1,2,3,7,5,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm7[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5],ymm3[6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm20 {%k1}
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm24[2,2,2,2]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3]
-; AVX512DQ-NEXT: vmovdqa64 %xmm5, %xmm22
-; AVX512DQ-NEXT: vmovdqa64 %xmm31, %xmm4
-; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm31[2],xmm6[2],xmm31[3],xmm6[3]
-; AVX512DQ-NEXT: vmovdqa64 %xmm6, %xmm29
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3]
-; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
-; AVX512DQ-NEXT: # ymm0 = mem[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm18
+; AVX512DQ-NEXT: vmovdqa64 %ymm17, %ymm11
+; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm11[2],xmm22[2],xmm11[3],xmm22[3]
+; AVX512DQ-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
+; AVX512DQ-NEXT: # xmm2 = xmm2[0,1],mem[2,3]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm17
+; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm16
+; AVX512DQ-NEXT: vmovdqa64 %ymm8, %ymm18
+; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm16[2],xmm18[2],xmm16[3],xmm18[3]
+; AVX512DQ-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512DQ-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm28[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm8 = ymm29[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm8[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm27[3,1,2,3,7,5,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm5 = ymm1[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm20
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm25[3,1,2,3,7,5,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm15 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4],ymm5[5],ymm15[6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm15
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm15 {%k1}
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [3,11,0,0]
+; AVX512DQ-NEXT: vpermt2d %ymm14, %ymm2, %ymm6
+; AVX512DQ-NEXT: vpermt2d %ymm22, %ymm2, %ymm11
+; AVX512DQ-NEXT: vmovdqa64 %ymm11, %ymm25
+; AVX512DQ-NEXT: vpermt2d %ymm18, %ymm2, %ymm16
+; AVX512DQ-NEXT: vmovdqa64 %ymm16, %ymm22
+; AVX512DQ-NEXT: vpermi2d %ymm12, %ymm31, %ymm2
+; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm31[2],xmm12[2],xmm31[3],xmm12[3]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm24[2,2,2,2]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3]
+; AVX512DQ-NEXT: vmovdqa64 %ymm7, %ymm16
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm7 = xmm0[0,1],xmm4[2,3]
+; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
+; AVX512DQ-NEXT: # ymm4 = mem[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
-; AVX512DQ-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm2[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm31
-; AVX512DQ-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
-; AVX512DQ-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm15 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm12 = ymm5[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,6],ymm0[7]
+; AVX512DQ-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
+; AVX512DQ-NEXT: # ymm12 = mem[3,1,2,3,7,5,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm12[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512DQ-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
+; AVX512DQ-NEXT: # ymm11 = mem[3,1,2,3,7,5,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm14 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4],ymm0[5],ymm14[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm16 = [3,7,0,0]
-; AVX512DQ-NEXT: vmovdqa64 %xmm28, %xmm1
-; AVX512DQ-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %xmm16, %xmm1 # 16-byte Folded Reload
-; AVX512DQ-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; AVX512DQ-NEXT: # xmm1 = xmm1[0,1],mem[2,3]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm15, %zmm0
+; AVX512DQ-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm1 # 16-byte Folded Reload
+; AVX512DQ-NEXT: # xmm1 = xmm6[0,1],mem[2,3]
; AVX512DQ-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1
-; AVX512DQ-NEXT: vmovdqa64 %ymm27, %ymm3
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512DQ-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
+; AVX512DQ-NEXT: # ymm7 = mem[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512DQ-NEXT: vmovdqa64 %ymm30, %ymm6
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm14 = ymm6[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5,6],ymm7[7]
; AVX512DQ-NEXT: vmovdqa64 %ymm26, %ymm6
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm15 = ymm6[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3,4,5,6],ymm3[7]
-; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm6
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm15 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm13 = ymm13[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5],ymm3[6,7]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm1 {%k1}
-; AVX512DQ-NEXT: vmovdqa64 %xmm17, %xmm3
-; AVX512DQ-NEXT: vpermt2d %xmm23, %xmm16, %xmm3
-; AVX512DQ-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
-; AVX512DQ-NEXT: # xmm3 = xmm3[0,1],mem[2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm14 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm6
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm13 = ymm6[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm12[5],ymm11[6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm13[6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7]
-; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm1
-; AVX512DQ-NEXT: vmovdqa64 %xmm19, %xmm3
-; AVX512DQ-NEXT: vpermt2d %xmm30, %xmm16, %xmm3
-; AVX512DQ-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
-; AVX512DQ-NEXT: # xmm3 = xmm3[0,1],mem[2,3]
-; AVX512DQ-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm3
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm10[7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm6 = ymm7[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm8[5],ymm6[6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm9[6,7]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm3 {%k1}
-; AVX512DQ-NEXT: vpermt2d %xmm29, %xmm16, %xmm4
-; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm22[2],xmm24[2],xmm22[3],xmm24[3]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0,1],xmm6[2,3]
-; AVX512DQ-NEXT: vmovdqa64 %ymm18, %ymm7
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm15 = ymm6[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5],ymm7[6,7]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm1 {%k1}
+; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm6
+; AVX512DQ-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm7 # 16-byte Folded Reload
+; AVX512DQ-NEXT: # xmm7 = xmm6[0,1],mem[2,3]
+; AVX512DQ-NEXT: vmovdqa64 %ymm23, %ymm6
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm14 = ymm6[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm6
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm15 = ymm6[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm13 = ymm13[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm13[5],ymm10[6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm14[6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQ-NEXT: vinserti64x4 $0, %ymm7, %zmm1, %zmm1
+; AVX512DQ-NEXT: vmovdqa64 %ymm22, %ymm6
+; AVX512DQ-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm7 # 16-byte Folded Reload
+; AVX512DQ-NEXT: # xmm7 = xmm6[0,1],mem[2,3]
+; AVX512DQ-NEXT: vinserti32x4 $2, %xmm7, %zmm0, %zmm7
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7]
+; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm6
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5],ymm3[6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm8[6,7]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm7 {%k1}
+; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm16[2],xmm24[2],xmm16[3],xmm24[3]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7]
-; AVX512DQ-NEXT: vmovdqa64 %ymm31, %ymm4
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm5 = ymm12[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5],ymm2[6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm7, %zmm2
; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512DQ-NEXT: vmovaps %zmm3, 64(%rsi)
; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
@@ -15379,8 +15417,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-NEXT: vmovaps %zmm3, (%rax)
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rax)
-; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-NEXT: vmovaps %zmm0, (%rax)
+; AVX512DQ-NEXT: vmovdqa64 %zmm17, (%rax)
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT: vmovdqa64 %zmm2, 64(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rax)
@@ -15390,21 +15427,22 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
;
; AVX512DQ-FCP-LABEL: load_i16_stride8_vf64:
; AVX512DQ-FCP: # %bb.0:
-; AVX512DQ-FCP-NEXT: subq $2312, %rsp # imm = 0x908
+; AVX512DQ-FCP-NEXT: subq $2440, %rsp # imm = 0x988
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,0,8]
; AVX512DQ-FCP-NEXT: vmovdqa 368(%rdi), %xmm0
; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa 352(%rdi), %xmm1
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512DQ-FCP-NEXT: vmovdqa 352(%rdi), %xmm2
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; AVX512DQ-FCP-NEXT: vmovdqa 336(%rdi), %xmm0
; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa 320(%rdi), %xmm1
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,0,0,4]
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm14, %xmm0
-; AVX512DQ-FCP-NEXT: vpermt2d %xmm2, %xmm5, %xmm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm24
+; AVX512DQ-FCP-NEXT: vmovdqa 320(%rdi), %xmm2
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm3, %ymm1, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm24
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, %ymm4
; AVX512DQ-FCP-NEXT: vmovdqa 304(%rdi), %xmm1
; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa 288(%rdi), %xmm2
@@ -15416,9 +15454,9 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm30
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm17
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm18
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm29
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqa 480(%rdi), %ymm1
@@ -15442,12 +15480,12 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa 384(%rdi), %ymm2
; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,1,0,2]
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm2[0,1,0,2]
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[0,2,2,3,4,6,6,7]
; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[0,2,2,3,4,6,6,7]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[0,2,2,3,4,6,6,7]
; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
@@ -15465,24 +15503,25 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm2
; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, %xmm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm20
-; AVX512DQ-FCP-NEXT: vpermt2d %xmm3, %xmm5, %xmm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm19
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm25
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm3, %ymm4, %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm27
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm9
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm2
; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm4
; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa 48(%rdi), %xmm6
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm31
+; AVX512DQ-FCP-NEXT: vmovdqa 48(%rdi), %xmm5
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %ymm2
; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -15494,22 +15533,22 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4]
; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm28
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm4[0,1,2,0,4,5,6,4]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm10[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm30
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm4[0,1,2,0,4,5,6,4]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm8[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm3
; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[0,1,0,2]
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm3
; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm3[0,1,0,2]
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm4[0,2,2,3,4,6,6,7]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm8[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm6[0,2,2,3,4,6,6,7]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm3[0,1,0,2]
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm4[0,2,2,3,4,6,6,7]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm7[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm5[0,2,2,3,4,6,6,7]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm6[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
@@ -15519,211 +15558,216 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa 864(%rdi), %xmm1
; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512DQ-FCP-NEXT: vmovdqa 848(%rdi), %xmm0
; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa 832(%rdi), %xmm1
; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, %xmm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm26
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, %xmm2
-; AVX512DQ-FCP-NEXT: vpermt2d %xmm3, %xmm5, %xmm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm27
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm28
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm2, %ymm9, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm26
; AVX512DQ-FCP-NEXT: vmovdqa 816(%rdi), %xmm1
; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa 800(%rdi), %xmm3
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; AVX512DQ-FCP-NEXT: vmovdqa 800(%rdi), %xmm2
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX512DQ-FCP-NEXT: vmovdqa 784(%rdi), %xmm1
; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa 768(%rdi), %xmm3
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm23
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm25
+; AVX512DQ-FCP-NEXT: vmovdqa 768(%rdi), %xmm2
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm21
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm23
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm9
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm12
; AVX512DQ-FCP-NEXT: vmovdqa 992(%rdi), %ymm0
; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,2]
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa 960(%rdi), %ymm0
; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,1,0,2]
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm1[0,1,2,0,4,5,6,4]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm3[0,1,2,0,4,5,6,4]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm5[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,2]
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm1[0,1,2,0,4,5,6,4]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm2[0,1,2,0,4,5,6,4]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
; AVX512DQ-FCP-NEXT: vmovdqa 928(%rdi), %ymm1
; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,1,0,2]
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,1,0,2]
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa 896(%rdi), %ymm1
; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm1[0,1,0,2]
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,2,2,3,4,6,6,7]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm4[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm11[0,2,2,3,4,6,6,7]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm12 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4],ymm1[5],ymm12[6,7]
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm1[0,1,0,2]
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm2[0,2,2,3,4,6,6,7]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm3[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[0,2,2,3,4,6,6,7]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm10 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4],ymm1[5],ymm10[6,7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm9 {%k1}
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm12 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqa 624(%rdi), %xmm0
; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa 608(%rdi), %xmm1
; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512DQ-FCP-NEXT: vmovdqa 592(%rdi), %xmm0
; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa 576(%rdi), %xmm1
; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, %xmm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm29
-; AVX512DQ-FCP-NEXT: vpermt2d %xmm11, %xmm2, %xmm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm11, %xmm16
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm31
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm10, %ymm9, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm10, %ymm17
; AVX512DQ-FCP-NEXT: vmovdqa 560(%rdi), %xmm1
; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa 544(%rdi), %xmm2
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512DQ-FCP-NEXT: vmovdqa 544(%rdi), %xmm9
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3]
; AVX512DQ-FCP-NEXT: vmovdqa 528(%rdi), %xmm1
; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa 512(%rdi), %xmm2
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm11[0],xmm2[1],xmm11[1]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm22
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm11, %xmm21
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm1[0,1],xmm0[2,3]
+; AVX512DQ-FCP-NEXT: vmovdqa 512(%rdi), %xmm9
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3]
+; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm10[0],xmm9[1],xmm10[1]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm9, %ymm22
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm10, %ymm16
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm1[0,1],xmm0[2,3]
; AVX512DQ-FCP-NEXT: vmovdqa 736(%rdi), %ymm0
; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,2]
; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa 704(%rdi), %ymm0
; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,2]
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm0[0,1,0,2]
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm18
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm13 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm0[7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm20
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm9[0,1,2,0,4,5,6,4]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm11 = ymm1[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm11[0,1,2,3,4,5,6],ymm0[7]
; AVX512DQ-FCP-NEXT: vmovdqa 672(%rdi), %ymm0
; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,2]
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm0[0,1,0,2]
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa 640(%rdi), %ymm0
; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm0[0,1,0,2]
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm1[0,2,2,3,4,6,6,7]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm15[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[0,2,2,3,4,6,6,7]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm11 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm0[5],ymm11[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm13[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm13 = [1,5,0,0]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm30, %xmm0
-; AVX512DQ-FCP-NEXT: vpermt2d %xmm17, %xmm13, %xmm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm17, %xmm30
-; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm14[0],xmm24[0],xmm14[1],xmm24[1]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm9[2,3]
-; AVX512DQ-FCP-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm9 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512DQ-FCP-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm11 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7]
-; AVX512DQ-FCP-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm11 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm12 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm0 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm12
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm9
-; AVX512DQ-FCP-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm17 # 16-byte Reload
-; AVX512DQ-FCP-NEXT: vpermt2d %xmm17, %xmm13, %xmm9
-; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm20[0],xmm19[0],xmm20[1],xmm19[1]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1],xmm11[2,3]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm11
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5],ymm7[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm10[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm23, %xmm9
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm23, %xmm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm25, %xmm11
-; AVX512DQ-FCP-NEXT: vpermt2d %xmm25, %xmm13, %xmm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm8
-; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm26[0],xmm27[0],xmm26[1],xmm27[1]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm26, %xmm19
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm3
-; AVX512DQ-FCP-NEXT: vpermt2d %xmm21, %xmm13, %xmm3
-; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm29[0],xmm16[0],xmm29[1],xmm16[1]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm29, %xmm6
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm4
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,2]
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm9[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm9, %ymm19
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm0[0,2,2,3,4,6,6,7]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm9 = ymm14[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm15[5],ymm9[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm13[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm9
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [1,9,0,0]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm9
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm29, %ymm10, %ymm9
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm15
+; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm24[0],xmm11[1],xmm24[1]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1],xmm10[2,3]
+; AVX512DQ-FCP-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # ymm10 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512DQ-FCP-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # ymm12 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5,6],ymm10[7]
+; AVX512DQ-FCP-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # ymm12 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # ymm13 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4],ymm12[5],ymm13[6,7]
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm9, %zmm0, %zmm9
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5],ymm10[6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm9 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm13, %ymm10
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm18, %ymm15, %ymm10
+; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm25[0],xmm27[0],xmm25[1],xmm27[1]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],xmm12[2,3]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm12 = ymm0[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm12[7]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm8[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm9, %zmm6
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm9
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm6
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm21
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm23, %ymm15, %ymm6
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm8
+; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm28[0],xmm8[0],xmm28[1],xmm8[1]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm12
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm3
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm3 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm2
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm16, %ymm15, %ymm2
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm22
+; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm31[0],xmm17[0],xmm31[1],xmm17[1]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm6
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm15
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm0
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm14[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm14, %xmm0
-; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm14[2],xmm24[2],xmm14[3],xmm24[3]
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm11, %ymm0
+; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm11[2],xmm24[2],xmm11[3],xmm24[3]
; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,0,2,6]
-; AVX512DQ-FCP-NEXT: vpermt2d %xmm24, %xmm5, %xmm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm31 # 16-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm30, %xmm28
-; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm31[2],xmm30[2],xmm31[3],xmm30[3]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,2,10]
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm24, %ymm1, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, %ymm5
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm16[2],xmm29[2],xmm16[3],xmm29[3]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7]
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
+; AVX512DQ-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm31
; AVX512DQ-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7]
; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm29
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm30
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
; AVX512DQ-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX512DQ-FCP-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7]
; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm30
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm28
; AVX512DQ-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
; AVX512DQ-FCP-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7]
; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
@@ -15731,25 +15775,25 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm1
-; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm26 = xmm20[2],xmm2[2],xmm20[3],xmm2[3]
-; AVX512DQ-FCP-NEXT: vpermt2d %xmm2, %xmm5, %xmm1
-; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm12[2],xmm17[2],xmm12[3],xmm17[3]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm12, %xmm25
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm17, %xmm23
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm26 = xmm25[2],xmm2[2],xmm25[3],xmm2[3]
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm2, %ymm5, %ymm1
+; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm13[2],xmm18[2],xmm13[3],xmm18[3]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm25
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm13, %ymm23
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX512DQ-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7]
; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm18
-; AVX512DQ-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm13 = mem[0,1,1,3,4,5,5,7]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm13[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512DQ-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # ymm11 = mem[0,1,1,3,4,5,5,7]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm11[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512DQ-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm12 = mem[3,1,2,3,7,5,6,7]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm12[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufd $231, (%rsp), %ymm14 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # ymm14 = mem[3,1,2,3,7,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm14[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
; AVX512DQ-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
; AVX512DQ-FCP-NEXT: # ymm10 = mem[3,1,2,3,7,5,6,7]
; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm10[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
@@ -15758,20 +15802,21 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm0
-; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm17 = xmm19[2],xmm8[2],xmm19[3],xmm8[3]
-; AVX512DQ-FCP-NEXT: vpermt2d %xmm8, %xmm5, %xmm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm9, %xmm19
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm11, %xmm20
-; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm9[2],xmm11[2],xmm9[3],xmm11[3]
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm2
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, %ymm1
+; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm17 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm8, %ymm5, %ymm0
+; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm9[2],xmm21[2],xmm9[3],xmm21[3]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm9, %ymm19
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm24
-; AVX512DQ-FCP-NEXT: vpshufd $212, (%rsp), %ymm9 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
; AVX512DQ-FCP-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7]
; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512DQ-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm11 = mem[0,1,1,3,4,5,5,7]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm11[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512DQ-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # ymm12 = mem[0,1,1,3,4,5,5,7]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
; AVX512DQ-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
; AVX512DQ-FCP-NEXT: # ymm8 = mem[3,1,2,3,7,5,6,7]
@@ -15782,10 +15827,12 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm24 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, %xmm0
-; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm14 = xmm6[2],xmm16[2],xmm6[3],xmm16[3]
-; AVX512DQ-FCP-NEXT: vpermt2d %xmm16, %xmm5, %xmm0
-; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm22[2],xmm21[2],xmm22[3],xmm21[3]
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm0
+; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm13 = xmm6[2],xmm15[2],xmm6[3],xmm15[3]
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm15, %ymm5, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm20
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm22[2],xmm20[2],xmm22[3],xmm20[3]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3]
; AVX512DQ-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
; AVX512DQ-FCP-NEXT: # ymm6 = mem[0,1,1,3,4,5,5,7]
@@ -15805,55 +15852,56 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm16 = [3,7,0,0]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm1
-; AVX512DQ-FCP-NEXT: vpermt2d %xmm28, %xmm16, %xmm1
-; AVX512DQ-FCP-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # xmm0 = xmm1[0,1],mem[2,3]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [3,11,3,3]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm0
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm29, %ymm1, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm16
+; AVX512DQ-FCP-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm1 = mem[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm3
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm1
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm3
; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm3
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm3
; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm15
; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3,4],ymm3[5],ymm15[6,7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm25, %xmm1
-; AVX512DQ-FCP-NEXT: vpermt2d %xmm23, %xmm16, %xmm1
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm1
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm25, %ymm16, %ymm1
; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm26, %xmm3
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm3
; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5,6],ymm3[7]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2,3,4,5,6],ymm3[7]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm11 = ymm14[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5],ymm10[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm11[5],ymm10[6,7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1,2,3,4,5],ymm3[6,7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm0
-; AVX512DQ-FCP-NEXT: vpermt2d %xmm20, %xmm16, %xmm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm0
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm21, %ymm16, %ymm0
; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm17, %xmm1
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm11[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm12[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm8[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4],ymm3[5],ymm7[6,7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm1
-; AVX512DQ-FCP-NEXT: vpermt2d %xmm21, %xmm16, %xmm1
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm1
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm20, %ymm16, %ymm1
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3]
; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,6],ymm3[7]
@@ -15868,21 +15916,22 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
; AVX512DQ-FCP-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload
; AVX512DQ-FCP-NEXT: # xmm9 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
; AVX512DQ-FCP-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm9, %xmm0
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,0,0,4]
-; AVX512DQ-FCP-NEXT: vpermt2d %xmm4, %xmm5, %xmm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm29
-; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm23
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm19
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,0,0,8]
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm4, %ymm5, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm18
+; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm9[0],xmm2[1],xmm9[1]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm24
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
@@ -15905,7 +15954,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,1,3]
; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -15920,18 +15969,19 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload
; AVX512DQ-FCP-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload
; AVX512DQ-FCP-NEXT: # xmm4 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, %xmm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm27
-; AVX512DQ-FCP-NEXT: vpermt2d %xmm6, %xmm5, %xmm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm6, %xmm18
-; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm22
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm24
+; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm22
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm6, %ymm5, %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm31
+; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm27
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm16
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1,1,3]
@@ -15939,12 +15989,11 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,1,3]
; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,1,2,0,4,5,6,4]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm2[0,1,2,0,4,5,6,4]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm26
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm3[0,1,2,0,4,5,6,4]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm12[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,1,3]
@@ -15968,29 +16017,29 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
; AVX512DQ-FCP-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
; AVX512DQ-FCP-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, %xmm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm31
-; AVX512DQ-FCP-NEXT: vpermt2d %xmm4, %xmm5, %xmm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm20
-; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm21
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm25
+; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm21
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm4, %ymm5, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm23
+; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm30
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm19
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm12
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm13
; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,1,3]
; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
; AVX512DQ-FCP-NEXT: # ymm1 = mem[0,1,1,3]
; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm0[0,1,2,0,4,5,6,4]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm0[0,1,2,0,4,5,6,4]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm20
; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm1[0,1,2,0,4,5,6,4]
; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm8[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
@@ -16006,7 +16055,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm12 {%k1}
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm13 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
; AVX512DQ-FCP-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
@@ -16014,20 +16063,21 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
; AVX512DQ-FCP-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm13 # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # xmm13 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
; AVX512DQ-FCP-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, %xmm1
-; AVX512DQ-FCP-NEXT: vpermi2d %xmm2, %xmm3, %xmm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm17
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm16
-; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm13[0],xmm4[1],xmm13[1]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm13, %xmm28
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm30
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm0[0,1],xmm1[2,3]
+; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # xmm14 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm1
+; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm3, %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm28
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm29
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm4[0],xmm14[1],xmm4[1]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm14, %ymm25
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm15 = xmm0[0,1],xmm1[2,3]
; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,1,3]
; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -16036,8 +16086,9 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[0,1,2,0,4,5,6,4]
; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[0,1,2,0,4,5,6,4]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[0,1,2,0,4,5,6,4]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm17
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm0[7]
; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,1,3]
@@ -16048,41 +16099,42 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,2,2,3,4,6,6,7]
; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm3[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm14 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4],ymm0[5],ymm14[6,7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm23, %xmm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm23, (%rsp) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm15 = [1,5,0,0]
-; AVX512DQ-FCP-NEXT: vpermt2d %xmm19, %xmm15, %xmm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm29[0],xmm9[1],xmm29[1]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm0
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm15 = [1,9,0,0]
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm9, %ymm15, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm9, %ymm24
+; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm18[0],xmm9[1],xmm18[1]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
; AVX512DQ-FCP-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
; AVX512DQ-FCP-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
-; AVX512DQ-FCP-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm12 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-FCP-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm12 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm13 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4],ymm12[5],ymm13[6,7]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQ-FCP-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # ymm13 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-FCP-NEXT: vpshuflw $212, (%rsp), %ymm13 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # ymm13 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # ymm14 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5],ymm14[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm1[6,7]
; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm1
-; AVX512DQ-FCP-NEXT: vpermt2d %xmm24, %xmm15, %xmm1
-; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm27[0],xmm18[0],xmm27[1],xmm18[1]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm12[2,3]
-; AVX512DQ-FCP-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm12 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm1
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm27, %ymm15, %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm15, %ymm14
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm15
+; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm22[0],xmm15[0],xmm22[1],xmm15[1]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3]
; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm13
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5,6],ymm12[7]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm13[7]
; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm11[5],ymm10[6,7]
@@ -16090,13 +16142,14 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm21, %xmm0
-; AVX512DQ-FCP-NEXT: vpermt2d %xmm25, %xmm15, %xmm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm13
-; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm31[0],xmm20[0],xmm31[1],xmm20[1]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm21
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm12
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm30, %ymm14, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm13
+; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm21[0],xmm13[0],xmm21[1],xmm13[1]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm1
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5,6],ymm1[7]
; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
@@ -16105,12 +16158,13 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm15, %xmm6
-; AVX512DQ-FCP-NEXT: vpermi2d %xmm28, %xmm30, %xmm6
-; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm17[0],xmm16[0],xmm17[1],xmm16[1]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm16, %xmm7
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm14, %ymm6
+; AVX512DQ-FCP-NEXT: vpermi2d %ymm4, %ymm25, %ymm6
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm20
+; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm28[0],xmm29[0],xmm28[1],xmm29[1]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3]
; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm4
; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7]
; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
@@ -16120,27 +16174,28 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm9, %xmm0
-; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm9[2],xmm29[2],xmm9[3],xmm29[3]
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,0,2,6]
-; AVX512DQ-FCP-NEXT: vpermt2d %xmm29, %xmm5, %xmm0
-; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm23[2],xmm19[2],xmm23[3],xmm19[3]
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm0
+; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm28 = xmm9[2],xmm18[2],xmm9[3],xmm18[3]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,0,2,10]
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm18, %ymm5, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm6
+; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm11[2],xmm6[2],xmm11[3],xmm6[3]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
; AVX512DQ-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7]
; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm31
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm25
; AVX512DQ-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7]
; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm30
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm31
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
; AVX512DQ-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX512DQ-FCP-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7]
; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm26
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm24
; AVX512DQ-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
; AVX512DQ-FCP-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7]
; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
@@ -16148,135 +16203,141 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm1
-; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm20 = xmm27[2],xmm18[2],xmm27[3],xmm18[3]
-; AVX512DQ-FCP-NEXT: vpermt2d %xmm18, %xmm5, %xmm1
-; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm22[2],xmm24[2],xmm22[3],xmm24[3]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm19
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm24, %xmm18
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm15, %ymm2
+; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm26 = xmm22[2],xmm2[2],xmm22[3],xmm2[3]
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm15, %ymm5, %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm16
+; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm9[2],xmm16[2],xmm9[3],xmm16[3]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
-; AVX512DQ-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm12 = mem[0,1,1,3,4,5,5,7]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm12[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512DQ-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512DQ-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm27
+; AVX512DQ-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # ymm4 = mem[0,1,1,3,4,5,5,7]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm22
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
+; AVX512DQ-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm19
; AVX512DQ-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
; AVX512DQ-FCP-NEXT: # ymm10 = mem[3,1,2,3,7,5,6,7]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm11 = mem[3,1,2,3,7,5,6,7]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm10[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm21, %xmm0
-; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm16 = xmm21[2],xmm13[2],xmm21[3],xmm13[3]
-; AVX512DQ-FCP-NEXT: vpermt2d %xmm13, %xmm5, %xmm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm25, %xmm22
-; AVX512DQ-FCP-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm24 # 16-byte Reload
-; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm24[2],xmm25[2],xmm24[3],xmm25[3]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm29
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm13, %ymm1
+; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm18 = xmm21[2],xmm1[2],xmm21[3],xmm1[3]
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm13, %ymm5, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm12, %ymm21
+; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm21[2],xmm30[2],xmm21[3],xmm30[3]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm21
-; AVX512DQ-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm14 = mem[0,1,1,3,4,5,5,7]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
; AVX512DQ-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
; AVX512DQ-FCP-NEXT: # ymm15 = mem[0,1,1,3,4,5,5,7]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm15[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm6 = mem[3,1,2,3,7,5,6,7]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm6[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5],ymm3[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm21 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, %xmm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm7, %xmm25
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm17, %xmm27
-; AVX512DQ-FCP-NEXT: vpermi2d %xmm7, %xmm17, %xmm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm29 # 16-byte Reload
-; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm29[2],xmm28[2],xmm29[3],xmm28[3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0,1],xmm1[2,3]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
; AVX512DQ-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
; AVX512DQ-FCP-NEXT: # ymm8 = mem[0,1,1,3,4,5,5,7]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
-; AVX512DQ-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm7 = mem[0,1,1,3,4,5,5,7]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm7[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm5 = mem[3,1,2,3,7,5,6,7]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm5[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm13 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4],ymm0[5],ymm13[6,7]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm8[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # ymm7 = mem[3,1,2,3,7,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm7[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # ymm14 = mem[3,1,2,3,7,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm13 = ymm14[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3,4],ymm2[5],ymm13[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm17
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm17 {%k1}
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [3,11,3,3]
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm6, %ymm4, %ymm11
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm11, %ymm6
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm16, %ymm4, %ymm9
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm30, %ymm4, %ymm21
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm30
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vpermi2d %ymm20, %ymm1, %ymm4
+; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vpermi2d %ymm20, %ymm21, %ymm1
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],xmm1[2,3]
+; AVX512DQ-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # ymm13 = mem[0,1,1,3,4,5,5,7]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
+; AVX512DQ-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # ymm12 = mem[3,1,2,3,7,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm12[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm11 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm0[5],ymm11[6,7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm17 = [3,7,0,0]
-; AVX512DQ-FCP-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
-; AVX512DQ-FCP-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %xmm17, %xmm0 # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # xmm1 = xmm0[0,1],mem[2,3]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm16
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm28, %xmm0
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm6[0,1],xmm0[2,3]
; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm13 = ymm0[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5,6],ymm2[7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm0
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm13 = ymm0[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm11 = ymm0[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2,3,4,5,6],ymm3[7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm0
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm11 = ymm0[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm0
; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5],ymm0[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm1 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm0
-; AVX512DQ-FCP-NEXT: vpermt2d %xmm18, %xmm17, %xmm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm2
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm12[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5,6],ymm2[7]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm9 = ymm10[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm10 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm26, %xmm0
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1],xmm0[2,3]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm3
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm6
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm9 = ymm6[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5,6],ymm3[7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm6
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm9 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm3[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm24, %xmm1
-; AVX512DQ-FCP-NEXT: vpermt2d %xmm22, %xmm17, %xmm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm16, %xmm2
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm18, %xmm1
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm3
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm9 = ymm15[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5,6],ymm2[7]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5],ymm4[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm1 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm29, %xmm4
-; AVX512DQ-FCP-NEXT: vpermt2d %xmm28, %xmm17, %xmm4
-; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm27[2],xmm25[2],xmm27[3],xmm25[3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm6 = ymm7[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5,6],ymm4[7]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5],ymm3[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm15[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5,6],ymm3[7]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm6 = ymm14[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm1 {%k1}
+; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm21[2],xmm20[2],xmm21[3],xmm20[3]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm6 = ymm13[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm12[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 64(%rsi)
@@ -16304,13 +16365,12 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm2, (%rax)
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 64(%rax)
-; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovaps %zmm2, (%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 64(%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, (%rax)
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
-; AVX512DQ-FCP-NEXT: addq $2312, %rsp # imm = 0x908
+; AVX512DQ-FCP-NEXT: addq $2440, %rsp # imm = 0x988
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll
index 34f23213500c1..c41e306630c0e 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll
@@ -228,20 +228,19 @@ define void @load_i32_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;
; AVX-LABEL: load_i32_stride3_vf4:
; AVX: # %bb.0:
-; AVX-NEXT: vmovaps (%rdi), %xmm0
-; AVX-NEXT: vmovaps 16(%rdi), %xmm1
-; AVX-NEXT: vmovaps 32(%rdi), %xmm2
-; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3]
-; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,3,2,1]
-; AVX-NEXT: vmovaps 32(%rdi), %xmm4
-; AVX-NEXT: vblendps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0],xmm4[1,2],xmm1[3]
+; AVX-NEXT: vmovaps 32(%rdi), %xmm0
+; AVX-NEXT: vmovaps (%rdi), %xmm1
+; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX-NEXT: vmovaps 16(%rdi), %xmm3
+; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3]
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,3,2,1]
+; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0,1],xmm0[2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0],xmm4[1,2],xmm3[3]
; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,0,3,2]
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3]
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3]
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,1,0,3]
-; AVX-NEXT: vmovaps %xmm3, (%rsi)
+; AVX-NEXT: vmovaps %xmm2, (%rsi)
; AVX-NEXT: vmovaps %xmm4, (%rdx)
; AVX-NEXT: vmovaps %xmm0, (%rcx)
; AVX-NEXT: retq
@@ -477,8 +476,8 @@ define void @load_i32_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX-NEXT: vmovaps 32(%rdi), %ymm1
; AVX-NEXT: vmovaps (%rdi), %ymm2
; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7]
-; AVX-NEXT: vmovaps 16(%rdi), %xmm4
-; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,1],ymm1[1,3],ymm4[6,5],ymm1[5,7]
+; AVX-NEXT: vextractf128 $1, %ymm3, %xmm4
+; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,0],mem[1,0]
; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,3],ymm4[0,2],ymm3[4,7],ymm4[4,6]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3,0,1]
; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm0[1,0],ymm4[2,0],ymm0[5,4],ymm4[6,4]
@@ -818,16 +817,16 @@ define void @load_i32_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovaps 32(%rdi), %ymm4
; AVX-NEXT: vmovaps (%rdi), %ymm6
; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6],ymm4[7]
-; AVX-NEXT: vmovaps 16(%rdi), %xmm7
-; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,1],ymm4[1,3],ymm7[6,5],ymm4[5,7]
+; AVX-NEXT: vextractf128 $1, %ymm5, %xmm7
+; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm7[2,0],mem[1,0]
; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,3],ymm7[0,2],ymm5[4,7],ymm7[4,6]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm3[2,3,0,1]
; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm3[1,0],ymm7[2,0],ymm3[5,4],ymm7[6,4]
; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7]
-; AVX-NEXT: vmovaps 112(%rdi), %xmm9
-; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,1],ymm1[1,3],ymm9[6,5],ymm1[5,7]
+; AVX-NEXT: vextractf128 $1, %ymm8, %xmm9
+; AVX-NEXT: vshufps {{.*#+}} xmm9 = xmm9[2,0],mem[1,0]
; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,3],ymm9[0,2],ymm8[4,7],ymm9[4,6]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm0[2,3,0,1]
; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm0[1,0],ymm9[2,0],ymm0[5,4],ymm9[6,4]
@@ -1406,135 +1405,139 @@ define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
;
; AVX-LABEL: load_i32_stride3_vf32:
; AVX: # %bb.0:
-; AVX-NEXT: subq $392, %rsp # imm = 0x188
+; AVX-NEXT: subq $456, %rsp # imm = 0x1C8
; AVX-NEXT: vmovaps 256(%rdi), %ymm2
-; AVX-NEXT: vmovaps 224(%rdi), %ymm7
-; AVX-NEXT: vmovaps 192(%rdi), %ymm3
+; AVX-NEXT: vmovaps 224(%rdi), %ymm3
; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 352(%rdi), %ymm4
-; AVX-NEXT: vmovaps 320(%rdi), %ymm5
-; AVX-NEXT: vmovaps 288(%rdi), %ymm6
-; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 160(%rdi), %ymm10
-; AVX-NEXT: vmovaps 128(%rdi), %ymm9
-; AVX-NEXT: vmovaps 96(%rdi), %ymm0
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3],ymm9[4],ymm0[5,6],ymm9[7]
-; AVX-NEXT: vmovaps 112(%rdi), %xmm1
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm9[1,3],ymm1[6,5],ymm9[5,7]
+; AVX-NEXT: vmovaps 192(%rdi), %ymm4
+; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovaps 352(%rdi), %ymm6
+; AVX-NEXT: vmovaps 320(%rdi), %ymm9
+; AVX-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovaps 288(%rdi), %ymm8
+; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovaps 160(%rdi), %ymm11
+; AVX-NEXT: vmovaps 128(%rdi), %ymm5
+; AVX-NEXT: vmovaps 96(%rdi), %ymm10
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0],ymm5[1],ymm10[2,3],ymm5[4],ymm10[5,6],ymm5[7]
+; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovups %ymm10, (%rsp) # 32-byte Spill
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],mem[1,0]
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6]
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm10[2,3,0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm10[1,0],ymm11[2,0],ymm10[5,4],ymm11[6,4]
-; AVX-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm11[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm11[1,0],ymm7[2,0],ymm11[5,4],ymm7[6,4]
+; AVX-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6],ymm5[7]
-; AVX-NEXT: vmovaps 304(%rdi), %xmm1
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm5[1,3],ymm1[6,5],ymm5[5,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5,6],ymm9[7]
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],mem[1,0]
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6]
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm4[2,3,0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,0],ymm14[2,0],ymm4[5,4],ymm14[6,4]
-; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm6[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm6[1,0],ymm14[2,0],ymm6[5,4],ymm14[6,4]
+; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm7[1],ymm3[2,3],ymm7[4],ymm3[5,6],ymm7[7]
-; AVX-NEXT: vmovaps 208(%rdi), %xmm1
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm7[1,3],ymm1[6,5],ymm7[5,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7]
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],mem[1,0]
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6]
-; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovaps %ymm2, %ymm4
; AVX-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm2[2,3,0,1]
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm12[2,0],ymm2[5,4],ymm12[6,4]
+; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 32(%rdi), %ymm15
-; AVX-NEXT: vmovaps 16(%rdi), %xmm0
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm15[1,3],ymm0[6,5],ymm15[5,7]
-; AVX-NEXT: vmovaps (%rdi), %ymm2
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm15[1],ymm2[2,3],ymm15[4],ymm2[5,6],ymm15[7]
-; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6]
-; AVX-NEXT: vmovaps 64(%rdi), %ymm7
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3,0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm7[1,0],ymm0[2,0],ymm7[5,4],ymm0[6,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm8[6,7]
+; AVX-NEXT: vmovaps (%rdi), %ymm3
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm15[1],ymm3[2,3],ymm15[4],ymm3[5,6],ymm15[7]
+; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],mem[1,0]
+; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6]
+; AVX-NEXT: vmovaps 64(%rdi), %ymm9
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm9[1,0],ymm0[2,0],ymm9[5,4],ymm0[6,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm13[6,7]
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovups %ymm11, (%rsp) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm10[2,0],ymm11[3,0],ymm10[6,4],ymm11[7,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm11[0,0],ymm8[2,0],ymm11[4,4],ymm8[6,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm11[2,0],ymm7[3,0],ymm11[6,4],ymm7[7,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm7[0,0],ymm2[2,0],ymm7[4,4],ymm2[6,4]
; AVX-NEXT: vmovaps 112(%rdi), %xmm13
-; AVX-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 32-byte Folded Reload
-; AVX-NEXT: # ymm6 = mem[0,1],ymm9[2],mem[3,4],ymm9[5],mem[6,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,2],ymm13[0,3],ymm6[5,6],ymm13[4,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,2,3,1,4,6,7,5]
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4],ymm8[5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1],ymm5[2],ymm10[3,4],ymm5[5],ymm10[6,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm8[1,2],ymm13[0,3],ymm8[5,6],ymm13[4,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,2,3,1,4,6,7,5]
+; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3,4],ymm2[5,6,7]
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm4[2,0],ymm14[3,0],ymm4[6,4],ymm14[7,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm14[0,0],ymm6[2,0],ymm14[4,4],ymm6[6,4]
-; AVX-NEXT: vmovaps 304(%rdi), %xmm8
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm10[1,2],ymm8[0,3],ymm10[5,6],ymm8[4,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm6[2,0],ymm14[3,0],ymm6[6,4],ymm14[7,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm14[0,0],ymm2[2,0],ymm14[4,4],ymm2[6,4]
+; AVX-NEXT: vmovaps 304(%rdi), %xmm11
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm8[0,1],ymm5[2],ymm8[3,4],ymm5[5],ymm8[6,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm10[1,2],ymm11[0,3],ymm10[5,6],ymm11[4,7]
; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,2,3,1,4,6,7,5]
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1,2,3,4],ymm6[5,6,7]
-; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,0],ymm0[3,0],ymm7[6,4],ymm0[7,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm0[0,0],ymm6[2,0],ymm0[4,4],ymm6[6,4]
-; AVX-NEXT: vmovaps 16(%rdi), %xmm11
-; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1],ymm15[2],ymm2[3,4],ymm15[5],ymm2[6,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,2],ymm11[0,3],ymm4[5,6],ymm11[4,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,3,1,4,6,7,5]
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm6[5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1,2,3,4],ymm2[5,6,7]
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm1[2,0],ymm12[3,0],ymm1[6,4],ymm12[7,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm12[0,0],ymm4[2,0],ymm12[4,4],ymm4[6,4]
-; AVX-NEXT: vmovaps 208(%rdi), %xmm10
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm9[2,0],ymm0[3,0],ymm9[6,4],ymm0[7,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,0],ymm2[2,0],ymm0[4,4],ymm2[6,4]
+; AVX-NEXT: vmovaps 16(%rdi), %xmm10
+; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm3[0,1],ymm15[2],ymm3[3,4],ymm15[5],ymm3[6,7]
; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,2],ymm10[0,3],ymm6[5,6],ymm10[4,7]
; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,2,3,1,4,6,7,5]
-; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3,4],ymm4[5,6,7]
-; AVX-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 32-byte Folded Reload
-; AVX-NEXT: # ymm6 = ymm9[0,1],mem[2],ymm9[3,4],mem[5],ymm9[6,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm13[1,0],ymm6[2,0],ymm13[5,4],ymm6[6,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0],ymm9[0,3],ymm6[6,4],ymm9[4,7]
-; AVX-NEXT: vmovups (%rsp), %ymm9 # 32-byte Reload
-; AVX-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
-; AVX-NEXT: # ymm9 = ymm9[0,1],mem[0,3],ymm9[4,5],mem[4,7]
-; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm9[5,6,7]
-; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm8[1,0],ymm9[2,0],ymm8[5,4],ymm9[6,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm8[2,0],ymm5[0,3],ymm8[6,4],ymm5[4,7]
-; AVX-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm8 # 32-byte Folded Reload
-; AVX-NEXT: # ymm8 = ymm14[0,1],mem[0,3],ymm14[4,5],mem[4,7]
-; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm8[5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4],ymm2[5,6,7]
+; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm4[2,0],ymm12[3,0],ymm4[6,4],ymm12[7,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm12[0,0],ymm2[2,0],ymm12[4,4],ymm2[6,4]
+; AVX-NEXT: vmovaps 208(%rdi), %xmm6
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,2],ymm6[0,3],ymm4[5,6],ymm6[4,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,3,1,4,6,7,5]
+; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7]
+; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT: vblendps $36, (%rsp), %ymm3, %ymm4 # 32-byte Folded Reload
+; AVX-NEXT: # ymm4 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm13[1,0],ymm4[2,0],ymm13[5,4],ymm4[6,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0],ymm3[0,3],ymm4[6,4],ymm3[4,7]
+; AVX-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
+; AVX-NEXT: # ymm7 = ymm7[0,1],mem[0,3],ymm7[4,5],mem[4,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm7[5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm5[0,1],ymm8[2],ymm5[3,4],ymm8[5],ymm5[6,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm11[1,0],ymm7[2,0],ymm11[5,4],ymm7[6,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm7[2,0],ymm5[0,3],ymm7[6,4],ymm5[4,7]
+; AVX-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm7 # 32-byte Folded Reload
+; AVX-NEXT: # ymm7 = ymm14[0,1],mem[0,3],ymm14[4,5],mem[4,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm7[5,6,7]
; AVX-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload
; AVX-NEXT: # ymm3 = ymm15[0,1],mem[2],ymm15[3,4],mem[5],ymm15[6,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm11[1,0],ymm3[2,0],ymm11[5,4],ymm3[6,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm10[1,0],ymm3[2,0],ymm10[5,4],ymm3[6,4]
; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0],ymm15[0,3],ymm3[6,4],ymm15[4,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm7[0,3],ymm0[4,5],ymm7[4,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm9[0,3],ymm0[4,5],ymm9[4,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7]
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm10[1,0],ymm1[2,0],ymm10[5,4],ymm1[6,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[0,3],ymm1[6,4],ymm2[4,7]
-; AVX-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload
-; AVX-NEXT: # ymm2 = ymm12[0,1],mem[0,3],ymm12[4,5],mem[4,7]
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
+; AVX-NEXT: vmovaps %ymm1, %ymm3
+; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm6[1,0],ymm1[2,0],ymm6[5,4],ymm1[6,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm3[0,3],ymm1[6,4],ymm3[4,7]
+; AVX-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload
+; AVX-NEXT: # ymm3 = ymm12[0,1],mem[0,3],ymm12[4,5],mem[4,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm2, (%rsi)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm2, 64(%rsi)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm3, 96(%rsi)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm3, 32(%rsi)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm2, 96(%rsi)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm2, 32(%rsi)
-; AVX-NEXT: vmovaps %ymm4, 64(%rdx)
+; AVX-NEXT: vmovaps %ymm2, 64(%rdx)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm2, (%rdx)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
@@ -1544,8 +1547,8 @@ define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovaps %ymm1, 64(%rcx)
; AVX-NEXT: vmovaps %ymm0, (%rcx)
; AVX-NEXT: vmovaps %ymm5, 96(%rcx)
-; AVX-NEXT: vmovaps %ymm6, 32(%rcx)
-; AVX-NEXT: addq $392, %rsp # imm = 0x188
+; AVX-NEXT: vmovaps %ymm4, 32(%rcx)
+; AVX-NEXT: addq $456, %rsp # imm = 0x1C8
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
@@ -2575,37 +2578,37 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
;
; AVX-LABEL: load_i32_stride3_vf64:
; AVX: # %bb.0:
-; AVX-NEXT: subq $1384, %rsp # imm = 0x568
+; AVX-NEXT: subq $1512, %rsp # imm = 0x5E8
; AVX-NEXT: vmovaps 544(%rdi), %ymm2
; AVX-NEXT: vmovaps 512(%rdi), %ymm3
; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 480(%rdi), %ymm4
; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 352(%rdi), %ymm5
-; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovups %ymm5, (%rsp) # 32-byte Spill
; AVX-NEXT: vmovaps 320(%rdi), %ymm6
; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 288(%rdi), %ymm7
; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 160(%rdi), %ymm8
; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 128(%rdi), %ymm9
-; AVX-NEXT: vmovups %ymm9, (%rsp) # 32-byte Spill
+; AVX-NEXT: vmovaps 128(%rdi), %ymm1
+; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 96(%rdi), %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3],ymm9[4],ymm0[5,6],ymm9[7]
-; AVX-NEXT: vmovaps 112(%rdi), %xmm1
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm9[1,3],ymm1[6,5],ymm9[5,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],mem[1,0]
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6]
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm8[2,3,0,1]
-; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,0],ymm1[2,0],ymm8[5,4],ymm1[6,4]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm8[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,0],ymm9[2,0],ymm8[5,4],ymm9[6,4]
+; AVX-NEXT: vmovaps %ymm9, %ymm8
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6],ymm6[7]
-; AVX-NEXT: vmovaps 304(%rdi), %xmm1
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm6[1,3],ymm1[6,5],ymm6[5,7]
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],mem[1,0]
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm5[2,3,0,1]
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,0],ymm14[2,0],ymm5[5,4],ymm14[6,4]
@@ -2613,8 +2616,8 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7]
-; AVX-NEXT: vmovaps 496(%rdi), %xmm1
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm3[1,3],ymm1[6,5],ymm3[5,7]
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],mem[1,0]
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6]
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm2[2,3,0,1]
@@ -2622,102 +2625,104 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 704(%rdi), %ymm2
-; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 688(%rdi), %xmm0
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm2[1,3],ymm0[6,5],ymm2[5,7]
+; AVX-NEXT: vmovaps 704(%rdi), %ymm0
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 672(%rdi), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7]
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6]
-; AVX-NEXT: vmovaps 736(%rdi), %ymm1
-; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm1[2,3,0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm15[2,0],ymm1[5,4],ymm15[6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7]
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],mem[1,0]
+; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6]
+; AVX-NEXT: vmovaps 736(%rdi), %ymm6
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm6[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm6[1,0],ymm2[2,0],ymm6[5,4],ymm2[6,4]
+; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 32(%rdi), %ymm7
-; AVX-NEXT: vmovaps 16(%rdi), %xmm0
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm7[1,3],ymm0[6,5],ymm7[5,7]
-; AVX-NEXT: vmovaps (%rdi), %ymm1
-; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2,3],ymm7[4],ymm1[5,6],ymm7[7]
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6]
-; AVX-NEXT: vmovaps 64(%rdi), %ymm4
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3,0,1]
+; AVX-NEXT: vmovaps 32(%rdi), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,0],ymm1[2,0],ymm4[5,4],ymm1[6,4]
-; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovaps (%rdi), %ymm0
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],mem[1,0]
+; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6]
+; AVX-NEXT: vmovaps 64(%rdi), %ymm11
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm11[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm11[1,0],ymm10[2,0],ymm11[5,4],ymm10[6,4]
+; AVX-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 224(%rdi), %ymm6
-; AVX-NEXT: vmovaps 208(%rdi), %xmm0
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm6[1,3],ymm0[6,5],ymm6[5,7]
-; AVX-NEXT: vmovaps 192(%rdi), %ymm1
+; AVX-NEXT: vmovaps 224(%rdi), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3],ymm6[4],ymm1[5,6],ymm6[7]
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6]
-; AVX-NEXT: vmovaps 256(%rdi), %ymm5
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3,0,1]
-; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,0],ymm1[2,0],ymm5[5,4],ymm1[6,4]
-; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovaps 192(%rdi), %ymm0
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],mem[1,0]
+; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6]
+; AVX-NEXT: vmovaps 256(%rdi), %ymm7
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm7[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,0],ymm5[2,0],ymm7[5,4],ymm5[6,4]
+; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 416(%rdi), %ymm12
-; AVX-NEXT: vmovaps 400(%rdi), %xmm0
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm12[1,3],ymm0[6,5],ymm12[5,7]
-; AVX-NEXT: vmovaps 384(%rdi), %ymm1
+; AVX-NEXT: vmovaps 416(%rdi), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm12[1],ymm1[2,3],ymm12[4],ymm1[5,6],ymm12[7]
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6]
-; AVX-NEXT: vmovaps 448(%rdi), %ymm8
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm8[2,3,0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,0],ymm9[2,0],ymm8[5,4],ymm9[6,4]
-; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovaps 384(%rdi), %ymm0
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],mem[1,0]
+; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6]
+; AVX-NEXT: vmovaps 448(%rdi), %ymm12
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm12[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,0],ymm9[2,0],ymm12[5,4],ymm9[6,4]
+; AVX-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 608(%rdi), %ymm10
-; AVX-NEXT: vmovaps 592(%rdi), %xmm0
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm10[1,3],ymm0[6,5],ymm10[5,7]
-; AVX-NEXT: vmovaps 576(%rdi), %ymm1
+; AVX-NEXT: vmovaps 608(%rdi), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm10[1],ymm1[2,3],ymm10[4],ymm1[5,6],ymm10[7]
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6]
-; AVX-NEXT: vmovaps 640(%rdi), %ymm13
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm13[2,3,0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,0],ymm11[2,0],ymm13[5,4],ymm11[6,4]
-; AVX-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovaps 576(%rdi), %ymm0
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],mem[1,0]
+; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6]
+; AVX-NEXT: vmovaps 640(%rdi), %ymm1
+; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm1[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm13[2,0],ymm1[5,4],ymm13[6,4]
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,0],ymm0[2,0],ymm1[4,4],ymm0[6,4]
-; AVX-NEXT: vmovaps 112(%rdi), %xmm0
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm8[3,0],ymm0[6,4],ymm8[7,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm8[0,0],ymm0[2,0],ymm8[4,4],ymm0[6,4]
+; AVX-NEXT: vmovaps 112(%rdi), %xmm4
+; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,2],ymm0[0,3],ymm1[5,6],ymm0[4,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,2],ymm4[0,3],ymm1[5,6],ymm4[4,7]
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm2[5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm14[3,0],ymm0[6,4],ymm14[7,4]
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm14[0,0],ymm0[2,0],ymm14[4,4],ymm0[6,4]
-; AVX-NEXT: vmovaps 304(%rdi), %xmm2
-; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovaps 304(%rdi), %xmm4
+; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,2],ymm2[0,3],ymm1[5,6],ymm2[4,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,2],ymm4[0,3],ymm1[5,6],ymm4[4,7]
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -2734,124 +2739,132 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm15[3,0],ymm0[6,4],ymm15[7,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm15[0,0],ymm0[2,0],ymm15[4,4],ymm0[6,4]
-; AVX-NEXT: vmovaps 688(%rdi), %xmm3
-; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm6[2,0],ymm2[3,0],ymm6[6,4],ymm2[7,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,0],ymm0[2,0],ymm2[4,4],ymm0[6,4]
+; AVX-NEXT: vmovaps 688(%rdi), %xmm2
+; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,2],ymm3[0,3],ymm1[5,6],ymm3[4,7]
+; AVX-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,2],ymm2[0,3],ymm1[5,6],ymm2[4,7]
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm4[2,0],ymm1[3,0],ymm4[6,4],ymm1[7,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm0[2,0],ymm1[4,4],ymm0[6,4]
-; AVX-NEXT: vmovaps 16(%rdi), %xmm4
+; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm11[2,0],ymm10[3,0],ymm11[6,4],ymm10[7,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,0],ymm0[2,0],ymm10[4,4],ymm0[6,4]
+; AVX-NEXT: vmovaps 16(%rdi), %xmm6
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1],ymm7[2],ymm0[3,4],ymm7[5],ymm0[6,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,2],ymm4[0,3],ymm2[5,6],ymm4[4,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm0[2],ymm4[3,4],ymm0[5],ymm4[6,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,2],ymm6[0,3],ymm2[5,6],ymm6[4,7]
; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,3,1,4,6,7,5]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm5[2,0],ymm2[3,0],ymm5[6,4],ymm2[7,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0],ymm1[2,0],ymm2[4,4],ymm1[6,4]
-; AVX-NEXT: vmovaps 208(%rdi), %xmm5
+; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm7[2,0],ymm5[3,0],ymm7[6,4],ymm5[7,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm5[0,0],ymm1[2,0],ymm5[4,4],ymm1[6,4]
+; AVX-NEXT: vmovaps 208(%rdi), %xmm8
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1],ymm6[2],ymm1[3,4],ymm6[5],ymm1[6,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,2],ymm5[0,3],ymm3[5,6],ymm5[4,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,2],ymm8[0,3],ymm3[5,6],ymm8[4,7]
; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,3,1,4,6,7,5]
; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm8[2,0],ymm9[3,0],ymm8[6,4],ymm9[7,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm12[2,0],ymm9[3,0],ymm12[6,4],ymm9[7,4]
; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm9[0,0],ymm2[2,0],ymm9[4,4],ymm2[6,4]
-; AVX-NEXT: vmovaps 400(%rdi), %xmm8
+; AVX-NEXT: vmovaps 400(%rdi), %xmm12
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX-NEXT: vblendps {{.*#+}} ymm15 = ymm2[0,1],ymm12[2],ymm2[3,4],ymm12[5],ymm2[6,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm15[1,2],ymm8[0,3],ymm15[5,6],ymm8[4,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm15 = ymm7[0,1],ymm2[2],ymm7[3,4],ymm2[5],ymm7[6,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm15[1,2],ymm12[0,3],ymm15[5,6],ymm12[4,7]
; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,2,3,1,4,6,7,5]
; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1,2,3,4],ymm3[5,6,7]
; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm13[2,0],ymm11[3,0],ymm13[6,4],ymm11[7,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm11[0,0],ymm3[2,0],ymm11[4,4],ymm3[6,4]
-; AVX-NEXT: vmovaps 592(%rdi), %xmm9
+; AVX-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0],ymm13[3,0],ymm3[6,4],ymm13[7,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm13[0,0],ymm3[2,0],ymm13[4,4],ymm3[6,4]
+; AVX-NEXT: vmovaps 592(%rdi), %xmm13
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm3[0,1],ymm10[2],ymm3[3,4],ymm10[5],ymm3[6,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[1,2],ymm9[0,3],ymm14[5,6],ymm9[4,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm9[0,1],ymm3[2],ymm9[3,4],ymm3[5],ymm9[6,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[1,2],ymm13[0,3],ymm14[5,6],ymm13[4,7]
; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,2,3,1,4,6,7,5]
-; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm15[5,6,7]
-; AVX-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovups (%rsp), %ymm15 # 32-byte Reload
+; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0,1,2,3,4],ymm15[5,6,7]
+; AVX-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
; AVX-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm14 # 32-byte Folded Reload
; AVX-NEXT: # ymm14 = ymm15[0,1],mem[2],ymm15[3,4],mem[5],ymm15[6,7]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm13[1,0],ymm14[2,0],ymm13[5,4],ymm14[6,4]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm11[1,0],ymm14[2,0],ymm11[5,4],ymm14[6,4]
; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0],ymm15[0,3],ymm14[6,4],ymm15[4,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
; AVX-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
; AVX-NEXT: # ymm15 = ymm15[0,1],mem[0,3],ymm15[4,5],mem[4,7]
-; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm15[5,6,7]
-; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm7[0,1],ymm0[2],ymm7[3,4],ymm0[5],ymm7[6,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,0],ymm14[2,0],ymm4[5,4],ymm14[6,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm7[0,3],ymm0[6,4],ymm7[4,7]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm7 # 32-byte Folded Reload
-; AVX-NEXT: # ymm7 = ymm4[0,1],mem[0,3],ymm4[4,5],mem[4,7]
-; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3,4],ymm7[5,6,7]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload
-; AVX-NEXT: # ymm0 = ymm14[0,1],mem[2],ymm14[3,4],mem[5],ymm14[6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0,1,2,3,4],ymm15[5,6,7]
+; AVX-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovaps %ymm0, %ymm15
+; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm6[1,0],ymm14[2,0],ymm6[5,4],ymm14[6,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm15[0,3],ymm0[6,4],ymm15[4,7]
+; AVX-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
+; AVX-NEXT: # ymm10 = ymm10[0,1],mem[0,3],ymm10[4,5],mem[4,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3,4],ymm10[5,6,7]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload
+; AVX-NEXT: # ymm0 = ymm11[0,1],mem[2],ymm11[3,4],mem[5],ymm11[6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,0],ymm0[2,0],ymm4[5,4],ymm0[6,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm14[0,3],ymm0[6,4],ymm14[4,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm11[0,3],ymm0[6,4],ymm11[4,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm14 # 32-byte Folded Reload
-; AVX-NEXT: # ymm14 = ymm4[0,1],mem[0,3],ymm4[4,5],mem[4,7]
-; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3,4],ymm14[5,6,7]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm1[2],ymm6[3,4],ymm1[5],ymm6[6,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm5[1,0],ymm0[2,0],ymm5[5,4],ymm0[6,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm6[0,3],ymm0[6,4],ymm6[4,7]
+; AVX-NEXT: vshufps $196, (%rsp), %ymm4, %ymm11 # 32-byte Folded Reload
+; AVX-NEXT: # ymm11 = ymm4[0,1],mem[0,3],ymm4[4,5],mem[4,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3,4],ymm11[5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm8[1,0],ymm0[2,0],ymm8[5,4],ymm0[6,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[0,3],ymm0[6,4],ymm1[4,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX-NEXT: # ymm1 = ymm1[0,1],mem[0,3],ymm1[4,5],mem[4,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload
-; AVX-NEXT: # ymm1 = ymm6[0,1],mem[2],ymm6[3,4],mem[5],ymm6[6,7]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm1 # 32-byte Folded Reload
+; AVX-NEXT: # ymm1 = ymm8[0,1],mem[2],ymm8[3,4],mem[5],ymm8[6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,0],ymm1[2,0],ymm4[5,4],ymm1[6,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm6[0,3],ymm1[6,4],ymm6[4,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm8[0,3],ymm1[6,4],ymm8[4,7]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm8 # 32-byte Folded Reload
+; AVX-NEXT: # ymm8 = ymm4[0,1],mem[0,3],ymm4[4,5],mem[4,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5,6,7]
+; AVX-NEXT: vmovaps %ymm2, %ymm4
+; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm12[1,0],ymm8[2,0],ymm12[5,4],ymm8[6,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm4[0,3],ymm2[6,4],ymm4[4,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm6 # 32-byte Folded Reload
; AVX-NEXT: # ymm6 = ymm4[0,1],mem[0,3],ymm4[4,5],mem[4,7]
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm6[5,6,7]
-; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm12[0,1],ymm2[2],ymm12[3,4],ymm2[5],ymm12[6,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm8[1,0],ymm6[2,0],ymm8[5,4],ymm6[6,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm12[0,3],ymm2[6,4],ymm12[4,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm6[5,6,7]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload
+; AVX-NEXT: # ymm6 = ymm7[0,1],mem[2],ymm7[3,4],mem[5],ymm7[6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload
-; AVX-NEXT: # ymm5 = ymm4[0,1],mem[0,3],ymm4[4,5],mem[4,7]
-; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm5 # 32-byte Folded Reload
-; AVX-NEXT: # ymm5 = ymm6[0,1],mem[2],ymm6[3,4],mem[5],ymm6[6,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm4[1,0],ymm6[2,0],ymm4[5,4],ymm6[6,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0],ymm7[0,3],ymm6[6,4],ymm7[4,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm4[1,0],ymm5[2,0],ymm4[5,4],ymm5[6,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0],ymm6[0,3],ymm5[6,4],ymm6[4,7]
+; AVX-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm7 # 32-byte Folded Reload
+; AVX-NEXT: # ymm7 = ymm4[0,1],mem[0,3],ymm4[4,5],mem[4,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7]
+; AVX-NEXT: vmovaps %ymm3, %ymm4
+; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm3[0,1],ymm9[2],ymm3[3,4],ymm9[5],ymm3[6,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm13[1,0],ymm7[2,0],ymm13[5,4],ymm7[6,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0],ymm4[0,3],ymm3[6,4],ymm4[4,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm6 # 32-byte Folded Reload
-; AVX-NEXT: # ymm6 = ymm4[0,1],mem[0,3],ymm4[4,5],mem[4,7]
-; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7]
-; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1],ymm3[2],ymm10[3,4],ymm3[5],ymm10[6,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm9[1,0],ymm6[2,0],ymm9[5,4],ymm6[6,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0],ymm10[0,3],ymm3[6,4],ymm10[4,7]
-; AVX-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm4 # 32-byte Folded Reload
-; AVX-NEXT: # ymm4 = ymm11[0,1],mem[0,3],ymm11[4,5],mem[4,7]
+; AVX-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
+; AVX-NEXT: # ymm4 = ymm4[0,1],mem[0,3],ymm4[4,5],mem[4,7]
; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm4, 192(%rsi)
@@ -2886,14 +2899,15 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm4, 32(%rdx)
; AVX-NEXT: vmovaps %ymm3, 192(%rcx)
-; AVX-NEXT: vmovaps %ymm5, 224(%rcx)
+; AVX-NEXT: vmovaps %ymm6, 224(%rcx)
; AVX-NEXT: vmovaps %ymm2, 128(%rcx)
; AVX-NEXT: vmovaps %ymm1, 160(%rcx)
; AVX-NEXT: vmovaps %ymm0, 64(%rcx)
-; AVX-NEXT: vmovaps %ymm14, 96(%rcx)
-; AVX-NEXT: vmovaps %ymm7, (%rcx)
-; AVX-NEXT: vmovaps %ymm13, 32(%rcx)
-; AVX-NEXT: addq $1384, %rsp # imm = 0x568
+; AVX-NEXT: vmovaps %ymm11, 96(%rcx)
+; AVX-NEXT: vmovaps %ymm10, (%rcx)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm0, 32(%rcx)
+; AVX-NEXT: addq $1512, %rsp # imm = 0x5E8
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll
index c08442f9d9d01..571e1435b75f1 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll
@@ -359,25 +359,25 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3]
; AVX-NEXT: vmovaps (%rdi), %xmm3
; AVX-NEXT: vmovaps 32(%rdi), %xmm4
-; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1],xmm4[2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],mem[2],xmm5[3]
-; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,2,3,3]
-; AVX-NEXT: vmovaps 64(%rdi), %xmm6
-; AVX-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[0]
+; AVX-NEXT: vmovaps 64(%rdi), %xmm5
+; AVX-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1],xmm4[2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],mem[2],xmm6[3]
+; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm6[1,2,3,3]
+; AVX-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0,1,2],xmm5[0]
; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
; AVX-NEXT: vextractf128 $1, %ymm7, %xmm7
; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],mem[2],xmm7[3]
; AVX-NEXT: vshufpd {{.*#+}} xmm7 = xmm7[1,0]
-; AVX-NEXT: vinsertps {{.*#+}} xmm7 = xmm7[0,1,2],xmm6[1]
+; AVX-NEXT: vinsertps {{.*#+}} xmm7 = xmm7[0,1,2],xmm5[1]
; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,0],mem[1,3]
-; AVX-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm6[2]
+; AVX-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[2]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm6[3]
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[3]
; AVX-NEXT: vmovaps %xmm2, (%rsi)
-; AVX-NEXT: vmovaps %xmm5, (%rdx)
+; AVX-NEXT: vmovaps %xmm6, (%rdx)
; AVX-NEXT: vmovaps %xmm7, (%rcx)
; AVX-NEXT: vmovaps %xmm3, (%r8)
; AVX-NEXT: vmovaps %xmm0, (%r9)
@@ -1011,23 +1011,26 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; AVX512-NEXT: vpbroadcastd 144(%rdi), %ymm4
; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
-; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,7,12,17,22,27,0,0]
-; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm4
-; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,1,2,3,4,5,8,13]
-; AVX512-NEXT: vmovdqa 128(%rdi), %ymm6
-; AVX512-NEXT: vpermi2d %ymm6, %ymm4, %ymm5
-; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [3,8,13,18,23,28,0,0]
-; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm4
-; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,9,14]
-; AVX512-NEXT: vpermi2d %ymm6, %ymm4, %ymm7
-; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [4,9,14,19,24,29,0,0]
-; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm4
-; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,2,3,4,5,10,15]
-; AVX512-NEXT: vpermi2d %ymm6, %ymm4, %ymm1
+; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,5,0,5,0,5,0,5]
+; AVX512-NEXT: vmovdqa 128(%rdi), %ymm5
+; AVX512-NEXT: vpermd %ymm5, %ymm4, %ymm4
+; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm6 = [2,7,12,17,22,27,0,0]
+; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm6
+; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7]
+; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm6 = [1,6,1,6,1,6,1,6]
+; AVX512-NEXT: vpermd %ymm5, %ymm6, %ymm6
+; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm7 = [3,8,13,18,23,28,0,0]
+; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm7
+; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7]
+; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm7 = [2,7,2,7,2,7,2,7]
+; AVX512-NEXT: vpermd %ymm5, %ymm7, %ymm5
+; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm7 = [4,9,14,19,24,29,0,0]
+; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm7
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm5[6,7]
; AVX512-NEXT: vmovdqa %ymm0, (%rsi)
; AVX512-NEXT: vmovdqa %ymm3, (%rdx)
-; AVX512-NEXT: vmovdqa %ymm5, (%rcx)
-; AVX512-NEXT: vmovdqa %ymm7, (%r8)
+; AVX512-NEXT: vmovdqa %ymm4, (%rcx)
+; AVX512-NEXT: vmovdqa %ymm6, (%r8)
; AVX512-NEXT: vmovdqa %ymm1, (%r9)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -1044,23 +1047,26 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; AVX512-FCP-NEXT: vpbroadcastd 144(%rdi), %ymm4
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,7,12,17,22,27,0,0]
-; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm4
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,1,2,3,4,5,8,13]
-; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm6
-; AVX512-FCP-NEXT: vpermi2d %ymm6, %ymm4, %ymm5
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [3,8,13,18,23,28,0,0]
-; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm4
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,9,14]
-; AVX512-FCP-NEXT: vpermi2d %ymm6, %ymm4, %ymm7
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [4,9,14,19,24,29,0,0]
-; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm4
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,2,3,4,5,10,15]
-; AVX512-FCP-NEXT: vpermi2d %ymm6, %ymm4, %ymm1
+; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,5,0,5,0,5,0,5]
+; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm5
+; AVX512-FCP-NEXT: vpermd %ymm5, %ymm4, %ymm4
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [2,7,12,17,22,27,0,0]
+; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm6
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7]
+; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm6 = [1,6,1,6,1,6,1,6]
+; AVX512-FCP-NEXT: vpermd %ymm5, %ymm6, %ymm6
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [3,8,13,18,23,28,0,0]
+; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm7
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7]
+; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm7 = [2,7,2,7,2,7,2,7]
+; AVX512-FCP-NEXT: vpermd %ymm5, %ymm7, %ymm5
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [4,9,14,19,24,29,0,0]
+; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm7
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm5[6,7]
; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rsi)
; AVX512-FCP-NEXT: vmovdqa %ymm3, (%rdx)
-; AVX512-FCP-NEXT: vmovdqa %ymm5, (%rcx)
-; AVX512-FCP-NEXT: vmovdqa %ymm7, (%r8)
+; AVX512-FCP-NEXT: vmovdqa %ymm4, (%rcx)
+; AVX512-FCP-NEXT: vmovdqa %ymm6, (%r8)
; AVX512-FCP-NEXT: vmovdqa %ymm1, (%r9)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
@@ -1077,23 +1083,26 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; AVX512DQ-NEXT: vpbroadcastd 144(%rdi), %ymm4
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,7,12,17,22,27,0,0]
-; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm4
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,1,2,3,4,5,8,13]
-; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm6
-; AVX512DQ-NEXT: vpermi2d %ymm6, %ymm4, %ymm5
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm4 = [3,8,13,18,23,28,0,0]
-; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm4
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,9,14]
-; AVX512DQ-NEXT: vpermi2d %ymm6, %ymm4, %ymm7
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm4 = [4,9,14,19,24,29,0,0]
-; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm4
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,2,3,4,5,10,15]
-; AVX512DQ-NEXT: vpermi2d %ymm6, %ymm4, %ymm1
+; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,5,0,5,0,5,0,5]
+; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm5
+; AVX512DQ-NEXT: vpermd %ymm5, %ymm4, %ymm4
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm6 = [2,7,12,17,22,27,0,0]
+; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm6
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm6 = [1,6,1,6,1,6,1,6]
+; AVX512DQ-NEXT: vpermd %ymm5, %ymm6, %ymm6
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm7 = [3,8,13,18,23,28,0,0]
+; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm7
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7]
+; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm7 = [2,7,2,7,2,7,2,7]
+; AVX512DQ-NEXT: vpermd %ymm5, %ymm7, %ymm5
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm7 = [4,9,14,19,24,29,0,0]
+; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm7
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm5[6,7]
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rsi)
; AVX512DQ-NEXT: vmovdqa %ymm3, (%rdx)
-; AVX512DQ-NEXT: vmovdqa %ymm5, (%rcx)
-; AVX512DQ-NEXT: vmovdqa %ymm7, (%r8)
+; AVX512DQ-NEXT: vmovdqa %ymm4, (%rcx)
+; AVX512DQ-NEXT: vmovdqa %ymm6, (%r8)
; AVX512DQ-NEXT: vmovdqa %ymm1, (%r9)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
@@ -1110,23 +1119,26 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; AVX512DQ-FCP-NEXT: vpbroadcastd 144(%rdi), %ymm4
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,7,12,17,22,27,0,0]
-; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm4
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,1,2,3,4,5,8,13]
-; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm6
-; AVX512DQ-FCP-NEXT: vpermi2d %ymm6, %ymm4, %ymm5
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [3,8,13,18,23,28,0,0]
-; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm4
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,9,14]
-; AVX512DQ-FCP-NEXT: vpermi2d %ymm6, %ymm4, %ymm7
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [4,9,14,19,24,29,0,0]
-; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm4
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,2,3,4,5,10,15]
-; AVX512DQ-FCP-NEXT: vpermi2d %ymm6, %ymm4, %ymm1
+; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,5,0,5,0,5,0,5]
+; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm5
+; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm4, %ymm4
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [2,7,12,17,22,27,0,0]
+; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm6
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm6 = [1,6,1,6,1,6,1,6]
+; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm6, %ymm6
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [3,8,13,18,23,28,0,0]
+; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm7
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7]
+; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm7 = [2,7,2,7,2,7,2,7]
+; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm7, %ymm5
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [4,9,14,19,24,29,0,0]
+; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm7
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm5[6,7]
; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rsi)
; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, (%rcx)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, (%r8)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, (%rcx)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, (%r8)
; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, (%r9)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
@@ -1143,23 +1155,26 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; AVX512BW-NEXT: vpbroadcastd 144(%rdi), %ymm4
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
-; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,7,12,17,22,27,0,0]
-; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm4
-; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,1,2,3,4,5,8,13]
-; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm6
-; AVX512BW-NEXT: vpermi2d %ymm6, %ymm4, %ymm5
-; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [3,8,13,18,23,28,0,0]
-; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm4
-; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,9,14]
-; AVX512BW-NEXT: vpermi2d %ymm6, %ymm4, %ymm7
-; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [4,9,14,19,24,29,0,0]
-; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm4
-; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,2,3,4,5,10,15]
-; AVX512BW-NEXT: vpermi2d %ymm6, %ymm4, %ymm1
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,5,0,5,0,5,0,5]
+; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm5
+; AVX512BW-NEXT: vpermd %ymm5, %ymm4, %ymm4
+; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [2,7,12,17,22,27,0,0]
+; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm6
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7]
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm6 = [1,6,1,6,1,6,1,6]
+; AVX512BW-NEXT: vpermd %ymm5, %ymm6, %ymm6
+; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [3,8,13,18,23,28,0,0]
+; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm7
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7]
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm7 = [2,7,2,7,2,7,2,7]
+; AVX512BW-NEXT: vpermd %ymm5, %ymm7, %ymm5
+; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [4,9,14,19,24,29,0,0]
+; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm7
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm5[6,7]
; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi)
; AVX512BW-NEXT: vmovdqa %ymm3, (%rdx)
-; AVX512BW-NEXT: vmovdqa %ymm5, (%rcx)
-; AVX512BW-NEXT: vmovdqa %ymm7, (%r8)
+; AVX512BW-NEXT: vmovdqa %ymm4, (%rcx)
+; AVX512BW-NEXT: vmovdqa %ymm6, (%r8)
; AVX512BW-NEXT: vmovdqa %ymm1, (%r9)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
@@ -1176,23 +1191,26 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; AVX512BW-FCP-NEXT: vpbroadcastd 144(%rdi), %ymm4
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,7,12,17,22,27,0,0]
-; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm4
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,1,2,3,4,5,8,13]
-; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm6
-; AVX512BW-FCP-NEXT: vpermi2d %ymm6, %ymm4, %ymm5
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [3,8,13,18,23,28,0,0]
-; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm4
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,9,14]
-; AVX512BW-FCP-NEXT: vpermi2d %ymm6, %ymm4, %ymm7
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [4,9,14,19,24,29,0,0]
-; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm4
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,2,3,4,5,10,15]
-; AVX512BW-FCP-NEXT: vpermi2d %ymm6, %ymm4, %ymm1
+; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,5,0,5,0,5,0,5]
+; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm5
+; AVX512BW-FCP-NEXT: vpermd %ymm5, %ymm4, %ymm4
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [2,7,12,17,22,27,0,0]
+; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm6
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7]
+; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm6 = [1,6,1,6,1,6,1,6]
+; AVX512BW-FCP-NEXT: vpermd %ymm5, %ymm6, %ymm6
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [3,8,13,18,23,28,0,0]
+; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm7
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7]
+; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm7 = [2,7,2,7,2,7,2,7]
+; AVX512BW-FCP-NEXT: vpermd %ymm5, %ymm7, %ymm5
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [4,9,14,19,24,29,0,0]
+; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm7
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm5[6,7]
; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rsi)
; AVX512BW-FCP-NEXT: vmovdqa %ymm3, (%rdx)
-; AVX512BW-FCP-NEXT: vmovdqa %ymm5, (%rcx)
-; AVX512BW-FCP-NEXT: vmovdqa %ymm7, (%r8)
+; AVX512BW-FCP-NEXT: vmovdqa %ymm4, (%rcx)
+; AVX512BW-FCP-NEXT: vmovdqa %ymm6, (%r8)
; AVX512BW-FCP-NEXT: vmovdqa %ymm1, (%r9)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
@@ -1209,23 +1227,26 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; AVX512DQ-BW-NEXT: vpbroadcastd 144(%rdi), %ymm4
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
-; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,7,12,17,22,27,0,0]
-; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm4
-; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,1,2,3,4,5,8,13]
-; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm6
-; AVX512DQ-BW-NEXT: vpermi2d %ymm6, %ymm4, %ymm5
-; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [3,8,13,18,23,28,0,0]
-; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm4
-; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,9,14]
-; AVX512DQ-BW-NEXT: vpermi2d %ymm6, %ymm4, %ymm7
-; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [4,9,14,19,24,29,0,0]
-; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm4
-; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,2,3,4,5,10,15]
-; AVX512DQ-BW-NEXT: vpermi2d %ymm6, %ymm4, %ymm1
+; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,5,0,5,0,5,0,5]
+; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm5
+; AVX512DQ-BW-NEXT: vpermd %ymm5, %ymm4, %ymm4
+; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [2,7,12,17,22,27,0,0]
+; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm6
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm6 = [1,6,1,6,1,6,1,6]
+; AVX512DQ-BW-NEXT: vpermd %ymm5, %ymm6, %ymm6
+; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [3,8,13,18,23,28,0,0]
+; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm7
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7]
+; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm7 = [2,7,2,7,2,7,2,7]
+; AVX512DQ-BW-NEXT: vpermd %ymm5, %ymm7, %ymm5
+; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [4,9,14,19,24,29,0,0]
+; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm7
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm5[6,7]
; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rsi)
; AVX512DQ-BW-NEXT: vmovdqa %ymm3, (%rdx)
-; AVX512DQ-BW-NEXT: vmovdqa %ymm5, (%rcx)
-; AVX512DQ-BW-NEXT: vmovdqa %ymm7, (%r8)
+; AVX512DQ-BW-NEXT: vmovdqa %ymm4, (%rcx)
+; AVX512DQ-BW-NEXT: vmovdqa %ymm6, (%r8)
; AVX512DQ-BW-NEXT: vmovdqa %ymm1, (%r9)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
@@ -1242,23 +1263,26 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
; AVX512DQ-BW-FCP-NEXT: vpbroadcastd 144(%rdi), %ymm4
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,7,12,17,22,27,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm4
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,1,2,3,4,5,8,13]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm6
-; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm6, %ymm4, %ymm5
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [3,8,13,18,23,28,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm4
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,9,14]
-; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm6, %ymm4, %ymm7
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [4,9,14,19,24,29,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm4
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,1,2,3,4,5,10,15]
-; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm6, %ymm4, %ymm1
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,5,0,5,0,5,0,5]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm5
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm5, %ymm4, %ymm4
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [2,7,12,17,22,27,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm6
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm6 = [1,6,1,6,1,6,1,6]
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm5, %ymm6, %ymm6
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [3,8,13,18,23,28,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm7
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm7 = [2,7,2,7,2,7,2,7]
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm5, %ymm7, %ymm5
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [4,9,14,19,24,29,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm7
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm5[6,7]
; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rsi)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm3, (%rdx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm5, (%rcx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm7, (%r8)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm4, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm6, (%r8)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm1, (%r9)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
@@ -1563,9 +1587,9 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm2[1,3],ymm1[6,5],ymm2[5,7]
; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1],ymm7[2,3],ymm2[4,5],ymm7[6,7]
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm4[3,0],ymm1[6,4],ymm4[7,4]
-; AVX-NEXT: vmovaps 160(%rdi), %xmm9
-; AVX-NEXT: vmovaps 192(%rdi), %xmm8
-; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1],xmm8[2,3]
+; AVX-NEXT: vmovaps 192(%rdi), %xmm9
+; AVX-NEXT: vmovaps 160(%rdi), %xmm8
+; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm8[0,1],xmm9[2,3]
; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2],xmm4[3]
; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,2,3,3]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3,4,5,6,7]
@@ -1605,7 +1629,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm1[2,0],ymm4[1,0],ymm1[6,4],ymm4[5,4]
; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm13[6,7]
-; AVX-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm7[0,0],ymm2[3,0],ymm7[4,4],ymm2[7,4]
; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,0],ymm2[2,2],ymm9[6,4],ymm2[6,6]
; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm8[3,0],mem[1,3]
@@ -3082,11 +3106,11 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm8[1,3],ymm0[6,5],ymm8[5,7]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7]
; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4]
-; AVX-NEXT: vmovaps 160(%rdi), %xmm1
+; AVX-NEXT: vmovaps 192(%rdi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovaps 192(%rdi), %xmm0
+; AVX-NEXT: vmovaps 160(%rdi), %xmm0
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3]
; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3]
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm2[3,4,5,6,7]
@@ -3098,11 +3122,11 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm14[1,3],ymm0[6,5],ymm14[5,7]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1],ymm5[2,3],ymm14[4,5],ymm5[6,7]
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4]
-; AVX-NEXT: vmovaps 480(%rdi), %xmm2
+; AVX-NEXT: vmovaps 512(%rdi), %xmm2
; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovaps 512(%rdi), %xmm1
+; AVX-NEXT: vmovaps 480(%rdi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3]
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
@@ -3130,11 +3154,11 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm10[1,3],ymm0[6,5],ymm10[5,7]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm4[2,3],ymm10[4,5],ymm4[6,7]
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4]
-; AVX-NEXT: vmovaps 320(%rdi), %xmm3
+; AVX-NEXT: vmovaps 352(%rdi), %xmm3
; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovaps 352(%rdi), %xmm1
+; AVX-NEXT: vmovaps 320(%rdi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3]
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
@@ -3203,8 +3227,8 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = mem[0,1],xmm0[2,3]
+; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm4[0,0],ymm8[3,0],ymm4[4,4],ymm8[7,4]
; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm8[2,2],ymm15[6,4],ymm8[6,6]
@@ -3215,8 +3239,8 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = mem[0,1],xmm0[2,3]
+; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm3[0,0],ymm14[3,0],ymm3[4,4],ymm14[7,4]
; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm14[2,2],ymm15[6,4],ymm14[6,6]
@@ -3239,8 +3263,8 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = mem[0,1],xmm0[2,3]
+; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm1[0,0],ymm10[3,0],ymm1[4,4],ymm10[7,4]
; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm10[2,2],ymm15[6,4],ymm10[6,6]
@@ -6144,11 +6168,11 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm5[1,3],ymm0[6,5],ymm5[5,7]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7]
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4]
-; AVX-NEXT: vmovaps 160(%rdi), %xmm4
+; AVX-NEXT: vmovaps 192(%rdi), %xmm4
; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovaps 192(%rdi), %xmm1
+; AVX-NEXT: vmovaps 160(%rdi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3]
; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3]
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
@@ -6161,11 +6185,11 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm5[1,3],ymm0[6,5],ymm5[5,7]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3],ymm5[4,5],ymm1[6,7]
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4]
-; AVX-NEXT: vmovaps 480(%rdi), %xmm2
+; AVX-NEXT: vmovaps 512(%rdi), %xmm2
; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovaps 512(%rdi), %xmm1
+; AVX-NEXT: vmovaps 480(%rdi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3]
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
@@ -6178,11 +6202,11 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm1[1,3],ymm0[6,5],ymm1[5,7]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm9[2,3],ymm1[4,5],ymm9[6,7]
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4]
-; AVX-NEXT: vmovaps 800(%rdi), %xmm2
+; AVX-NEXT: vmovaps 832(%rdi), %xmm2
; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovaps 832(%rdi), %xmm1
+; AVX-NEXT: vmovaps 800(%rdi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3]
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
@@ -6194,11 +6218,11 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm13[1,3],ymm0[6,5],ymm13[5,7]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1],ymm15[2,3],ymm13[4,5],ymm15[6,7]
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4]
-; AVX-NEXT: vmovaps 1120(%rdi), %xmm2
+; AVX-NEXT: vmovaps 1152(%rdi), %xmm2
; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovaps 1152(%rdi), %xmm1
+; AVX-NEXT: vmovaps 1120(%rdi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3]
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
@@ -6226,11 +6250,11 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm14[1,3],ymm0[6,5],ymm14[5,7]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1],ymm7[2,3],ymm14[4,5],ymm7[6,7]
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4]
-; AVX-NEXT: vmovaps 320(%rdi), %xmm2
+; AVX-NEXT: vmovaps 352(%rdi), %xmm2
; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovaps 352(%rdi), %xmm1
+; AVX-NEXT: vmovaps 320(%rdi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3]
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
@@ -6243,11 +6267,11 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1],ymm6[2,3],ymm12[4,5],ymm6[6,7]
; AVX-NEXT: vmovaps %ymm12, %ymm7
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4]
-; AVX-NEXT: vmovaps 640(%rdi), %xmm2
+; AVX-NEXT: vmovaps 672(%rdi), %xmm2
; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovaps 672(%rdi), %xmm1
+; AVX-NEXT: vmovaps 640(%rdi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3]
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
@@ -6261,11 +6285,11 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm10[1,3],ymm0[6,5],ymm10[5,7]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm1[2,3],ymm10[4,5],ymm1[6,7]
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4]
-; AVX-NEXT: vmovaps 960(%rdi), %xmm2
+; AVX-NEXT: vmovaps 992(%rdi), %xmm2
; AVX-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill
-; AVX-NEXT: vmovaps 992(%rdi), %xmm1
+; AVX-NEXT: vmovaps 960(%rdi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2],xmm1[3]
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,3,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
@@ -6418,8 +6442,8 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = mem[0,1],xmm0[2,3]
+; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm5[0,0],ymm1[3,0],ymm5[4,4],ymm1[7,4]
@@ -6433,8 +6457,8 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = mem[0,1],xmm0[2,3]
+; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm2[0,0],ymm1[3,0],ymm2[4,4],ymm1[7,4]
@@ -6448,8 +6472,8 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = mem[0,1],xmm0[2,3]
+; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm1[0,0],ymm2[3,0],ymm1[4,4],ymm2[7,4]
@@ -6463,8 +6487,8 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = mem[0,1],xmm0[2,3]
+; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,0],ymm1[3,0],ymm15[4,4],ymm1[7,4]
@@ -6477,8 +6501,8 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
-; AVX-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = mem[0,1],xmm0[2,3]
+; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm1[0,0],ymm10[3,0],ymm1[4,4],ymm10[7,4]
; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm10[2,2],ymm15[6,4],ymm10[6,6]
@@ -6489,8 +6513,8 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = mem[0,1],xmm0[2,3]
+; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm7, %ymm14
; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm9[0,0],ymm7[3,0],ymm9[4,4],ymm7[7,4]
@@ -6503,8 +6527,8 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = mem[0,1],xmm0[2,3]
+; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm8, %ymm7
; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm6[0,0],ymm8[3,0],ymm6[4,4],ymm8[7,4]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll
index 85ed61811af53..bd7730997a2e5 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll
@@ -469,13 +469,14 @@ define void @load_i32_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX-NEXT: vmovaps 32(%rdi), %ymm0
; AVX-NEXT: vmovaps (%rdi), %ymm1
; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
-; AVX-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm2[0,1],xmm3[2,3]
-; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,2],xmm3[0,3]
+; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX-NEXT: vmovaps (%rdi), %xmm3
+; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1],xmm2[2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,2],xmm2[0,3]
; AVX-NEXT: vmovaps 64(%rdi), %xmm5
; AVX-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm5[2]
-; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0],xmm3[3,0]
-; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[1,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm2[3,0],xmm3[1,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm3[2,0],xmm2[1,3]
; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm5[3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
@@ -491,11 +492,10 @@ define void @load_i32_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX-NEXT: vblendps {{.*#+}} xmm9 = xmm5[0,1],xmm6[2,3]
; AVX-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,0,2]
; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3]
; AVX-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero
; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm6[6,7]
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3]
-; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm5[2,3]
; AVX-NEXT: vmovaps %xmm4, (%rsi)
; AVX-NEXT: vmovaps %xmm2, (%rdx)
; AVX-NEXT: vmovaps %xmm3, (%rcx)
@@ -983,74 +983,76 @@ define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX-LABEL: load_i32_stride6_vf8:
; AVX: # %bb.0:
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: vmovapd 160(%rdi), %ymm3
-; AVX-NEXT: vmovapd 128(%rdi), %ymm4
+; AVX-NEXT: vmovapd 160(%rdi), %ymm2
+; AVX-NEXT: vmovapd 128(%rdi), %ymm3
; AVX-NEXT: vmovaps 96(%rdi), %ymm0
-; AVX-NEXT: vmovaps 64(%rdi), %ymm1
+; AVX-NEXT: vmovaps 64(%rdi), %ymm5
; AVX-NEXT: vmovaps 32(%rdi), %ymm6
; AVX-NEXT: vmovaps (%rdi), %ymm7
-; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm6[4,5],ymm7[6,7]
-; AVX-NEXT: vextractf128 $1, %ymm5, %xmm8
-; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm8[2,3]
-; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm8[0,3]
-; AVX-NEXT: vinsertf128 $1, 96(%rdi), %ymm1, %ymm9
+; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm6[4,5],ymm7[6,7]
+; AVX-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX-NEXT: vmovaps (%rdi), %xmm8
+; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm4[2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,3]
+; AVX-NEXT: vinsertf128 $1, 96(%rdi), %ymm5, %ymm9
; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm0[2,0],ymm9[0,0],ymm0[6,4],ymm9[4,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0],ymm1[2,2],ymm10[6,4],ymm1[6,6]
-; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm10[3,4,5,6,7]
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm4[2,3],ymm3[0,1]
-; AVX-NEXT: vshufpd {{.*#+}} ymm11 = ymm10[0],ymm4[1],ymm10[3],ymm4[2]
+; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0],ymm5[2,2],ymm10[6,4],ymm5[6,6]
+; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm10[3,4,5,6,7]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm3[2,3],ymm2[0,1]
+; AVX-NEXT: vshufpd {{.*#+}} ymm11 = ymm10[0],ymm3[1],ymm10[3],ymm3[2]
; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm11[6,7]
-; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,0],xmm8[3,0]
-; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,2],xmm8[1,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm11[6,7]
+; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm4[3,0],xmm8[1,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm8[2,0],xmm4[1,3]
; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm0[3,0],ymm9[1,0],ymm0[7,4],ymm9[5,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0],ymm1[2,3],ymm8[6,4],ymm1[6,7]
-; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm8[3,4,5,6,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm10[3,1],ymm4[1,3],ymm10[7,5],ymm4[5,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0],ymm5[2,3],ymm8[6,4],ymm5[6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm8[3,4,5,6,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm10[3,1],ymm3[1,3],ymm10[7,5],ymm3[5,7]
; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6,7]
-; AVX-NEXT: vblendpd {{.*#+}} ymm8 = ymm4[0,1],ymm3[2],ymm4[3]
+; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm8[6,7]
+; AVX-NEXT: vblendpd {{.*#+}} ymm8 = ymm3[0,1],ymm2[2],ymm3[3]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm8[2,3,0,1]
; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm9[0,0],ymm8[2,0],ymm9[4,4],ymm8[6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0,1],ymm7[2,3],ymm6[4,5,6,7]
; AVX-NEXT: vextractf128 $1, %ymm7, %xmm11
; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm7[2,0],xmm11[2,3]
-; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm0[2,1],ymm1[2,0],ymm0[6,5],ymm1[6,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm0[2,1],ymm5[2,0],ymm0[6,5],ymm5[6,4]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3,0,1]
; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm12[3,4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm10[5,6,7]
; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,1],ymm8[3,1],ymm9[4,5],ymm8[7,5]
; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm7[3,1],xmm11[3,3]
-; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm0[3,1],ymm1[2,1],ymm0[7,5],ymm1[6,5]
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3,0,1]
-; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3,4,5,6,7]
-; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5,6,7]
-; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3]
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm3[2,3,0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm3[2,0],ymm4[0,0],ymm3[6,4],ymm4[4,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm4[0,2],ymm8[2,0],ymm4[4,6],ymm8[6,4]
-; AVX-NEXT: vmovaps 32(%rdi), %xmm9
-; AVX-NEXT: vshufps {{.*#+}} xmm10 = xmm9[2,2,3,3]
-; AVX-NEXT: vmovaps 16(%rdi), %xmm11
-; AVX-NEXT: vblendps {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3]
-; AVX-NEXT: vmovapd 80(%rdi), %xmm12
-; AVX-NEXT: vshufpd {{.*#+}} ymm13 = ymm12[1],ymm1[0],ymm12[2],ymm1[3]
+; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm0[3,1],ymm5[2,1],ymm0[7,5],ymm5[6,5]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,0,1]
+; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3,4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm8[5,6,7]
+; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm2[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm2[2,0],ymm3[0,0],ymm2[6,4],ymm3[4,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm3[0,2],ymm7[2,0],ymm3[4,6],ymm7[6,4]
+; AVX-NEXT: vmovaps 32(%rdi), %xmm8
+; AVX-NEXT: vshufps {{.*#+}} xmm9 = xmm8[2,2,3,3]
+; AVX-NEXT: vmovaps 16(%rdi), %xmm10
+; AVX-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2,3]
+; AVX-NEXT: vmovaps 64(%rdi), %xmm11
+; AVX-NEXT: vmovaps 80(%rdi), %xmm12
+; AVX-NEXT: vshufps {{.*#+}} xmm13 = xmm12[2,0],xmm11[0,0]
; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm0[0,1],ymm13[2,0],ymm0[4,5],ymm13[6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm13[2,3,4,5,6,7]
-; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3,4],ymm8[5,6,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,0],ymm4[1,0],ymm3[7,4],ymm4[5,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,3],ymm3[2,0],ymm4[4,7],ymm3[6,4]
-; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm11[0,1],xmm9[2,3]
-; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,3,2,3]
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm12[3,1],ymm1[1,3],ymm12[7,5],ymm1[5,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm1[2,0],ymm0[5,5],ymm1[6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
-; AVX-NEXT: vmovaps %ymm2, (%rsi)
-; AVX-NEXT: vmovaps %ymm5, (%rdx)
+; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm13[2,3,4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5,6,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,0],ymm3[1,0],ymm2[7,4],ymm3[5,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm3[0,3],ymm2[2,0],ymm3[4,7],ymm2[6,4]
+; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm10[0,1],xmm8[2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,3,2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm12[3,0],xmm11[1,0]
+; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm8[2,0],ymm0[5,5],ymm8[6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
+; AVX-NEXT: vmovaps %ymm1, (%rsi)
+; AVX-NEXT: vmovaps %ymm4, (%rdx)
; AVX-NEXT: vmovaps %ymm6, (%rcx)
-; AVX-NEXT: vmovaps %ymm7, (%r8)
-; AVX-NEXT: vmovaps %ymm8, (%r9)
+; AVX-NEXT: vmovaps %ymm5, (%r8)
+; AVX-NEXT: vmovaps %ymm7, (%r9)
; AVX-NEXT: vmovaps %ymm0, (%rax)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
@@ -1287,360 +1289,392 @@ define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-LABEL: load_i32_stride6_vf8:
; AVX512: # %bb.0:
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: vmovdqa 128(%rdi), %ymm0
-; AVX512-NEXT: vmovdqa 160(%rdi), %ymm1
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0]
-; AVX512-NEXT: vmovdqa64 (%rdi), %zmm4
-; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm5
-; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm6
-; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm3
-; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10]
-; AVX512-NEXT: vpermi2d %ymm2, %ymm3, %ymm7
-; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,7,13,19,25,31,0,0]
-; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm3
-; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11]
-; AVX512-NEXT: vpermi2d %ymm2, %ymm3, %ymm8
-; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12]
-; AVX512-NEXT: vpermd %zmm6, %zmm2, %zmm2
-; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,8,14,20,26,0,0,0]
-; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm3
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
-; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13]
-; AVX512-NEXT: vpermd %zmm6, %zmm3, %zmm3
-; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm6 = [3,9,15,21,27,0,0,0]
-; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm6
-; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
-; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0]
-; AVX512-NEXT: vpermi2d %zmm4, %zmm5, %zmm1
-; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14]
-; AVX512-NEXT: vpermi2d %ymm0, %ymm1, %ymm6
-; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0]
-; AVX512-NEXT: vpermi2d %zmm4, %zmm5, %zmm1
-; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15]
-; AVX512-NEXT: vpermi2d %ymm0, %ymm1, %ymm4
-; AVX512-NEXT: vmovdqa %ymm7, (%rsi)
-; AVX512-NEXT: vmovdqa %ymm8, (%rdx)
-; AVX512-NEXT: vmovdqa %ymm2, (%rcx)
-; AVX512-NEXT: vmovdqa %ymm3, (%r8)
-; AVX512-NEXT: vmovdqa %ymm6, (%r9)
-; AVX512-NEXT: vmovdqa %ymm4, (%rax)
+; AVX512-NEXT: vmovdqa 128(%rdi), %ymm1
+; AVX512-NEXT: vmovdqa 160(%rdi), %ymm2
+; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm0 = [4,2,4,2,4,2,4,2]
+; AVX512-NEXT: vpermd %ymm3, %ymm0, %ymm0
+; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,6,12,18,24,30,0,0]
+; AVX512-NEXT: vmovdqa64 (%rdi), %zmm5
+; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm6
+; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm7
+; AVX512-NEXT: vpermi2d %zmm6, %zmm5, %zmm4
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
+; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm4 = [5,3,5,3,5,3,5,3]
+; AVX512-NEXT: vpermd %ymm3, %ymm4, %ymm3
+; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,7,13,19,25,31,0,0]
+; AVX512-NEXT: vpermi2d %zmm6, %zmm5, %zmm4
+; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
+; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,16,22,28]
+; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm8 = [2,8,14,20,26,0,0,0]
+; AVX512-NEXT: vpermi2d %zmm6, %zmm5, %zmm8
+; AVX512-NEXT: vpermt2d %zmm7, %zmm4, %zmm8
+; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,17,23,29]
+; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm9 = [3,9,15,21,27,0,0,0]
+; AVX512-NEXT: vpermi2d %zmm6, %zmm5, %zmm9
+; AVX512-NEXT: vpermt2d %zmm7, %zmm4, %zmm9
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
+; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,2,0,6,0,2,0,6]
+; AVX512-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX512-NEXT: vpermd %ymm1, %ymm2, %ymm2
+; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [20,26,0,6,12,0,0,0]
+; AVX512-NEXT: vpermi2d %zmm5, %zmm6, %zmm4
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7]
+; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,3,1,7,0,3,1,7]
+; AVX512-NEXT: # ymm4 = mem[0,1,0,1]
+; AVX512-NEXT: vpermd %ymm1, %ymm4, %ymm1
+; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [21,27,1,7,13,0,0,0]
+; AVX512-NEXT: vpermi2d %zmm5, %zmm6, %zmm4
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7]
+; AVX512-NEXT: vmovdqa %ymm0, (%rsi)
+; AVX512-NEXT: vmovdqa %ymm3, (%rdx)
+; AVX512-NEXT: vmovdqa %ymm8, (%rcx)
+; AVX512-NEXT: vmovdqa %ymm9, (%r8)
+; AVX512-NEXT: vmovdqa %ymm2, (%r9)
+; AVX512-NEXT: vmovdqa %ymm1, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: load_i32_stride6_vf8:
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm0
-; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm1
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0]
-; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm4
-; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5
-; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6
-; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10]
-; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm3, %ymm7
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,7,13,19,25,31,0,0]
-; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11]
-; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm3, %ymm8
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12]
-; AVX512-FCP-NEXT: vpermd %zmm6, %zmm2, %zmm2
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,8,14,20,26,0,0,0]
-; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13]
-; AVX512-FCP-NEXT: vpermd %zmm6, %zmm3, %zmm3
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [3,9,15,21,27,0,0,0]
-; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm6
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0]
-; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm1
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14]
-; AVX512-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm6
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0]
-; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm1
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15]
-; AVX512-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm4
-; AVX512-FCP-NEXT: vmovdqa %ymm7, (%rsi)
-; AVX512-FCP-NEXT: vmovdqa %ymm8, (%rdx)
-; AVX512-FCP-NEXT: vmovdqa %ymm2, (%rcx)
-; AVX512-FCP-NEXT: vmovdqa %ymm3, (%r8)
-; AVX512-FCP-NEXT: vmovdqa %ymm6, (%r9)
-; AVX512-FCP-NEXT: vmovdqa %ymm4, (%rax)
+; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm1
+; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm2
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm0 = [4,2,4,2,4,2,4,2]
+; AVX512-FCP-NEXT: vpermd %ymm3, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,6,12,18,24,30,0,0]
+; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm5
+; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm6
+; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm7
+; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm4
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
+; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm4 = [5,3,5,3,5,3,5,3]
+; AVX512-FCP-NEXT: vpermd %ymm3, %ymm4, %ymm3
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,7,13,19,25,31,0,0]
+; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm4
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,16,22,28]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [2,8,14,20,26,0,0,0]
+; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm8
+; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm8
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,17,23,29]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [3,9,15,21,27,0,0,0]
+; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm9
+; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm9
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
+; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,2,0,6,0,2,0,6]
+; AVX512-FCP-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX512-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm2
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [20,26,0,6,12,0,0,0]
+; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm6, %zmm4
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7]
+; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,3,1,7,0,3,1,7]
+; AVX512-FCP-NEXT: # ymm4 = mem[0,1,0,1]
+; AVX512-FCP-NEXT: vpermd %ymm1, %ymm4, %ymm1
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [21,27,1,7,13,0,0,0]
+; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm6, %zmm4
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7]
+; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rsi)
+; AVX512-FCP-NEXT: vmovdqa %ymm3, (%rdx)
+; AVX512-FCP-NEXT: vmovdqa %ymm8, (%rcx)
+; AVX512-FCP-NEXT: vmovdqa %ymm9, (%r8)
+; AVX512-FCP-NEXT: vmovdqa %ymm2, (%r9)
+; AVX512-FCP-NEXT: vmovdqa %ymm1, (%rax)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: load_i32_stride6_vf8:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm0
-; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm1
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0]
-; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm4
-; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm5
-; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm6
-; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm3
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10]
-; AVX512DQ-NEXT: vpermi2d %ymm2, %ymm3, %ymm7
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,7,13,19,25,31,0,0]
-; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm3
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11]
-; AVX512DQ-NEXT: vpermi2d %ymm2, %ymm3, %ymm8
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12]
-; AVX512DQ-NEXT: vpermd %zmm6, %zmm2, %zmm2
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,8,14,20,26,0,0,0]
-; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm3
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13]
-; AVX512DQ-NEXT: vpermd %zmm6, %zmm3, %zmm3
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm6 = [3,9,15,21,27,0,0,0]
-; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm6
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0]
-; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm5, %zmm1
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14]
-; AVX512DQ-NEXT: vpermi2d %ymm0, %ymm1, %ymm6
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0]
-; AVX512DQ-NEXT: vpermi2d %zmm4, %zmm5, %zmm1
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15]
-; AVX512DQ-NEXT: vpermi2d %ymm0, %ymm1, %ymm4
-; AVX512DQ-NEXT: vmovdqa %ymm7, (%rsi)
-; AVX512DQ-NEXT: vmovdqa %ymm8, (%rdx)
-; AVX512DQ-NEXT: vmovdqa %ymm2, (%rcx)
-; AVX512DQ-NEXT: vmovdqa %ymm3, (%r8)
-; AVX512DQ-NEXT: vmovdqa %ymm6, (%r9)
-; AVX512DQ-NEXT: vmovdqa %ymm4, (%rax)
+; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm1
+; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm2
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm0 = [4,2,4,2,4,2,4,2]
+; AVX512DQ-NEXT: vpermd %ymm3, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,6,12,18,24,30,0,0]
+; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm5
+; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm6
+; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm7
+; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm5, %zmm4
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} ymm4 = [5,3,5,3,5,3,5,3]
+; AVX512DQ-NEXT: vpermd %ymm3, %ymm4, %ymm3
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,7,13,19,25,31,0,0]
+; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm5, %zmm4
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,16,22,28]
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm8 = [2,8,14,20,26,0,0,0]
+; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm5, %zmm8
+; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm4, %zmm8
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,17,23,29]
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm9 = [3,9,15,21,27,0,0,0]
+; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm5, %zmm9
+; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm4, %zmm9
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
+; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,2,0,6,0,2,0,6]
+; AVX512DQ-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX512DQ-NEXT: vpermd %ymm1, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm4 = [20,26,0,6,12,0,0,0]
+; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm6, %zmm4
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7]
+; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,3,1,7,0,3,1,7]
+; AVX512DQ-NEXT: # ymm4 = mem[0,1,0,1]
+; AVX512DQ-NEXT: vpermd %ymm1, %ymm4, %ymm1
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm4 = [21,27,1,7,13,0,0,0]
+; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm6, %zmm4
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7]
+; AVX512DQ-NEXT: vmovdqa %ymm0, (%rsi)
+; AVX512DQ-NEXT: vmovdqa %ymm3, (%rdx)
+; AVX512DQ-NEXT: vmovdqa %ymm8, (%rcx)
+; AVX512DQ-NEXT: vmovdqa %ymm9, (%r8)
+; AVX512DQ-NEXT: vmovdqa %ymm2, (%r9)
+; AVX512DQ-NEXT: vmovdqa %ymm1, (%rax)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: load_i32_stride6_vf8:
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm1
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0]
-; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm4
-; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5
-; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6
-; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10]
-; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm3, %ymm7
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,7,13,19,25,31,0,0]
-; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11]
-; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm3, %ymm8
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12]
-; AVX512DQ-FCP-NEXT: vpermd %zmm6, %zmm2, %zmm2
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,8,14,20,26,0,0,0]
-; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13]
-; AVX512DQ-FCP-NEXT: vpermd %zmm6, %zmm3, %zmm3
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [3,9,15,21,27,0,0,0]
-; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm6
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0]
-; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm1
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14]
-; AVX512DQ-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm6
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0]
-; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm1
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15]
-; AVX512DQ-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm4
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, (%rcx)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, (%r8)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, (%r9)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, (%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm2
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm0 = [4,2,4,2,4,2,4,2]
+; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,6,12,18,24,30,0,0]
+; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm5
+; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm6
+; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm7
+; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm4
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm4 = [5,3,5,3,5,3,5,3]
+; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm4, %ymm3
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,7,13,19,25,31,0,0]
+; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm4
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,16,22,28]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [2,8,14,20,26,0,0,0]
+; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm8
+; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm8
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,17,23,29]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [3,9,15,21,27,0,0,0]
+; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm9
+; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm9
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,2,0,6,0,2,0,6]
+; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [20,26,0,6,12,0,0,0]
+; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm6, %zmm4
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7]
+; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,3,1,7,0,3,1,7]
+; AVX512DQ-FCP-NEXT: # ymm4 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm4, %ymm1
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [21,27,1,7,13,0,0,0]
+; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm6, %zmm4
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rsi)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, (%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, (%rcx)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, (%r8)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, (%r9)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, (%rax)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
; AVX512BW-LABEL: load_i32_stride6_vf8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm0
-; AVX512BW-NEXT: vmovdqa 160(%rdi), %ymm1
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0]
-; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm4
-; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm5
-; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm6
-; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm3
-; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10]
-; AVX512BW-NEXT: vpermi2d %ymm2, %ymm3, %ymm7
-; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,7,13,19,25,31,0,0]
-; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm3
-; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11]
-; AVX512BW-NEXT: vpermi2d %ymm2, %ymm3, %ymm8
-; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12]
-; AVX512BW-NEXT: vpermd %zmm6, %zmm2, %zmm2
-; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,8,14,20,26,0,0,0]
-; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm3
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
-; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13]
-; AVX512BW-NEXT: vpermd %zmm6, %zmm3, %zmm3
-; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [3,9,15,21,27,0,0,0]
-; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm6
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7]
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0]
-; AVX512BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm1
-; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14]
-; AVX512BW-NEXT: vpermi2d %ymm0, %ymm1, %ymm6
-; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0]
-; AVX512BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm1
-; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15]
-; AVX512BW-NEXT: vpermi2d %ymm0, %ymm1, %ymm4
-; AVX512BW-NEXT: vmovdqa %ymm7, (%rsi)
-; AVX512BW-NEXT: vmovdqa %ymm8, (%rdx)
-; AVX512BW-NEXT: vmovdqa %ymm2, (%rcx)
-; AVX512BW-NEXT: vmovdqa %ymm3, (%r8)
-; AVX512BW-NEXT: vmovdqa %ymm6, (%r9)
-; AVX512BW-NEXT: vmovdqa %ymm4, (%rax)
+; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm1
+; AVX512BW-NEXT: vmovdqa 160(%rdi), %ymm2
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm0 = [4,2,4,2,4,2,4,2]
+; AVX512BW-NEXT: vpermd %ymm3, %ymm0, %ymm0
+; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,6,12,18,24,30,0,0]
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm5
+; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm6
+; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm7
+; AVX512BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm4
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm4 = [5,3,5,3,5,3,5,3]
+; AVX512BW-NEXT: vpermd %ymm3, %ymm4, %ymm3
+; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,7,13,19,25,31,0,0]
+; AVX512BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm4
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
+; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,16,22,28]
+; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [2,8,14,20,26,0,0,0]
+; AVX512BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm8
+; AVX512BW-NEXT: vpermt2d %zmm7, %zmm4, %zmm8
+; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,17,23,29]
+; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm9 = [3,9,15,21,27,0,0,0]
+; AVX512BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm9
+; AVX512BW-NEXT: vpermt2d %zmm7, %zmm4, %zmm9
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,2,0,6,0,2,0,6]
+; AVX512BW-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX512BW-NEXT: vpermd %ymm1, %ymm2, %ymm2
+; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [20,26,0,6,12,0,0,0]
+; AVX512BW-NEXT: vpermi2d %zmm5, %zmm6, %zmm4
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7]
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,3,1,7,0,3,1,7]
+; AVX512BW-NEXT: # ymm4 = mem[0,1,0,1]
+; AVX512BW-NEXT: vpermd %ymm1, %ymm4, %ymm1
+; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [21,27,1,7,13,0,0,0]
+; AVX512BW-NEXT: vpermi2d %zmm5, %zmm6, %zmm4
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7]
+; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi)
+; AVX512BW-NEXT: vmovdqa %ymm3, (%rdx)
+; AVX512BW-NEXT: vmovdqa %ymm8, (%rcx)
+; AVX512BW-NEXT: vmovdqa %ymm9, (%r8)
+; AVX512BW-NEXT: vmovdqa %ymm2, (%r9)
+; AVX512BW-NEXT: vmovdqa %ymm1, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BW-FCP-LABEL: load_i32_stride6_vf8:
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm0
-; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm1
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0]
-; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm4
-; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5
-; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6
-; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10]
-; AVX512BW-FCP-NEXT: vpermi2d %ymm2, %ymm3, %ymm7
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,7,13,19,25,31,0,0]
-; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11]
-; AVX512BW-FCP-NEXT: vpermi2d %ymm2, %ymm3, %ymm8
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12]
-; AVX512BW-FCP-NEXT: vpermd %zmm6, %zmm2, %zmm2
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,8,14,20,26,0,0,0]
-; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13]
-; AVX512BW-FCP-NEXT: vpermd %zmm6, %zmm3, %zmm3
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [3,9,15,21,27,0,0,0]
-; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm6
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0]
-; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm1
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14]
-; AVX512BW-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm6
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0]
-; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm1
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15]
-; AVX512BW-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm4
-; AVX512BW-FCP-NEXT: vmovdqa %ymm7, (%rsi)
-; AVX512BW-FCP-NEXT: vmovdqa %ymm8, (%rdx)
-; AVX512BW-FCP-NEXT: vmovdqa %ymm2, (%rcx)
-; AVX512BW-FCP-NEXT: vmovdqa %ymm3, (%r8)
-; AVX512BW-FCP-NEXT: vmovdqa %ymm6, (%r9)
-; AVX512BW-FCP-NEXT: vmovdqa %ymm4, (%rax)
+; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm1
+; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm2
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm0 = [4,2,4,2,4,2,4,2]
+; AVX512BW-FCP-NEXT: vpermd %ymm3, %ymm0, %ymm0
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,6,12,18,24,30,0,0]
+; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm5
+; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm6
+; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm7
+; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm4
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
+; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm4 = [5,3,5,3,5,3,5,3]
+; AVX512BW-FCP-NEXT: vpermd %ymm3, %ymm4, %ymm3
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,7,13,19,25,31,0,0]
+; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm4
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,16,22,28]
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [2,8,14,20,26,0,0,0]
+; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm8
+; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm8
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,17,23,29]
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [3,9,15,21,27,0,0,0]
+; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm9
+; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm9
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
+; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,2,0,6,0,2,0,6]
+; AVX512BW-FCP-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX512BW-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm2
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [20,26,0,6,12,0,0,0]
+; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm6, %zmm4
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7]
+; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,3,1,7,0,3,1,7]
+; AVX512BW-FCP-NEXT: # ymm4 = mem[0,1,0,1]
+; AVX512BW-FCP-NEXT: vpermd %ymm1, %ymm4, %ymm1
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [21,27,1,7,13,0,0,0]
+; AVX512BW-FCP-NEXT: vpermi2d %zmm5, %zmm6, %zmm4
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7]
+; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rsi)
+; AVX512BW-FCP-NEXT: vmovdqa %ymm3, (%rdx)
+; AVX512BW-FCP-NEXT: vmovdqa %ymm8, (%rcx)
+; AVX512BW-FCP-NEXT: vmovdqa %ymm9, (%r8)
+; AVX512BW-FCP-NEXT: vmovdqa %ymm2, (%r9)
+; AVX512BW-FCP-NEXT: vmovdqa %ymm1, (%rax)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: load_i32_stride6_vf8:
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm0
-; AVX512DQ-BW-NEXT: vmovdqa 160(%rdi), %ymm1
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0]
-; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm4
-; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm5
-; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm6
-; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm3
-; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10]
-; AVX512DQ-BW-NEXT: vpermi2d %ymm2, %ymm3, %ymm7
-; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,7,13,19,25,31,0,0]
-; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm3
-; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11]
-; AVX512DQ-BW-NEXT: vpermi2d %ymm2, %ymm3, %ymm8
-; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12]
-; AVX512DQ-BW-NEXT: vpermd %zmm6, %zmm2, %zmm2
-; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,8,14,20,26,0,0,0]
-; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm3
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
-; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13]
-; AVX512DQ-BW-NEXT: vpermd %zmm6, %zmm3, %zmm3
-; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [3,9,15,21,27,0,0,0]
-; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm6
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
-; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0]
-; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm1
-; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14]
-; AVX512DQ-BW-NEXT: vpermi2d %ymm0, %ymm1, %ymm6
-; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0]
-; AVX512DQ-BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm1
-; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15]
-; AVX512DQ-BW-NEXT: vpermi2d %ymm0, %ymm1, %ymm4
-; AVX512DQ-BW-NEXT: vmovdqa %ymm7, (%rsi)
-; AVX512DQ-BW-NEXT: vmovdqa %ymm8, (%rdx)
-; AVX512DQ-BW-NEXT: vmovdqa %ymm2, (%rcx)
-; AVX512DQ-BW-NEXT: vmovdqa %ymm3, (%r8)
-; AVX512DQ-BW-NEXT: vmovdqa %ymm6, (%r9)
-; AVX512DQ-BW-NEXT: vmovdqa %ymm4, (%rax)
+; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm1
+; AVX512DQ-BW-NEXT: vmovdqa 160(%rdi), %ymm2
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm0 = [4,2,4,2,4,2,4,2]
+; AVX512DQ-BW-NEXT: vpermd %ymm3, %ymm0, %ymm0
+; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,6,12,18,24,30,0,0]
+; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm5
+; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm6
+; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm7
+; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm4
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm4 = [5,3,5,3,5,3,5,3]
+; AVX512DQ-BW-NEXT: vpermd %ymm3, %ymm4, %ymm3
+; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,7,13,19,25,31,0,0]
+; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm4
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
+; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,16,22,28]
+; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [2,8,14,20,26,0,0,0]
+; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm8
+; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm4, %zmm8
+; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,17,23,29]
+; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm9 = [3,9,15,21,27,0,0,0]
+; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm9
+; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm4, %zmm9
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
+; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,2,0,6,0,2,0,6]
+; AVX512DQ-BW-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX512DQ-BW-NEXT: vpermd %ymm1, %ymm2, %ymm2
+; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [20,26,0,6,12,0,0,0]
+; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm6, %zmm4
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7]
+; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,3,1,7,0,3,1,7]
+; AVX512DQ-BW-NEXT: # ymm4 = mem[0,1,0,1]
+; AVX512DQ-BW-NEXT: vpermd %ymm1, %ymm4, %ymm1
+; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [21,27,1,7,13,0,0,0]
+; AVX512DQ-BW-NEXT: vpermi2d %zmm5, %zmm6, %zmm4
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7]
+; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rsi)
+; AVX512DQ-BW-NEXT: vmovdqa %ymm3, (%rdx)
+; AVX512DQ-BW-NEXT: vmovdqa %ymm8, (%rcx)
+; AVX512DQ-BW-NEXT: vmovdqa %ymm9, (%r8)
+; AVX512DQ-BW-NEXT: vmovdqa %ymm2, (%r9)
+; AVX512DQ-BW-NEXT: vmovdqa %ymm1, (%rax)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: load_i32_stride6_vf8:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm1
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,6,12,18,24,30,0,0]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm4
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm5
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm6
-; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,1,2,3,4,5,12,10]
-; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm2, %ymm3, %ymm7
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,7,13,19,25,31,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,1,2,3,4,5,13,11]
-; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm2, %ymm3, %ymm8
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,2,8,0,0,6,12]
-; AVX512DQ-BW-FCP-NEXT: vpermd %zmm6, %zmm2, %zmm2
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [2,8,14,20,26,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,3,9,0,1,7,13]
-; AVX512DQ-BW-FCP-NEXT: vpermd %zmm6, %zmm3, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [3,9,15,21,27,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm4, %zmm6
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [20,26,0,6,12,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,1,2,3,4,10,8,14]
-; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm6
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [21,27,1,7,13,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm5, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,11,9,15]
-; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm4
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm7, (%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm8, (%rdx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, (%rcx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm3, (%r8)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm6, (%r9)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm4, (%rax)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm2
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm0 = [4,2,4,2,4,2,4,2]
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm3, %ymm0, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,6,12,18,24,30,0,0]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm5
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm6
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm7
+; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm4 = [5,3,5,3,5,3,5,3]
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm3, %ymm4, %ymm3
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,7,13,19,25,31,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,16,22,28]
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [2,8,14,20,26,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm8
+; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm8
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,1,2,3,4,17,23,29]
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [3,9,15,21,27,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm5, %zmm9
+; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm9
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,2,0,6,0,2,0,6]
+; AVX512DQ-BW-FCP-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm2
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [20,26,0,6,12,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm6, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,3,1,7,0,3,1,7]
+; AVX512DQ-BW-FCP-NEXT: # ymm4 = mem[0,1,0,1]
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm1, %ymm4, %ymm1
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [21,27,1,7,13,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm5, %zmm6, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rsi)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm3, (%rdx)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm8, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm9, (%r8)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, (%r9)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm1, (%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%wide.vec = load <48 x i32>, ptr %in.vec, align 64
@@ -1953,161 +1987,166 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
;
; AVX-LABEL: load_i32_stride6_vf16:
; AVX: # %bb.0:
-; AVX-NEXT: subq $264, %rsp # imm = 0x108
-; AVX-NEXT: vmovaps 224(%rdi), %ymm5
-; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 192(%rdi), %ymm10
+; AVX-NEXT: subq $296, %rsp # imm = 0x128
+; AVX-NEXT: vmovaps 224(%rdi), %ymm10
; AVX-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 160(%rdi), %ymm0
-; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 128(%rdi), %ymm7
-; AVX-NEXT: vmovaps 96(%rdi), %ymm9
-; AVX-NEXT: vmovaps 64(%rdi), %ymm15
-; AVX-NEXT: vmovaps 32(%rdi), %ymm3
-; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps (%rdi), %ymm2
-; AVX-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7]
-; AVX-NEXT: vextractf128 $1, %ymm2, %xmm6
-; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1],xmm6[2,3]
-; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm6[0,3]
-; AVX-NEXT: vinsertf128 $1, 96(%rdi), %ymm15, %ymm8
-; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm9[2,0],ymm8[0,0],ymm9[6,4],ymm8[4,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0],ymm15[2,2],ymm4[6,4],ymm15[6,6]
-; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7]
-; AVX-NEXT: vmovapd %ymm7, %ymm1
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],ymm0[0,1]
-; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm7[0],ymm1[1],ymm7[3],ymm1[2]
-; AVX-NEXT: vmovapd %ymm1, %ymm13
-; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm4[6,7]
+; AVX-NEXT: vmovaps 192(%rdi), %ymm14
+; AVX-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovapd 160(%rdi), %ymm4
+; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovapd 128(%rdi), %ymm13
+; AVX-NEXT: vmovaps 96(%rdi), %ymm15
+; AVX-NEXT: vmovaps 64(%rdi), %ymm12
+; AVX-NEXT: vmovaps 32(%rdi), %ymm1
+; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovaps (%rdi), %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm5[4,5],ymm10[6,7]
-; AVX-NEXT: vextractf128 $1, %ymm1, %xmm0
-; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3]
-; AVX-NEXT: vshufps {{.*#+}} xmm12 = xmm3[0,2],xmm0[0,3]
-; AVX-NEXT: vmovaps 288(%rdi), %ymm11
-; AVX-NEXT: vmovaps 256(%rdi), %ymm10
-; AVX-NEXT: vinsertf128 $1, 288(%rdi), %ymm10, %ymm5
-; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm11[2,0],ymm5[0,0],ymm11[6,4],ymm5[4,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0],ymm10[2,2],ymm14[6,4],ymm10[6,6]
-; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1,2],ymm14[3,4,5,6,7]
-; AVX-NEXT: vmovapd 352(%rdi), %ymm3
-; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 320(%rdi), %ymm12
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm12[2,3],ymm3[0,1]
-; AVX-NEXT: vshufpd {{.*#+}} ymm14 = ymm3[0],ymm12[1],ymm3[3],ymm12[2]
-; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm14[6,7]
-; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0],xmm6[3,0]
-; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm6[1,3]
-; AVX-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm9[3,0],ymm8[1,0],ymm9[7,4],ymm8[5,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0],ymm15[2,3],ymm4[6,4],ymm15[6,7]
-; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3,4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT: vmovaps (%rdi), %xmm11
+; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm11[0,1],xmm1[2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,3]
+; AVX-NEXT: vinsertf128 $1, 96(%rdi), %ymm12, %ymm8
+; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm15[2,0],ymm8[0,0],ymm15[6,4],ymm8[4,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0],ymm12[2,2],ymm5[6,4],ymm12[6,6]
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3,4,5,6,7]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm13[2,3],ymm4[0,1]
+; AVX-NEXT: vshufpd {{.*#+}} ymm9 = ymm7[0],ymm13[1],ymm7[3],ymm13[2]
+; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,1,2,0,4,5,6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm9[6,7]
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm10[4,5],ymm14[6,7]
+; AVX-NEXT: vextractf128 $1, %ymm2, %xmm0
+; AVX-NEXT: vmovaps 192(%rdi), %xmm5
+; AVX-NEXT: vblendps {{.*#+}} xmm6 = xmm5[0,1],xmm0[2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm9 = xmm6[0,2],xmm0[0,3]
+; AVX-NEXT: vmovaps 288(%rdi), %ymm6
+; AVX-NEXT: vmovaps 256(%rdi), %ymm14
+; AVX-NEXT: vinsertf128 $1, 288(%rdi), %ymm14, %ymm4
+; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm6[2,0],ymm4[0,0],ymm6[6,4],ymm4[4,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0],ymm14[2,2],ymm10[6,4],ymm14[6,6]
+; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1,2],ymm10[3,4,5,6,7]
+; AVX-NEXT: vmovapd 352(%rdi), %ymm2
+; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovapd 320(%rdi), %ymm10
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm10[2,3],ymm2[0,1]
+; AVX-NEXT: vshufpd {{.*#+}} ymm9 = ymm2[0],ymm10[1],ymm2[3],ymm10[2]
+; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,1,2,0,4,5,6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm9[6,7]
+; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm1[3,0],xmm11[1,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,0],xmm1[1,3]
+; AVX-NEXT: vmovups %ymm15, (%rsp) # 32-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm15[3,0],ymm8[1,0],ymm15[7,4],ymm8[5,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0],ymm12[2,3],ymm3[6,4],ymm12[6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7]
; AVX-NEXT: vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm7[3,1],ymm13[1,3],ymm7[7,5],ymm13[5,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
-; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[3,0]
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[1,3]
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm11[3,0],ymm5[1,0],ymm11[7,4],ymm5[5,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm10[2,3],ymm1[6,4],ymm10[6,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm7[3,1],ymm13[1,3],ymm7[7,5],ymm13[5,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
+; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,0],xmm5[1,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,0],xmm0[1,3]
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,0],ymm4[1,0],ymm6[7,4],ymm4[5,4]
+; AVX-NEXT: vmovaps %ymm6, %ymm4
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm14[2,3],ymm1[6,4],ymm14[6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm3[3,1],ymm12[1,3],ymm3[7,5],ymm12[5,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[3,1],ymm10[1,3],ymm2[7,5],ymm10[5,7]
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vblendps $12, (%rsp), %ymm0, %ymm2 # 32-byte Folded Reload
-; AVX-NEXT: # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm9[2,1],ymm15[2,0],ymm9[6,5],ymm15[6,4]
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1]
-; AVX-NEXT: vextractf128 $1, %ymm2, %xmm7
-; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm2[2,0],xmm7[2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm13[0,1,2,3],ymm9[4,5],ymm13[6,7]
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm3[2,3,0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm4[0,0],ymm3[2,0],ymm4[4,4],ymm3[6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm5[5,6,7]
-; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
-; AVX-NEXT: # ymm8 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm11[2,1],ymm10[2,0],ymm11[6,5],ymm10[6,4]
+; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX-NEXT: # ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm15[2,1],ymm12[2,0],ymm15[6,5],ymm12[6,4]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1]
-; AVX-NEXT: vextractf128 $1, %ymm8, %xmm14
-; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm8[2,0],xmm14[2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm12[0,1,2,3],ymm13[4,5],ymm12[6,7]
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm6[2,3,0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm0[0,0],ymm6[2,0],ymm0[4,4],ymm6[6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm5[5,6,7]
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm0[2,0],xmm3[2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm13[0,1,2,3],ymm6[4,5],ymm13[6,7]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm7[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,0],ymm7[2,0],ymm8[4,4],ymm7[6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,1],ymm3[3,1],ymm4[4,5],ymm3[7,5]
-; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,1],xmm7[3,3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm1[3,1],ymm15[2,1],ymm1[7,5],ymm15[6,5]
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,0,1]
-; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3,4,5,6,7]
-; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7]
-; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm6[3,1],ymm0[4,5],ymm6[7,5]
-; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm8[3,1],xmm14[3,3]
-; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm11[3,1],ymm10[2,1],ymm11[7,5],ymm10[6,5]
+; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX-NEXT: # ymm1 = ymm1[0,1],mem[2,3],ymm1[4,5,6,7]
+; AVX-NEXT: vmovaps %ymm4, %ymm2
+; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,1],ymm14[2,0],ymm4[6,5],ymm14[6,4]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,0,1]
-; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3,4,5,6,7]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7]
+; AVX-NEXT: vextractf128 $1, %ymm1, %xmm9
+; AVX-NEXT: vshufps {{.*#+}} xmm11 = xmm1[2,0],xmm9[2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1,2],ymm4[3,4,5,6,7]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX-NEXT: vblendps {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm13[4,5],ymm10[6,7]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm15[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm11[0,0],ymm15[2,0],ymm11[4,4],ymm15[6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7]
+; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm8[0,1],ymm7[3,1],ymm8[4,5],ymm7[7,5]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,1],xmm3[3,3]
+; AVX-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,1],ymm12[2,1],ymm4[7,5],ymm12[6,5]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,0,1]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
-; AVX-NEXT: # ymm9 = ymm9[0,1],mem[2,3],ymm9[4,5,6,7]
-; AVX-NEXT: vmovaps 32(%rdi), %xmm3
-; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm3[2,2,3,3]
-; AVX-NEXT: vmovaps 16(%rdi), %xmm5
-; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3]
-; AVX-NEXT: vmovapd 80(%rdi), %xmm6
-; AVX-NEXT: vshufpd {{.*#+}} ymm7 = ymm6[1],ymm15[0],ymm6[2],ymm15[3]
-; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm1[0,1],ymm7[2,0],ymm1[4,5],ymm7[6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm7[2,3,4,5,6,7]
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm9[2,3,0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm9[2,0],ymm7[0,0],ymm9[6,4],ymm7[4,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm7[0,2],ymm8[2,0],ymm7[4,6],ymm8[6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm8[5,6,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm11[0,1],ymm15[3,1],ymm11[4,5],ymm15[7,5]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,1],xmm9[3,3]
+; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm2[3,1],ymm14[2,1],ymm2[7,5],ymm14[6,5]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,0,1]
+; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1],ymm12[2,3],ymm13[4,5,6,7]
-; AVX-NEXT: vmovaps 224(%rdi), %xmm12
-; AVX-NEXT: vshufps {{.*#+}} xmm14 = xmm12[2,2,3,3]
-; AVX-NEXT: vmovaps 208(%rdi), %xmm0
-; AVX-NEXT: vblendps {{.*#+}} xmm14 = xmm0[0],xmm14[1],xmm0[2,3]
-; AVX-NEXT: vmovapd 272(%rdi), %xmm1
-; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm1[1],ymm10[0],ymm1[2],ymm10[3]
-; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm11[0,1],ymm4[2,0],ymm11[4,5],ymm4[6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm14[0,1],ymm4[2,3,4,5,6,7]
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm8[2,3,0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm8[2,0],ymm14[0,0],ymm8[6,4],ymm14[4,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm14[0,2],ymm2[2,0],ymm14[4,6],ymm2[6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm9[3,0],ymm7[1,0],ymm9[7,4],ymm7[5,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm7[0,3],ymm4[2,0],ymm7[4,7],ymm4[6,4]
-; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
-; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm6[3,1],ymm15[1,3],ymm6[7,5],ymm15[5,7]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm6[1,1],ymm5[2,0],ymm6[5,5],ymm5[6,4]
-; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,3,2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3,4,5,6,7]
-; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm8[3,0],ymm14[1,0],ymm8[7,4],ymm14[5,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm14[0,3],ymm4[2,0],ymm14[4,7],ymm4[6,4]
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm12[2,3]
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,1],ymm10[1,3],ymm1[7,5],ymm10[5,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm11[1,1],ymm1[2,0],ymm11[5,5],ymm1[6,4]
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7]
+; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload
+; AVX-NEXT: # ymm7 = ymm6[0,1],mem[2,3],ymm6[4,5,6,7]
+; AVX-NEXT: vmovaps 32(%rdi), %xmm14
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm14[2,2,3,3]
+; AVX-NEXT: vmovaps 16(%rdi), %xmm15
+; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm15[0],xmm0[1],xmm15[2,3]
+; AVX-NEXT: vmovaps 64(%rdi), %xmm3
+; AVX-NEXT: vmovaps 80(%rdi), %xmm2
+; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm2[2,0],xmm3[0,0]
+; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm4[0,1],ymm8[2,0],ymm4[4,5],ymm8[6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm8[2,3,4,5,6,7]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm7[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm7[2,0],ymm9[0,0],ymm7[6,4],ymm9[4,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,2],ymm8[2,0],ymm9[4,6],ymm8[6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm5[0,1,2,3,4],ymm8[5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm13[0,1],ymm10[2,3],ymm13[4,5,6,7]
+; AVX-NEXT: vmovaps 224(%rdi), %xmm10
+; AVX-NEXT: vshufps {{.*#+}} xmm11 = xmm10[2,2,3,3]
+; AVX-NEXT: vmovaps 208(%rdi), %xmm13
+; AVX-NEXT: vblendps {{.*#+}} xmm11 = xmm13[0],xmm11[1],xmm13[2,3]
+; AVX-NEXT: vmovaps 256(%rdi), %xmm6
+; AVX-NEXT: vmovaps 272(%rdi), %xmm0
+; AVX-NEXT: vshufps {{.*#+}} xmm12 = xmm0[2,0],xmm6[0,0]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm1[0,1],ymm12[2,0],ymm1[4,5],ymm12[6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3,4,5,6,7]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm5[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm5[2,0],ymm12[0,0],ymm5[6,4],ymm12[4,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm12[0,2],ymm4[2,0],ymm12[4,6],ymm4[6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1,2,3,4],ymm4[5,6,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm7[3,0],ymm9[1,0],ymm7[7,4],ymm9[5,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm9[0,3],ymm7[2,0],ymm9[4,7],ymm7[6,4]
+; AVX-NEXT: vblendps {{.*#+}} xmm9 = xmm15[0,1],xmm14[2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,0],xmm3[1,0]
+; AVX-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm3[1,1],ymm2[2,0],ymm3[5,5],ymm2[6,4]
+; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm9[1,3,2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm5[3,0],ymm12[1,0],ymm5[7,4],ymm12[5,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm12[0,3],ymm3[2,0],ymm12[4,7],ymm3[6,4]
+; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm13[0,1],xmm10[2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],xmm6[1,0]
+; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1],ymm0[2,0],ymm1[5,5],ymm0[6,4]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm5[1,3,2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm1, 32(%rsi)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
@@ -2118,19 +2157,18 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovaps %ymm1, (%rdx)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm1, 32(%rcx)
-; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm1, (%rcx)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm1, 32(%r8)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm1, (%r8)
-; AVX-NEXT: vmovaps %ymm2, 32(%r9)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm1, (%r9)
+; AVX-NEXT: vmovaps %ymm4, 32(%r9)
+; AVX-NEXT: vmovaps %ymm8, (%r9)
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT: vmovaps %ymm0, 32(%rax)
-; AVX-NEXT: vmovaps %ymm3, (%rax)
-; AVX-NEXT: addq $264, %rsp # imm = 0x108
+; AVX-NEXT: vmovaps %ymm2, (%rax)
+; AVX-NEXT: addq $296, %rsp # imm = 0x128
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
@@ -4000,387 +4038,399 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
;
; AVX-LABEL: load_i32_stride6_vf32:
; AVX: # %bb.0:
-; AVX-NEXT: subq $1048, %rsp # imm = 0x418
+; AVX-NEXT: subq $1032, %rsp # imm = 0x408
; AVX-NEXT: vmovaps 416(%rdi), %ymm7
; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 384(%rdi), %ymm9
-; AVX-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 160(%rdi), %ymm5
-; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 128(%rdi), %ymm6
-; AVX-NEXT: vmovaps 96(%rdi), %ymm2
-; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 64(%rdi), %ymm3
-; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 32(%rdi), %ymm0
+; AVX-NEXT: vmovaps 384(%rdi), %ymm8
+; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovapd 160(%rdi), %ymm4
+; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovapd 128(%rdi), %ymm5
+; AVX-NEXT: vmovaps 96(%rdi), %ymm12
+; AVX-NEXT: vmovaps 64(%rdi), %ymm6
+; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovaps 32(%rdi), %ymm1
+; AVX-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill
+; AVX-NEXT: vmovaps (%rdi), %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps (%rdi), %ymm1
-; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
-; AVX-NEXT: vextractf128 $1, %ymm8, %xmm4
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm8[0,1],xmm4[2,3]
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,3]
-; AVX-NEXT: vinsertf128 $1, 96(%rdi), %ymm3, %ymm1
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX-NEXT: vmovaps (%rdi), %xmm0
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,3]
+; AVX-NEXT: vinsertf128 $1, 96(%rdi), %ymm6, %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm1[0,0],ymm2[6,4],ymm1[4,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm3[2,2],ymm1[6,4],ymm3[6,6]
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm12[2,0],ymm1[0,0],ymm12[6,4],ymm1[4,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm6[2,2],ymm1[6,4],ymm6[6,6]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm6[2,3],ymm5[0,1]
+; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm4[0,1]
; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[3],ymm6[2]
+; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[3],ymm5[2]
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm9[0,1,2,3],ymm7[4,5],ymm9[6,7]
-; AVX-NEXT: vextractf128 $1, %ymm11, %xmm3
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm11[0,1],xmm3[2,3]
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,3]
-; AVX-NEXT: vmovaps 480(%rdi), %ymm1
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm7[4,5],ymm8[6,7]
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-NEXT: vmovaps 384(%rdi), %xmm7
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm7[0,1],xmm2[2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,3]
+; AVX-NEXT: vmovaps 480(%rdi), %ymm6
+; AVX-NEXT: vmovaps 448(%rdi), %ymm5
+; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vinsertf128 $1, 480(%rdi), %ymm5, %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 448(%rdi), %ymm2
-; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vinsertf128 $1, 480(%rdi), %ymm2, %ymm9
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm9[0,0],ymm1[6,4],ymm9[4,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[2,2],ymm1[6,4],ymm2[6,6]
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm6[2,0],ymm1[0,0],ymm6[6,4],ymm1[4,4]
+; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm5[2,2],ymm1[6,4],ymm5[6,6]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX-NEXT: vmovapd 544(%rdi), %ymm1
; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 512(%rdi), %ymm2
-; AVX-NEXT: vmovupd %ymm2, (%rsp) # 32-byte Spill
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1]
+; AVX-NEXT: vmovapd 512(%rdi), %ymm4
+; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm1[0,1]
; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[3],ymm2[2]
+; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[3],ymm4[2]
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 224(%rdi), %ymm0
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 192(%rdi), %ymm1
+; AVX-NEXT: vmovaps 224(%rdi), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
-; AVX-NEXT: vextractf128 $1, %ymm10, %xmm1
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm10[0,1],xmm1[2,3]
+; AVX-NEXT: vmovaps 192(%rdi), %ymm0
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT: vmovaps 192(%rdi), %xmm0
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,3]
-; AVX-NEXT: vmovaps 288(%rdi), %ymm2
-; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovaps 288(%rdi), %ymm4
+; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 256(%rdi), %ymm5
; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vinsertf128 $1, 288(%rdi), %ymm5, %ymm6
-; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm6[0,0],ymm2[6,4],ymm6[4,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm5[2,2],ymm2[6,4],ymm5[6,6]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7]
-; AVX-NEXT: vmovapd 352(%rdi), %ymm2
-; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 320(%rdi), %ymm5
-; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm5[2,3],ymm2[0,1]
-; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[3],ymm5[2]
-; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
+; AVX-NEXT: vinsertf128 $1, 288(%rdi), %ymm5, %ymm8
+; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0],ymm8[0,0],ymm4[6,4],ymm8[4,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0],ymm5[2,2],ymm4[6,4],ymm5[6,6]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7]
+; AVX-NEXT: vmovapd 352(%rdi), %ymm4
+; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovapd 320(%rdi), %ymm11
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm11[2,3],ymm4[0,1]
+; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm14[0],ymm11[1],ymm14[3],ymm11[2]
+; AVX-NEXT: vmovupd %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 608(%rdi), %ymm0
+; AVX-NEXT: vmovaps 608(%rdi), %ymm4
+; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovaps 576(%rdi), %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 576(%rdi), %ymm2
-; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7]
-; AVX-NEXT: vextractf128 $1, %ymm7, %xmm2
-; AVX-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1],xmm2[2,3]
-; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,2],xmm2[0,3]
-; AVX-NEXT: vmovaps 672(%rdi), %ymm5
-; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7]
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX-NEXT: vmovaps 576(%rdi), %xmm10
+; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm10[0,1],xmm5[2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,2],xmm5[0,3]
+; AVX-NEXT: vmovaps 672(%rdi), %ymm8
+; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 640(%rdi), %ymm0
-; AVX-NEXT: vinsertf128 $1, 672(%rdi), %ymm0, %ymm15
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm5[2,0],ymm15[0,0],ymm5[6,4],ymm15[4,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0],ymm0[2,2],ymm14[6,4],ymm0[6,6]
-; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm6[0,1,2],ymm14[3,4,5,6,7]
-; AVX-NEXT: vmovapd 736(%rdi), %ymm0
+; AVX-NEXT: vinsertf128 $1, 672(%rdi), %ymm0, %ymm9
+; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm8[2,0],ymm9[0,0],ymm8[6,4],ymm9[4,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm0[2,2],ymm15[6,4],ymm0[6,6]
+; AVX-NEXT: vblendps {{.*#+}} ymm15 = ymm4[0,1,2],ymm15[3,4,5,6,7]
+; AVX-NEXT: vmovapd 736(%rdi), %ymm4
+; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovapd 704(%rdi), %ymm0
; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 704(%rdi), %ymm5
-; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm5[2,3],ymm0[0,1]
-; AVX-NEXT: vshufpd {{.*#+}} ymm13 = ymm12[0],ymm5[1],ymm12[3],ymm5[2]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm0[2,3],ymm4[0,1]
+; AVX-NEXT: vshufpd {{.*#+}} ymm13 = ymm8[0],ymm0[1],ymm8[3],ymm0[2]
; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm13[6,7]
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm8[1,0],xmm4[3,0]
-; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm8[0,2],xmm4[1,3]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX-NEXT: vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm8 # 32-byte Folded Reload
-; AVX-NEXT: # ymm8 = ymm13[3,0],mem[1,0],ymm13[7,4],mem[5,4]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0],ymm14[2,3],ymm8[6,4],ymm14[6,7]
-; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm8[3,4,5,6,7]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5],ymm13[6,7]
+; AVX-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm13 # 16-byte Folded Reload
+; AVX-NEXT: # xmm13 = xmm3[3,0],mem[1,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm13[2,0],xmm3[1,3]
+; AVX-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm13 # 32-byte Folded Reload
+; AVX-NEXT: # ymm13 = ymm12[3,0],mem[1,0],ymm12[7,4],mem[5,4]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0],ymm4[2,3],ymm13[6,4],ymm4[6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm13[3,4,5,6,7]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm0[3,1],ymm5[1,3],ymm0[7,5],ymm5[5,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm8[6,7]
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm11[1,0],xmm3[3,0]
-; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm4[0,2],xmm3[1,3]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm8[3,0],ymm9[1,0],ymm8[7,4],ymm9[5,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm0[3,1],ymm15[1,3],ymm0[7,5],ymm15[5,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm13[6,7]
+; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm2[3,0],xmm7[1,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm3[2,0],xmm2[1,3]
+; AVX-NEXT: vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm3 # 32-byte Folded Reload
+; AVX-NEXT: # ymm3 = ymm6[3,0],mem[1,0],ymm6[7,4],mem[5,4]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0],ymm6[2,3],ymm4[6,4],ymm6[6,7]
-; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7]
-; AVX-NEXT: vmovups (%rsp), %ymm9 # 32-byte Reload
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm0[3,1],ymm9[1,3],ymm0[7,5],ymm9[5,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm4[6,7]
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm10[1,0],xmm1[3,0]
-; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm3[0,2],xmm1[1,3]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
-; AVX-NEXT: # ymm3 = ymm0[3,0],mem[1,0],ymm0[7,4],mem[5,4]
-; AVX-NEXT: vshufps $226, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
-; AVX-NEXT: # ymm3 = ymm3[2,0],mem[2,3],ymm3[6,4],mem[6,7]
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0],ymm6[2,3],ymm3[6,4],ymm6[6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
-; AVX-NEXT: # ymm3 = ymm0[3,1],mem[1,3],ymm0[7,5],mem[5,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm0[3,1],ymm7[1,3],ymm0[7,5],ymm7[5,7]
; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm3[6,7]
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm7[1,0],xmm2[3,0]
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm2[1,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
+; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload
+; AVX-NEXT: # xmm2 = xmm1[3,0],mem[1,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,0],xmm1[1,3]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT: vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
+; AVX-NEXT: # ymm2 = ymm0[3,0],mem[1,0],ymm0[7,4],mem[5,4]
+; AVX-NEXT: vshufps $226, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX-NEXT: # ymm2 = ymm2[2,0],mem[2,3],ymm2[6,4],mem[6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm14[3,1],ymm11[1,3],ymm14[7,5],ymm11[5,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
+; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm5[3,0],xmm10[1,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,0],xmm5[1,3]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm14[3,0],ymm9[1,0],ymm14[7,4],ymm9[5,4]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm10[3,0],ymm15[1,0],ymm10[7,4],ymm15[5,4]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm11[2,3],ymm1[6,4],ymm11[6,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm10[2,3],ymm1[6,4],ymm10[6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm12[3,1],ymm15[1,3],ymm12[7,5],ymm15[5,7]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm8[3,1],ymm13[1,3],ymm8[7,5],ymm13[5,7]
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
-; AVX-NEXT: # ymm4 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm13[2,1],ymm14[2,0],ymm13[6,5],ymm14[6,4]
+; AVX-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX-NEXT: # ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm12[2,1],ymm4[2,0],ymm12[6,5],ymm4[6,4]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
-; AVX-NEXT: vextractf128 $1, %ymm4, %xmm0
-; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm4[2,0],xmm0[2,3]
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,0],xmm1[2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7]
-; AVX-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm3 # 32-byte Folded Reload
-; AVX-NEXT: # ymm3 = ymm5[0,1,2,3],mem[4,5],ymm5[6,7]
+; AVX-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload
+; AVX-NEXT: # ymm3 = ymm15[0,1,2,3],mem[4,5],ymm15[6,7]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3,0,1]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm0[0,0],ymm3[2,0],ymm0[4,4],ymm3[6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm5[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX-NEXT: # ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm8[2,1],ymm6[2,0],ymm8[6,5],ymm6[6,4]
+; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
+; AVX-NEXT: # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm0[2,1],ymm6[2,0],ymm0[6,5],ymm6[6,4]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,0,1]
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm0[2,0],xmm1[2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3,4,5,6,7]
-; AVX-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm2 # 32-byte Folded Reload
-; AVX-NEXT: # ymm2 = ymm9[0,1,2,3],mem[4,5],ymm9[6,7]
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm2[2,3,0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm6[0,0],ymm2[2,0],ymm6[4,4],ymm2[6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm7[5,6,7]
+; AVX-NEXT: vextractf128 $1, %ymm2, %xmm0
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm2[2,0],xmm0[2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3,4,5,6,7]
+; AVX-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload
+; AVX-NEXT: # ymm9 = ymm7[0,1,2,3],mem[4,5],ymm7[6,7]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm9[2,3,0,1]
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm0[0,0],ymm9[2,0],ymm0[4,4],ymm9[6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm6[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
-; AVX-NEXT: # ymm5 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm10[2,1],ymm11[2,0],ymm10[6,5],ymm11[6,4]
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm7[2,3,0,1]
-; AVX-NEXT: vextractf128 $1, %ymm5, %xmm0
+; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload
+; AVX-NEXT: # ymm11 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm14[2,1],ymm10[2,0],ymm14[6,5],ymm10[6,4]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm5[2,3,0,1]
+; AVX-NEXT: vextractf128 $1, %ymm11, %xmm0
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} xmm11 = xmm5[2,0],xmm0[2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm11[0,1,2],ymm10[3,4,5,6,7]
-; AVX-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload
-; AVX-NEXT: # ymm10 = ymm15[0,1,2,3],mem[4,5],ymm15[6,7]
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm10[2,3,0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm11[0,0],ymm10[2,0],ymm11[4,4],ymm10[6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm13[5,6,7]
+; AVX-NEXT: vshufps {{.*#+}} xmm10 = xmm11[2,0],xmm0[2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3,4,5,6,7]
+; AVX-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm10 # 32-byte Folded Reload
+; AVX-NEXT: # ymm10 = ymm13[0,1,2,3],mem[4,5],ymm13[6,7]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm10[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm13[0,0],ymm10[2,0],ymm13[4,4],ymm10[6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm15[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload
-; AVX-NEXT: # ymm12 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm9[2,1],ymm8[2,0],ymm9[6,5],ymm8[6,4]
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm13[2,3,0,1]
-; AVX-NEXT: vextractf128 $1, %ymm12, %xmm14
-; AVX-NEXT: vshufps {{.*#+}} xmm15 = xmm12[2,0],xmm14[2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7]
+; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
+; AVX-NEXT: # ymm8 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm5[2,1],ymm6[2,0],ymm5[6,5],ymm6[6,4]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm15[2,3,0,1]
+; AVX-NEXT: vextractf128 $1, %ymm8, %xmm1
+; AVX-NEXT: vshufps {{.*#+}} xmm14 = xmm8[2,0],xmm1[2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm15[3,4,5,6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
; AVX-NEXT: # ymm15 = ymm0[0,1,2,3],mem[4,5],ymm0[6,7]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3,0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm0[0,0],ymm15[2,0],ymm0[4,4],ymm15[6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3,4],ymm1[5,6,7]
-; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1],ymm3[3,1],ymm1[4,5],ymm3[7,5]
-; AVX-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm3 # 16-byte Folded Reload
-; AVX-NEXT: # xmm3 = xmm4[3,1],mem[3,3]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm13[3,1],ymm7[2,1],ymm13[7,5],ymm7[6,5]
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,0,1]
-; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7]
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6,7]
-; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,1],ymm2[3,1],ymm6[4,5],ymm2[7,5]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm0[0,0],ymm15[2,0],ymm0[4,4],ymm15[6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm14[0,1,2,3,4],ymm12[5,6,7]
+; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1],ymm3[3,1],ymm4[4,5],ymm3[7,5]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT: vshufps $247, (%rsp), %xmm3, %xmm7 # 16-byte Folded Reload
+; AVX-NEXT: # xmm7 = xmm3[3,1],mem[3,3]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
+; AVX-NEXT: vshufps $103, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm12 # 32-byte Folded Reload
+; AVX-NEXT: # ymm12 = ymm14[3,1],mem[2,1],ymm14[7,5],mem[6,5]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3,0,1]
+; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm12[3,4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3,4],ymm4[5,6,7]
+; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm3[0,1],ymm9[3,1],ymm3[4,5],ymm9[7,5]
; AVX-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
; AVX-NEXT: # xmm2 = xmm2[3,1],mem[3,3]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,1],ymm6[2,1],ymm4[7,5],ymm6[6,5]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX-NEXT: vshufps $103, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm3 # 32-byte Folded Reload
+; AVX-NEXT: # ymm3 = ymm7[3,1],mem[2,1],ymm7[7,5],mem[6,5]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,0,1]
; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
-; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7]
+; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm15[3,1],ymm0[4,5],ymm15[7,5]
-; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm12[3,1],xmm14[3,3]
-; AVX-NEXT: vmovaps %ymm9, %ymm3
-; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm9[3,1],ymm8[2,1],ymm9[7,5],ymm8[6,5]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm8[3,1],xmm1[3,3]
+; AVX-NEXT: vmovaps %ymm5, %ymm4
+; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm5[3,1],ymm6[2,1],ymm5[7,5],ymm6[6,5]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,0,1]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm11[0,1],ymm10[3,1],ymm11[4,5],ymm10[7,5]
-; AVX-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload
-; AVX-NEXT: # xmm1 = xmm5[3,1],mem[3,3]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm13[0,1],ymm10[3,1],ymm13[4,5],ymm10[7,5]
+; AVX-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload
+; AVX-NEXT: # xmm1 = xmm11[3,1],mem[3,3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm11[3,1],ymm15[2,1],ymm11[7,5],ymm15[6,5]
+; AVX-NEXT: vshufps $103, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload
+; AVX-NEXT: # ymm2 = ymm15[3,1],mem[2,1],ymm15[7,5],mem[6,5]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,0,1]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
; AVX-NEXT: # ymm2 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
+; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 416(%rdi), %xmm0
-; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX-NEXT: vmovaps 400(%rdi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
-; AVX-NEXT: vmovapd 464(%rdi), %xmm1
-; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm6[0],ymm1[2],ymm6[3]
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,0],ymm4[4,5],ymm1[6,4]
+; AVX-NEXT: vmovaps 448(%rdi), %xmm3
+; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovaps 464(%rdi), %xmm1
+; AVX-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[0,0]
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,0],ymm7[4,5],ymm1[6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
-; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm2[2,3,0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm9[0,0],ymm2[6,4],ymm9[4,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,2],ymm1[2,0],ymm9[4,6],ymm1[6,4]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm2[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm3[0,0],ymm2[6,4],ymm3[4,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,2],ymm1[2,0],ymm3[4,6],ymm1[6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
-; AVX-NEXT: # ymm4 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
+; AVX-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload
+; AVX-NEXT: # ymm11 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
; AVX-NEXT: vmovaps 32(%rdi), %xmm0
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; AVX-NEXT: vmovaps 16(%rdi), %xmm14
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm14[0],xmm0[1],xmm14[2,3]
-; AVX-NEXT: vmovapd 80(%rdi), %xmm5
-; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm5[1],ymm7[0],ymm5[2],ymm7[3]
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,1],ymm1[2,0],ymm13[4,5],ymm1[6,4]
+; AVX-NEXT: vmovaps 16(%rdi), %xmm13
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm13[0],xmm0[1],xmm13[2,3]
+; AVX-NEXT: vmovaps 64(%rdi), %xmm12
+; AVX-NEXT: vmovaps 80(%rdi), %xmm10
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm10[2,0],xmm12[0,0]
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,1],ymm1[2,0],ymm14[4,5],ymm1[6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3,0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm4[2,0],ymm1[0,0],ymm4[6,4],ymm1[4,4]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm11[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm11[2,0],ymm1[0,0],ymm11[6,4],ymm1[4,4]
; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,2],ymm2[2,0],ymm1[4,6],ymm2[6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
-; AVX-NEXT: # ymm6 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
+; AVX-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
+; AVX-NEXT: # ymm9 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
; AVX-NEXT: vmovaps 224(%rdi), %xmm0
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX-NEXT: vmovaps 208(%rdi), %xmm2
; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3]
-; AVX-NEXT: vmovapd 272(%rdi), %xmm2
-; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[1],ymm8[0],ymm2[2],ymm8[3]
-; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,0],ymm3[4,5],ymm2[6,4]
+; AVX-NEXT: vmovaps 256(%rdi), %xmm5
+; AVX-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovaps 272(%rdi), %xmm2
+; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0],xmm5[0,0]
+; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,0],ymm4[4,5],ymm2[6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7]
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm6[2,3,0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm6[2,0],ymm2[0,0],ymm6[6,4],ymm2[4,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm2[0,2],ymm3[2,0],ymm2[4,6],ymm3[6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm9[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm9[2,0],ymm2[0,0],ymm9[6,4],ymm2[4,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm2[0,2],ymm4[2,0],ymm2[4,6],ymm4[6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload
-; AVX-NEXT: # ymm12 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
-; AVX-NEXT: vmovaps 608(%rdi), %xmm13
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm13[2,2,3,3]
-; AVX-NEXT: vmovaps 592(%rdi), %xmm8
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm8[0],xmm0[1],xmm8[2,3]
-; AVX-NEXT: vmovapd 656(%rdi), %xmm10
-; AVX-NEXT: vmovaps %ymm15, %ymm3
-; AVX-NEXT: vshufpd {{.*#+}} ymm15 = ymm10[1],ymm15[0],ymm10[2],ymm15[3]
-; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm11[0,1],ymm15[2,0],ymm11[4,5],ymm15[6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1],ymm15[2,3,4,5,6,7]
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3,0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm12[2,0],ymm0[0,0],ymm12[6,4],ymm0[4,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm0[0,2],ymm11[2,0],ymm0[4,6],ymm11[6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3,4],ymm11[5,6,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm4[3,0],ymm1[1,0],ymm4[7,4],ymm1[5,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,3],ymm7[2,0],ymm1[4,7],ymm7[6,4]
-; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm7 # 16-byte Folded Reload
-; AVX-NEXT: # xmm7 = xmm14[0,1],mem[2,3]
-; AVX-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
-; AVX-NEXT: # ymm5 = ymm5[3,1],mem[1,3],ymm5[7,5],mem[5,7]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm4[1,1],ymm5[2,0],ymm4[5,5],ymm5[6,4]
-; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,3,2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3,4,5,6,7]
-; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm5[0,1,2,3,4],ymm1[5,6,7]
+; AVX-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
+; AVX-NEXT: # ymm4 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
+; AVX-NEXT: vmovaps 608(%rdi), %xmm8
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm8[2,2,3,3]
+; AVX-NEXT: vmovaps 592(%rdi), %xmm6
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm6[0],xmm0[1],xmm6[2,3]
+; AVX-NEXT: vmovaps 640(%rdi), %xmm7
+; AVX-NEXT: vmovaps 656(%rdi), %xmm5
+; AVX-NEXT: vshufps {{.*#+}} xmm14 = xmm5[2,0],xmm7[0,0]
+; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,0],ymm15[4,5],ymm14[6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1],ymm14[2,3,4,5,6,7]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm4[2,0],ymm0[0,0],ymm4[6,4],ymm0[4,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm0[0,2],ymm15[2,0],ymm0[4,6],ymm15[6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5,6,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm11[3,0],ymm1[1,0],ymm11[7,4],ymm1[5,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,3],ymm11[2,0],ymm1[4,7],ymm11[6,4]
+; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm11 # 16-byte Folded Reload
+; AVX-NEXT: # xmm11 = xmm13[0,1],mem[2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm10 = xmm10[3,0],xmm12[1,0]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm12[1,1],ymm10[2,0],ymm12[5,5],ymm10[6,4]
+; AVX-NEXT: vshufps {{.*#+}} xmm11 = xmm11[1,3,2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3,4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm10[0,1,2,3,4],ymm1[5,6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm9[1,0],ymm1[7,4],ymm9[5,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,3],ymm1[2,0],ymm9[4,7],ymm1[6,4]
-; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX-NEXT: vblendps $12, (%rsp), %xmm4, %xmm5 # 16-byte Folded Reload
-; AVX-NEXT: # xmm5 = xmm4[0,1],mem[2,3]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm9 # 32-byte Folded Reload
-; AVX-NEXT: # ymm9 = ymm4[3,1],mem[1,3],ymm4[7,5],mem[5,7]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm4[1,1],ymm9[2,0],ymm4[5,5],ymm9[6,4]
-; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,3,2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm9[2,3,4,5,6,7]
-; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm1[5,6,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,0],ymm2[1,0],ymm6[7,4],ymm2[5,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm3[1,0],ymm1[7,4],ymm3[5,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,3],ymm1[2,0],ymm3[4,7],ymm1[6,4]
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
+; AVX-NEXT: # xmm3 = xmm3[0,1],mem[2,3]
+; AVX-NEXT: vmovaps (%rsp), %xmm10 # 16-byte Reload
+; AVX-NEXT: vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload
+; AVX-NEXT: # xmm10 = xmm10[3,0],mem[1,0]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm12[1,1],ymm10[2,0],ymm12[5,5],ymm10[6,4]
+; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,3,2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm10[2,3,4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm1[5,6,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm9[3,0],ymm2[1,0],ymm9[7,4],ymm2[5,4]
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,3],ymm1[2,0],ymm2[4,7],ymm1[6,4]
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
; AVX-NEXT: # xmm2 = xmm2[0,1],mem[2,3]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm6 # 32-byte Folded Reload
-; AVX-NEXT: # ymm6 = ymm4[3,1],mem[1,3],ymm4[7,5],mem[5,7]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm4[1,1],ymm6[2,0],ymm4[5,5],ymm6[6,4]
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; AVX-NEXT: vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
+; AVX-NEXT: # xmm9 = xmm9[3,0],mem[1,0]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm10[1,1],ymm9[2,0],ymm10[5,5],ymm9[6,4]
; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm6[2,3,4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm9[2,3,4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm12[3,0],ymm0[1,0],ymm12[7,4],ymm0[5,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm4[3,0],ymm0[1,0],ymm4[7,4],ymm0[5,4]
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm2[2,0],ymm0[4,7],ymm2[6,4]
-; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm8[0,1],xmm13[2,3]
-; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm10[3,1],ymm3[1,3],ymm10[7,5],ymm3[5,7]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,1],ymm3[2,0],ymm4[5,5],ymm3[6,4]
+; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm6[0,1],xmm8[2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm5[3,0],xmm7[1,0]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm5[1,1],ymm4[2,0],ymm5[5,5],ymm4[6,4]
; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3,4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm2, 96(%rsi)
@@ -4414,7 +4464,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovaps %ymm2, 64(%r8)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm2, (%r8)
-; AVX-NEXT: vmovaps %ymm11, 96(%r9)
+; AVX-NEXT: vmovaps %ymm14, 96(%r9)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm2, 32(%r9)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
@@ -4424,9 +4474,9 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT: vmovaps %ymm0, 96(%rax)
; AVX-NEXT: vmovaps %ymm1, 32(%rax)
-; AVX-NEXT: vmovaps %ymm5, 64(%rax)
-; AVX-NEXT: vmovaps %ymm7, (%rax)
-; AVX-NEXT: addq $1048, %rsp # imm = 0x418
+; AVX-NEXT: vmovaps %ymm3, 64(%rax)
+; AVX-NEXT: vmovaps %ymm11, (%rax)
+; AVX-NEXT: addq $1032, %rsp # imm = 0x408
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
@@ -8032,52 +8082,54 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
;
; AVX-LABEL: load_i32_stride6_vf64:
; AVX: # %bb.0:
-; AVX-NEXT: subq $2616, %rsp # imm = 0xA38
+; AVX-NEXT: subq $2472, %rsp # imm = 0x9A8
; AVX-NEXT: vmovaps 608(%rdi), %ymm6
; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 576(%rdi), %ymm7
; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 352(%rdi), %ymm4
-; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 320(%rdi), %ymm5
+; AVX-NEXT: vmovapd 352(%rdi), %ymm3
+; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovapd 320(%rdi), %ymm9
; AVX-NEXT: vmovaps 288(%rdi), %ymm2
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 256(%rdi), %ymm3
-; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 224(%rdi), %ymm0
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 192(%rdi), %ymm1
+; AVX-NEXT: vmovaps 256(%rdi), %ymm5
+; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovaps 224(%rdi), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
+; AVX-NEXT: vmovaps 192(%rdi), %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm13
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm13[2,3]
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm13[0,3]
-; AVX-NEXT: vinsertf128 $1, 288(%rdi), %ymm3, %ymm1
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm10
+; AVX-NEXT: vmovaps 192(%rdi), %xmm0
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm10[0,3]
+; AVX-NEXT: vinsertf128 $1, 288(%rdi), %ymm5, %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm1[0,0],ymm2[6,4],ymm1[4,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm3[2,2],ymm1[6,4],ymm3[6,6]
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm5[2,2],ymm1[6,4],ymm5[6,6]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm4[0,1]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[2,3],ymm3[0,1]
; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[3],ymm5[2]
+; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[3],ymm9[2]
+; AVX-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm6[4,5],ymm7[6,7]
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm11
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm11[2,3]
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm11[0,3]
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm8
+; AVX-NEXT: vmovaps 576(%rdi), %xmm0
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm8[0,3]
; AVX-NEXT: vmovaps 672(%rdi), %ymm2
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 640(%rdi), %ymm3
-; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vinsertf128 $1, 672(%rdi), %ymm3, %ymm1
+; AVX-NEXT: vmovaps 640(%rdi), %ymm11
+; AVX-NEXT: vinsertf128 $1, 672(%rdi), %ymm11, %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm1[0,0],ymm2[6,4],ymm1[4,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm3[2,2],ymm1[6,4],ymm3[6,6]
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm11[2,2],ymm1[6,4],ymm11[6,6]
+; AVX-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX-NEXT: vmovapd 736(%rdi), %ymm1
; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -8089,23 +8141,24 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 992(%rdi), %ymm0
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 960(%rdi), %ymm1
+; AVX-NEXT: vmovaps 992(%rdi), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
+; AVX-NEXT: vmovaps 960(%rdi), %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm8
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,3]
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm8[0,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX-NEXT: vmovaps 960(%rdi), %xmm0
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,3]
; AVX-NEXT: vmovaps 1056(%rdi), %ymm2
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 1024(%rdi), %ymm12
-; AVX-NEXT: vinsertf128 $1, 1056(%rdi), %ymm12, %ymm1
+; AVX-NEXT: vmovaps 1024(%rdi), %ymm3
+; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vinsertf128 $1, 1056(%rdi), %ymm3, %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm1[0,0],ymm2[6,4],ymm1[4,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm12[2,2],ymm1[6,4],ymm12[6,6]
-; AVX-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm3[2,2],ymm1[6,4],ymm3[6,6]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX-NEXT: vmovapd 1120(%rdi), %ymm1
; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -8117,23 +8170,24 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 1376(%rdi), %ymm0
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 1344(%rdi), %ymm1
+; AVX-NEXT: vmovaps 1376(%rdi), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
+; AVX-NEXT: vmovaps 1344(%rdi), %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm6
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3]
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[0,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX-NEXT: vmovaps 1344(%rdi), %xmm0
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,3]
; AVX-NEXT: vmovaps 1440(%rdi), %ymm2
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 1408(%rdi), %ymm3
-; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vinsertf128 $1, 1440(%rdi), %ymm3, %ymm1
+; AVX-NEXT: vmovaps 1408(%rdi), %ymm12
+; AVX-NEXT: vinsertf128 $1, 1440(%rdi), %ymm12, %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm1[0,0],ymm2[6,4],ymm1[4,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm3[2,2],ymm1[6,4],ymm3[6,6]
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm12[2,2],ymm1[6,4],ymm12[6,6]
+; AVX-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX-NEXT: vmovapd 1504(%rdi), %ymm1
; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -8145,23 +8199,24 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 32(%rdi), %ymm0
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps (%rdi), %ymm1
+; AVX-NEXT: vmovaps 32(%rdi), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
+; AVX-NEXT: vmovaps (%rdi), %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3]
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX-NEXT: vmovaps (%rdi), %xmm0
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,3]
; AVX-NEXT: vmovaps 96(%rdi), %ymm2
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 64(%rdi), %ymm3
-; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vinsertf128 $1, 96(%rdi), %ymm3, %ymm1
+; AVX-NEXT: vmovaps 64(%rdi), %ymm6
+; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vinsertf128 $1, 96(%rdi), %ymm6, %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm1[0,0],ymm2[6,4],ymm1[4,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm3[2,2],ymm1[6,4],ymm3[6,6]
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm6[2,2],ymm1[6,4],ymm6[6,6]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX-NEXT: vmovapd 160(%rdi), %ymm1
; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -8173,193 +8228,198 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 416(%rdi), %ymm0
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 384(%rdi), %ymm1
+; AVX-NEXT: vmovaps 416(%rdi), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
+; AVX-NEXT: vmovaps 384(%rdi), %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX-NEXT: vmovaps 384(%rdi), %xmm0
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,3]
; AVX-NEXT: vmovaps 480(%rdi), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 448(%rdi), %ymm3
-; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vinsertf128 $1, 480(%rdi), %ymm3, %ymm5
-; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm5[0,0],ymm1[6,4],ymm5[4,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm3[2,2],ymm1[6,4],ymm3[6,6]
+; AVX-NEXT: vmovaps 448(%rdi), %ymm6
+; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vinsertf128 $1, 480(%rdi), %ymm6, %ymm7
+; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm7[0,0],ymm1[6,4],ymm7[4,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm6[2,2],ymm1[6,4],ymm6[6,6]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX-NEXT: vmovapd 544(%rdi), %ymm1
; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 512(%rdi), %ymm3
-; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3],ymm1[0,1]
-; AVX-NEXT: vmovupd %ymm1, (%rsp) # 32-byte Spill
-; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[3],ymm3[2]
+; AVX-NEXT: vmovapd 512(%rdi), %ymm6
+; AVX-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm6[2,3],ymm1[0,1]
+; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[3],ymm6[2]
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 800(%rdi), %ymm0
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 768(%rdi), %ymm1
+; AVX-NEXT: vmovaps 800(%rdi), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
-; AVX-NEXT: vextractf128 $1, %ymm10, %xmm1
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm10[0,1],xmm1[2,3]
+; AVX-NEXT: vmovaps 768(%rdi), %ymm0
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT: vmovaps 768(%rdi), %xmm0
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,3]
-; AVX-NEXT: vmovaps 864(%rdi), %ymm3
-; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 832(%rdi), %ymm5
-; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vinsertf128 $1, 864(%rdi), %ymm5, %ymm7
+; AVX-NEXT: vmovaps 864(%rdi), %ymm6
+; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovaps 832(%rdi), %ymm7
; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0],ymm7[0,0],ymm3[6,4],ymm7[4,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0],ymm5[2,2],ymm3[6,4],ymm5[6,6]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7]
-; AVX-NEXT: vmovapd 928(%rdi), %ymm3
-; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 896(%rdi), %ymm5
-; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm5[2,3],ymm3[0,1]
-; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[3],ymm5[2]
-; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
+; AVX-NEXT: vinsertf128 $1, 864(%rdi), %ymm7, %ymm13
+; AVX-NEXT: vmovups %ymm13, (%rsp) # 32-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0],ymm13[0,0],ymm6[6,4],ymm13[4,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0],ymm7[2,2],ymm6[6,4],ymm7[6,6]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3,4,5,6,7]
+; AVX-NEXT: vmovapd 928(%rdi), %ymm6
+; AVX-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovapd 896(%rdi), %ymm7
+; AVX-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm7[2,3],ymm6[0,1]
+; AVX-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[3],ymm7[2]
+; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,1,2,0,4,5,6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 1184(%rdi), %ymm0
+; AVX-NEXT: vmovaps 1184(%rdi), %ymm6
+; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovaps 1152(%rdi), %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 1152(%rdi), %ymm3
-; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7]
-; AVX-NEXT: vextractf128 $1, %ymm5, %xmm0
-; AVX-NEXT: vblendps {{.*#+}} xmm9 = xmm5[0,1],xmm0[2,3]
-; AVX-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,2],xmm0[0,3]
-; AVX-NEXT: vmovaps 1248(%rdi), %ymm3
-; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5],ymm0[6,7]
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX-NEXT: vmovaps 1152(%rdi), %xmm6
+; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm0[2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,2],xmm0[0,3]
+; AVX-NEXT: vmovaps 1248(%rdi), %ymm13
+; AVX-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 1216(%rdi), %ymm7
; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vinsertf128 $1, 1248(%rdi), %ymm7, %ymm14
; AVX-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm3[2,0],ymm14[0,0],ymm3[6,4],ymm14[4,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm13[2,0],ymm14[0,0],ymm13[6,4],ymm14[4,4]
; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0],ymm7[2,2],ymm15[6,4],ymm7[6,6]
-; AVX-NEXT: vblendps {{.*#+}} ymm15 = ymm9[0,1,2],ymm15[3,4,5,6,7]
-; AVX-NEXT: vmovapd 1312(%rdi), %ymm3
-; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 1280(%rdi), %ymm7
-; AVX-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm7[2,3],ymm3[0,1]
-; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufpd {{.*#+}} ymm14 = ymm3[0],ymm7[1],ymm3[3],ymm7[2]
+; AVX-NEXT: vblendps {{.*#+}} ymm15 = ymm6[0,1,2],ymm15[3,4,5,6,7]
+; AVX-NEXT: vmovapd 1312(%rdi), %ymm6
+; AVX-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovapd 1280(%rdi), %ymm13
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm13[2,3],ymm6[0,1]
+; AVX-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufpd {{.*#+}} ymm14 = ymm6[0],ymm13[1],ymm6[3],ymm13[2]
+; AVX-NEXT: vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1,2,3,4,5],ymm14[6,7]
-; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} xmm14 = xmm3[1,0],xmm13[3,0]
-; AVX-NEXT: vshufps {{.*#+}} xmm13 = xmm14[0,2],xmm13[1,3]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX-NEXT: vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm14 # 32-byte Folded Reload
-; AVX-NEXT: # ymm14 = ymm3[3,0],mem[1,0],ymm3[7,4],mem[5,4]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0],ymm3[2,3],ymm14[6,4],ymm3[6,7]
-; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7]
+; AVX-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm14 # 16-byte Folded Reload
+; AVX-NEXT: # xmm14 = xmm10[3,0],mem[1,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm10 = xmm14[2,0],xmm10[1,3]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX-NEXT: vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm14 # 32-byte Folded Reload
+; AVX-NEXT: # ymm14 = ymm15[3,0],mem[1,0],ymm15[7,4],mem[5,4]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0],ymm6[2,3],ymm14[6,4],ymm6[6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm14[3,4,5,6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm14 # 32-byte Folded Reload
-; AVX-NEXT: # ymm14 = ymm7[3,1],mem[1,3],ymm7[7,5],mem[5,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm7[3,1],ymm9[1,3],ymm7[7,5],ymm9[5,7]
; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm13[0,1,2,3,4,5],ymm14[6,7]
-; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} xmm13 = xmm7[1,0],xmm11[3,0]
-; AVX-NEXT: vshufps {{.*#+}} xmm11 = xmm13[0,2],xmm11[1,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm14[6,7]
+; AVX-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm10 # 16-byte Folded Reload
+; AVX-NEXT: # xmm10 = xmm8[3,0],mem[1,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm10[2,0],xmm8[1,3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX-NEXT: vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm13 # 32-byte Folded Reload
-; AVX-NEXT: # ymm13 = ymm14[3,0],mem[1,0],ymm14[7,4],mem[5,4]
+; AVX-NEXT: vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm10 # 32-byte Folded Reload
+; AVX-NEXT: # ymm10 = ymm14[3,0],mem[1,0],ymm14[7,4],mem[5,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0],ymm11[2,3],ymm10[6,4],ymm11[6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm10[3,4,5,6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0],ymm7[2,3],ymm13[6,4],ymm7[6,7]
-; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm13[3,4,5,6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm13[3,1],ymm9[1,3],ymm13[7,5],ymm9[5,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm13[6,7]
-; AVX-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} xmm11 = xmm11[1,0],xmm8[3,0]
-; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm11[0,2],xmm8[1,3]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX-NEXT: vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm11 # 32-byte Folded Reload
-; AVX-NEXT: # ymm11 = ymm13[3,0],mem[1,0],ymm13[7,4],mem[5,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,0],ymm12[2,3],ymm11[6,4],ymm12[6,7]
-; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm11[3,4,5,6,7]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm11[3,1],ymm12[1,3],ymm11[7,5],ymm12[5,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm11[6,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm9[3,1],ymm7[1,3],ymm9[7,5],ymm7[5,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm10[6,7]
; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm8[1,0],xmm6[3,0]
-; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm8[0,2],xmm6[1,3]
+; AVX-NEXT: vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm8 # 16-byte Folded Reload
+; AVX-NEXT: # xmm8 = xmm5[3,0],mem[1,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm8[2,0],xmm5[1,3]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX-NEXT: vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm8 # 32-byte Folded Reload
+; AVX-NEXT: # ymm8 = ymm10[3,0],mem[1,0],ymm10[7,4],mem[5,4]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0],ymm9[2,3],ymm8[6,4],ymm9[6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm8[3,4,5,6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX-NEXT: vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm8 # 32-byte Folded Reload
-; AVX-NEXT: # ymm8 = ymm11[3,0],mem[1,0],ymm11[7,4],mem[5,4]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0],ymm15[2,3],ymm8[6,4],ymm15[6,7]
-; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3,4,5,6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
-; AVX-NEXT: # ymm8 = ymm8[3,1],mem[1,3],ymm8[7,5],mem[5,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm8[3,1],ymm11[1,3],ymm8[7,5],ymm11[5,7]
; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm8[6,7]
-; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm6[1,0],xmm4[3,0]
-; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm6[0,2],xmm4[1,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6,7]
+; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm5 # 16-byte Folded Reload
+; AVX-NEXT: # xmm5 = xmm4[3,0],mem[1,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm5[2,0],xmm4[1,3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX-NEXT: vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm6 # 32-byte Folded Reload
-; AVX-NEXT: # ymm6 = ymm8[3,0],mem[1,0],ymm8[7,4],mem[5,4]
-; AVX-NEXT: vshufps $226, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
-; AVX-NEXT: # ymm6 = ymm6[2,0],mem[2,3],ymm6[6,4],mem[6,7]
-; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4,5,6,7]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
-; AVX-NEXT: # ymm6 = ymm6[3,1],mem[1,3],ymm6[7,5],mem[5,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,1,2,0,4,5,6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7]
+; AVX-NEXT: vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload
+; AVX-NEXT: # ymm5 = ymm8[3,0],mem[1,0],ymm8[7,4],mem[5,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0],ymm12[2,3],ymm5[6,4],ymm12[6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,1],ymm12[1,3],ymm5[7,5],ymm12[5,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,0],xmm2[3,0]
-; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm4[0,2],xmm2[1,3]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT: vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
-; AVX-NEXT: # ymm4 = ymm4[3,0],mem[1,0],ymm4[7,4],mem[5,4]
+; AVX-NEXT: vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm4 # 16-byte Folded Reload
+; AVX-NEXT: # xmm4 = xmm3[3,0],mem[1,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm4[2,0],xmm3[1,3]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX-NEXT: vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm4 # 32-byte Folded Reload
+; AVX-NEXT: # ymm4 = ymm5[3,0],mem[1,0],ymm5[7,4],mem[5,4]
; AVX-NEXT: vshufps $226, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
; AVX-NEXT: # ymm4 = ymm4[2,0],mem[2,3],ymm4[6,4],mem[6,7]
-; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3,4,5,6,7]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,1],ymm6[1,3],ymm4[7,5],ymm6[5,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
+; AVX-NEXT: # ymm4 = ymm4[3,1],mem[1,3],ymm4[7,5],mem[5,7]
; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
+; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload
+; AVX-NEXT: # xmm3 = xmm2[3,0],mem[1,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm3[2,0],xmm2[1,3]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT: vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
+; AVX-NEXT: # ymm3 = ymm3[3,0],mem[1,0],ymm3[7,4],mem[5,4]
+; AVX-NEXT: vshufps $226, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
+; AVX-NEXT: # ymm3 = ymm3[2,0],mem[2,3],ymm3[6,4],mem[6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,1],ymm4[1,3],ymm3[7,5],ymm4[5,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm10[1,0],xmm1[3,0]
-; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,2],xmm1[1,3]
+; AVX-NEXT: vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload
+; AVX-NEXT: # xmm2 = xmm1[3,0],mem[1,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,0],xmm1[1,3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX-NEXT: vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX-NEXT: vshufps $19, (%rsp), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX-NEXT: # ymm2 = ymm2[3,0],mem[1,0],ymm2[7,4],mem[5,4]
; AVX-NEXT: vshufps $226, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX-NEXT: # ymm2 = ymm2[2,0],mem[2,3],ymm2[6,4],mem[6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,1],ymm4[1,3],ymm2[7,5],ymm4[5,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,1],ymm3[1,3],ymm2[7,5],ymm3[5,7]
; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm5[1,0],xmm0[3,0]
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[1,3]
+; AVX-NEXT: vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
+; AVX-NEXT: # xmm1 = xmm0[3,0],mem[1,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,0],xmm0[1,3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX-NEXT: # ymm1 = ymm1[3,0],mem[1,0],ymm1[7,4],mem[5,4]
@@ -8367,8 +8427,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: # ymm1 = ymm1[2,0],mem[2,3],ymm1[6,4],mem[6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX-NEXT: # ymm1 = ymm1[3,1],mem[1,3],ymm1[7,5],mem[5,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,1],ymm13[1,3],ymm1[7,5],ymm13[5,7]
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -8376,34 +8435,33 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
; AVX-NEXT: # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm3[2,0],ymm0[6,5],ymm3[6,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm15[2,1],ymm6[2,0],ymm15[6,5],ymm6[6,4]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,0],xmm1[2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
-; AVX-NEXT: # ymm2 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7]
-; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1]
+; AVX-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload
+; AVX-NEXT: # ymm15 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm15[2,3,0,1]
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm2[2,0],ymm1[4,4],ymm2[6,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm15[2,0],ymm1[4,4],ymm15[6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
; AVX-NEXT: # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm14[2,1],ymm7[2,0],ymm14[6,5],ymm7[6,4]
+; AVX-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload
+; AVX-NEXT: # ymm0 = ymm14[2,1],mem[2,0],ymm14[6,5],mem[6,4]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,0],xmm1[2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
-; AVX-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm2 # 32-byte Folded Reload
-; AVX-NEXT: # ymm2 = ymm9[0,1,2,3],mem[4,5],ymm9[6,7]
+; AVX-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload
+; AVX-NEXT: # ymm2 = ymm7[0,1,2,3],mem[4,5],ymm7[6,7]
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1]
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -8414,18 +8472,17 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
; AVX-NEXT: # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload
-; AVX-NEXT: # ymm0 = ymm13[2,1],mem[2,0],ymm13[6,5],mem[6,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm10[2,1],ymm9[2,0],ymm10[6,5],ymm9[6,4]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,0],xmm1[2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
-; AVX-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload
-; AVX-NEXT: # ymm2 = ymm12[0,1,2,3],mem[4,5],ymm12[6,7]
+; AVX-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm2 # 32-byte Folded Reload
+; AVX-NEXT: # ymm2 = ymm11[0,1,2,3],mem[4,5],ymm11[6,7]
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1]
-; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm2[2,0],ymm1[4,4],ymm2[6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -8433,26 +8490,26 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
; AVX-NEXT: # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm11[2,1],ymm15[2,0],ymm11[6,5],ymm15[6,4]
+; AVX-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload
+; AVX-NEXT: # ymm0 = ymm8[2,1],mem[2,0],ymm8[6,5],mem[6,4]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,0],xmm1[2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload
-; AVX-NEXT: # ymm10 = ymm1[0,1,2,3],mem[4,5],ymm1[6,7]
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm10[2,3,0,1]
+; AVX-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm11 # 32-byte Folded Reload
+; AVX-NEXT: # ymm11 = ymm12[0,1,2,3],mem[4,5],ymm12[6,7]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm11[2,3,0,1]
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm10[2,0],ymm1[4,4],ymm10[6,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm11[2,0],ymm1[4,4],ymm11[6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
; AVX-NEXT: # ymm2 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
-; AVX-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill
-; AVX-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload
-; AVX-NEXT: # ymm0 = ymm8[2,1],mem[2,0],ymm8[6,5],mem[6,4]
+; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload
+; AVX-NEXT: # ymm0 = ymm5[2,1],mem[2,0],ymm5[6,5],mem[6,4]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -8479,91 +8536,90 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,0],xmm1[2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
-; AVX-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm9 # 32-byte Folded Reload
-; AVX-NEXT: # ymm9 = ymm6[0,1,2,3],mem[4,5],ymm6[6,7]
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[2,3,0,1]
+; AVX-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm8 # 32-byte Folded Reload
+; AVX-NEXT: # ymm8 = ymm4[0,1,2,3],mem[4,5],ymm4[6,7]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm8[2,3,0,1]
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm9[2,0],ymm1[4,4],ymm9[6,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm8[2,0],ymm1[4,4],ymm8[6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
-; AVX-NEXT: # ymm8 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
+; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
+; AVX-NEXT: # ymm7 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vshufps $38, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX-NEXT: # ymm0 = ymm0[2,1],mem[2,0],ymm0[6,5],mem[6,4]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX-NEXT: vextractf128 $1, %ymm8, %xmm1
+; AVX-NEXT: vextractf128 $1, %ymm7, %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm8[2,0],xmm1[2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,0],xmm1[2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
-; AVX-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm6 # 32-byte Folded Reload
-; AVX-NEXT: # ymm6 = ymm4[0,1,2,3],mem[4,5],ymm4[6,7]
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm6[2,3,0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm13[0,0],ymm6[2,0],ymm13[4,4],ymm6[6,4]
+; AVX-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload
+; AVX-NEXT: # ymm4 = ymm3[0,1,2,3],mem[4,5],ymm3[6,7]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm4[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm9[0,0],ymm4[2,0],ymm9[4,4],ymm4[6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
-; AVX-NEXT: # ymm5 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
+; AVX-NEXT: # ymm1 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm4[2,1],ymm3[2,0],ymm4[6,5],ymm3[6,4]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm3[2,1],ymm6[2,0],ymm3[6,5],ymm6[6,4]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3,0,1]
-; AVX-NEXT: vextractf128 $1, %ymm5, %xmm7
-; AVX-NEXT: vshufps {{.*#+}} xmm15 = xmm5[2,0],xmm7[2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3,4,5,6,7]
+; AVX-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX-NEXT: vshufps {{.*#+}} xmm13 = xmm1[2,0],xmm5[2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3,4,5,6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload
-; AVX-NEXT: # ymm15 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7]
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm15[2,3,0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm2[0,0],ymm15[2,0],ymm2[4,4],ymm15[6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5,6,7]
+; AVX-NEXT: vblendps $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm13 # 32-byte Folded Reload
+; AVX-NEXT: # ymm13 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm13[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm2[0,0],ymm13[2,0],ymm2[4,4],ymm13[6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5,6,7]
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm15[3,1],ymm0[4,5],ymm15[7,5]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload
+; AVX-NEXT: # xmm10 = xmm10[3,1],mem[3,3]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX-NEXT: vshufps $103, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm14 # 32-byte Folded Reload
+; AVX-NEXT: # ymm14 = ymm15[3,1],mem[2,1],ymm15[7,5],mem[6,5]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3,0,1]
+; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm14[3,4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm0[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vshufps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX-NEXT: # ymm0 = ymm0[0,1],mem[3,1],ymm0[4,5],mem[7,5]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm11 # 16-byte Folded Reload
-; AVX-NEXT: # xmm11 = xmm1[3,1],mem[3,3]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload
+; AVX-NEXT: # xmm10 = xmm10[3,1],mem[3,3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm1[3,1],ymm14[2,1],ymm1[7,5],ymm14[6,5]
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3,0,1]
-; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3,4,5,6,7]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm0[5,6,7]
+; AVX-NEXT: vshufps $103, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
+; AVX-NEXT: # ymm14 = ymm14[3,1],mem[2,1],ymm14[7,5],mem[6,5]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3,0,1]
+; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm14[3,4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm0[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX-NEXT: vshufps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX-NEXT: # ymm0 = ymm0[0,1],mem[3,1],ymm0[4,5],mem[7,5]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm11 # 16-byte Folded Reload
-; AVX-NEXT: # xmm11 = xmm1[3,1],mem[3,3]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX-NEXT: vshufps $103, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
-; AVX-NEXT: # ymm12 = ymm12[3,1],mem[2,1],ymm12[7,5],mem[6,5]
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3,0,1]
-; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3,4,5,6,7]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm0[5,6,7]
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vshufps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX-NEXT: # ymm1 = ymm0[0,1],mem[3,1],ymm0[4,5],mem[7,5]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload
-; AVX-NEXT: # xmm11 = xmm0[3,1],mem[3,3]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX-NEXT: vshufps $103, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
-; AVX-NEXT: # ymm12 = ymm12[3,1],mem[2,1],ymm12[7,5],mem[6,5]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload
+; AVX-NEXT: # xmm10 = xmm10[3,1],mem[3,3]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
+; AVX-NEXT: vshufps $103, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm12 # 32-byte Folded Reload
+; AVX-NEXT: # ymm12 = ymm14[3,1],mem[2,1],ymm14[7,5],mem[6,5]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm12[2,3,0,1]
-; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3,4,5,6,7]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm1[5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm12[3,4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm0[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm10[3,1],ymm0[4,5],ymm10[7,5]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm10 # 16-byte Folded Reload
-; AVX-NEXT: # xmm10 = xmm1[3,1],mem[3,3]
+; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm11[3,1],ymm0[4,5],ymm11[7,5]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload
+; AVX-NEXT: # xmm10 = xmm10[3,1],mem[3,3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
; AVX-NEXT: vshufps $103, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
; AVX-NEXT: # ymm11 = ymm11[3,1],mem[2,1],ymm11[7,5],mem[6,5]
@@ -8571,32 +8627,31 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3,4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm0[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,1],ymm15[3,1],ymm2[4,5],ymm15[7,5]
-; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm5[3,1],xmm7[3,3]
-; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm4[3,1],ymm3[2,1],ymm4[7,5],ymm3[6,5]
-; AVX-NEXT: vmovaps %ymm4, %ymm15
+; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,1],ymm13[3,1],ymm2[4,5],ymm13[7,5]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,1],xmm5[3,3]
+; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,1],ymm6[2,1],ymm3[7,5],ymm6[6,5]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,0,1]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm13[0,1],ymm6[3,1],ymm13[4,5],ymm6[7,5]
-; AVX-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm1 # 16-byte Folded Reload
-; AVX-NEXT: # xmm1 = xmm8[3,1],mem[3,3]
+; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm9[0,1],ymm4[3,1],ymm9[4,5],ymm4[7,5]
+; AVX-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload
+; AVX-NEXT: # xmm1 = xmm7[3,1],mem[3,3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,1],ymm6[2,1],ymm3[7,5],ymm6[6,5]
+; AVX-NEXT: vshufps $103, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload
+; AVX-NEXT: # ymm2 = ymm3[3,1],mem[2,1],ymm3[7,5],mem[6,5]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,0,1]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm9[3,1],ymm0[4,5],ymm9[7,5]
+; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm8[3,1],ymm0[4,5],ymm8[7,5]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX-NEXT: # xmm1 = xmm1[3,1],mem[3,3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm4[3,1],ymm8[2,1],ymm4[7,5],ymm8[6,5]
+; AVX-NEXT: vshufps $103, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm2 # 32-byte Folded Reload
+; AVX-NEXT: # ymm2 = ymm4[3,1],mem[2,1],ymm4[7,5],mem[6,5]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,0,1]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
@@ -8604,34 +8659,35 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vshufps $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX-NEXT: # ymm0 = ymm0[0,1],mem[3,1],ymm0[4,5],mem[7,5]
-; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vshufps $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX-NEXT: # xmm1 = xmm1[3,1],mem[3,3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm5[3,1],ymm7[2,1],ymm5[7,5],ymm7[6,5]
+; AVX-NEXT: vshufps $103, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm2 # 32-byte Folded Reload
+; AVX-NEXT: # ymm2 = ymm5[3,1],mem[2,1],ymm5[7,5],mem[6,5]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,0,1]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
-; AVX-NEXT: # ymm2 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
+; AVX-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
+; AVX-NEXT: # ymm9 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
; AVX-NEXT: vmovaps 32(%rdi), %xmm0
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX-NEXT: vmovaps 16(%rdi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
-; AVX-NEXT: vmovapd 80(%rdi), %xmm1
-; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm7[0],ymm1[2],ymm7[3]
+; AVX-NEXT: vmovaps 64(%rdi), %xmm2
+; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovaps 80(%rdi), %xmm1
+; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[0,0]
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,0],ymm5[4,5],ymm1[6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
-; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm2[2,3,0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm11[0,0],ymm2[6,4],ymm11[4,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm11[0,2],ymm1[2,0],ymm11[4,6],ymm1[6,4]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm9[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm9[2,0],ymm10[0,0],ymm9[6,4],ymm10[4,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,2],ymm1[2,0],ymm10[4,6],ymm1[6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -8644,11 +8700,12 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovaps 208(%rdi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
-; AVX-NEXT: vmovapd 272(%rdi), %xmm1
-; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm14[0],ymm1[2],ymm14[3]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,0],ymm5[4,5],ymm1[6,4]
+; AVX-NEXT: vmovaps 256(%rdi), %xmm5
+; AVX-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovaps 272(%rdi), %xmm1
+; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm5[0,0]
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm15[0,1],ymm1[2,0],ymm15[4,5],ymm1[6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm2[2,3,0,1]
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm7[0,0],ymm2[6,4],ymm7[4,4]
@@ -8665,9 +8722,11 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovaps 400(%rdi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
-; AVX-NEXT: vmovapd 464(%rdi), %xmm1
-; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm8[0],ymm1[2],ymm8[3]
+; AVX-NEXT: vmovaps 448(%rdi), %xmm5
+; AVX-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovaps 464(%rdi), %xmm1
+; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm5[0,0]
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,0],ymm4[4,5],ymm1[6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm2[2,3,0,1]
@@ -8684,10 +8743,11 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovaps 592(%rdi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
-; AVX-NEXT: vmovapd 656(%rdi), %xmm1
-; AVX-NEXT: vmovupd %ymm1, (%rsp) # 32-byte Spill
-; AVX-NEXT: vshufpd $9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX-NEXT: # ymm1 = ymm1[1],mem[0],ymm1[2],mem[3]
+; AVX-NEXT: vmovaps 640(%rdi), %xmm1
+; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovaps 656(%rdi), %xmm2
+; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,0],xmm1[0,0]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,0],ymm2[4,5],ymm1[6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
@@ -8697,42 +8757,45 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
-; AVX-NEXT: # ymm10 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
+; AVX-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload
+; AVX-NEXT: # ymm11 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
; AVX-NEXT: vmovaps 800(%rdi), %xmm0
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX-NEXT: vmovaps 784(%rdi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
-; AVX-NEXT: vmovapd 848(%rdi), %xmm1
-; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm6[0],ymm1[2],ymm6[3]
+; AVX-NEXT: vmovaps 832(%rdi), %xmm2
+; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovaps 848(%rdi), %xmm1
+; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[0,0]
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,0],ymm3[4,5],ymm1[6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm10[2,3,0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm10[2,0],ymm3[0,0],ymm10[6,4],ymm3[4,4]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm11[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm11[2,0],ymm3[0,0],ymm11[6,4],ymm3[4,4]
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,2],ymm1[2,0],ymm3[4,6],ymm1[6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
-; AVX-NEXT: # ymm9 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
+; AVX-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
+; AVX-NEXT: # ymm6 = mem[0,1],ymm0[2,3],mem[4,5,6,7]
; AVX-NEXT: vmovaps 992(%rdi), %xmm0
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX-NEXT: vmovaps 976(%rdi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
-; AVX-NEXT: vmovapd 1040(%rdi), %xmm1
-; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufpd $9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX-NEXT: # ymm1 = ymm1[1],mem[0],ymm1[2],mem[3]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,0],ymm2[4,5],ymm1[6,4]
+; AVX-NEXT: vmovaps 1024(%rdi), %xmm2
+; AVX-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill
+; AVX-NEXT: vmovaps 1040(%rdi), %xmm1
+; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[0,0]
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,1],ymm1[2,0],ymm14[4,5],ymm1[6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm9[2,3,0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm9[2,0],ymm2[0,0],ymm9[6,4],ymm2[4,4]
+; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm6[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm6[2,0],ymm2[0,0],ymm6[6,4],ymm2[4,4]
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2],ymm1[2,0],ymm2[4,6],ymm1[6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -8745,15 +8808,17 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovaps 1168(%rdi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
-; AVX-NEXT: vmovapd 1232(%rdi), %xmm1
-; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufpd $9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX-NEXT: # ymm1 = ymm1[1],mem[0],ymm1[2],mem[3]
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm15[0,1],ymm1[2,0],ymm15[4,5],ymm1[6,4]
+; AVX-NEXT: vmovaps 1216(%rdi), %xmm1
+; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovaps 1232(%rdi), %xmm6
+; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm6[2,0],xmm1[0,0]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,0],ymm6[4,5],ymm1[6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm8[2,3,0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm8[2,0],ymm1[0,0],ymm8[6,4],ymm1[4,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm1[0,2],ymm6[2,0],ymm1[4,6],ymm6[6,4]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm8[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm8[2,0],ymm13[0,0],ymm8[6,4],ymm13[4,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm13[0,2],ymm6[2,0],ymm13[4,6],ymm6[6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -8762,125 +8827,126 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovaps 1376(%rdi), %xmm0
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
-; AVX-NEXT: vmovaps 1360(%rdi), %xmm13
-; AVX-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm13[0],xmm0[1],xmm13[2,3]
-; AVX-NEXT: vmovapd 1424(%rdi), %xmm13
-; AVX-NEXT: vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufpd $9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm15 # 32-byte Folded Reload
-; AVX-NEXT: # ymm15 = ymm13[1],mem[0],ymm13[2],mem[3]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm13[0,1],ymm15[2,0],ymm13[4,5],ymm15[6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1],ymm15[2,3,4,5,6,7]
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm6[2,3,0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm6[2,0],ymm0[0,0],ymm6[6,4],ymm0[4,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm0[0,2],ymm13[2,0],ymm0[4,6],ymm13[6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5,6,7]
-; AVX-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm13[3,0],ymm11[1,0],ymm13[7,4],ymm11[5,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,3],ymm13[2,0],ymm11[4,7],ymm13[6,4]
-; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
-; AVX-NEXT: # xmm13 = xmm13[0,1],mem[2,3]
+; AVX-NEXT: vmovaps 1360(%rdi), %xmm1
+; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX-NEXT: vmovaps 1408(%rdi), %xmm14
+; AVX-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovaps 1424(%rdi), %xmm0
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} xmm14 = xmm0[2,0],xmm14[0,0]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
-; AVX-NEXT: # ymm15 = ymm15[3,1],mem[1,3],ymm15[7,5],mem[5,7]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm14[1,1],ymm15[2,0],ymm14[5,5],ymm15[6,4]
-; AVX-NEXT: vshufps {{.*#+}} xmm13 = xmm13[1,3,2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3,4,5,6,7]
-; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm13[0,1,2,3,4],ymm11[5,6,7]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm13[3,0],ymm7[1,0],ymm13[7,4],ymm7[5,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,3],ymm13[2,0],ymm7[4,7],ymm13[6,4]
-; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
-; AVX-NEXT: # xmm13 = xmm13[0,1],mem[2,3]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm15 # 32-byte Folded Reload
-; AVX-NEXT: # ymm15 = ymm14[3,1],mem[1,3],ymm14[7,5],mem[5,7]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm14[1,1],ymm15[2,0],ymm14[5,5],ymm15[6,4]
-; AVX-NEXT: vshufps {{.*#+}} xmm13 = xmm13[1,3,2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3,4,5,6,7]
-; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm13[0,1,2,3,4],ymm7[5,6,7]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm13[3,0],ymm5[1,0],ymm13[7,4],ymm5[5,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,3],ymm13[2,0],ymm5[4,7],ymm13[6,4]
-; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
-; AVX-NEXT: # xmm13 = xmm13[0,1],mem[2,3]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm15 # 32-byte Folded Reload
-; AVX-NEXT: # ymm15 = ymm14[3,1],mem[1,3],ymm14[7,5],mem[5,7]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm14[1,1],ymm15[2,0],ymm14[5,5],ymm15[6,4]
-; AVX-NEXT: vshufps {{.*#+}} xmm13 = xmm13[1,3,2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3,4,5,6,7]
-; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm13[0,1,2,3,4],ymm5[5,6,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,0],ymm15[4,5],ymm14[6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1],ymm14[2,3,4,5,6,7]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm6[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm6[2,0],ymm0[0,0],ymm6[6,4],ymm0[4,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm0[0,2],ymm15[2,0],ymm0[4,6],ymm15[6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5,6,7]
+; AVX-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm9[3,0],ymm10[1,0],ymm9[7,4],ymm10[5,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm10[0,3],ymm14[2,0],ymm10[4,7],ymm14[6,4]
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm14 # 16-byte Folded Reload
+; AVX-NEXT: # xmm14 = xmm10[0,1],mem[2,3]
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX-NEXT: vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm15 # 16-byte Folded Reload
+; AVX-NEXT: # xmm15 = xmm10[3,0],mem[1,0]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm10[1,1],ymm15[2,0],ymm10[5,5],ymm15[6,4]
+; AVX-NEXT: vshufps {{.*#+}} xmm14 = xmm14[1,3,2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3,4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm14[0,1,2,3,4],ymm9[5,6,7]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm10[3,0],ymm7[1,0],ymm10[7,4],ymm7[5,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,3],ymm14[2,0],ymm7[4,7],ymm14[6,4]
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm14 # 16-byte Folded Reload
+; AVX-NEXT: # xmm14 = xmm10[0,1],mem[2,3]
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX-NEXT: vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm15 # 16-byte Folded Reload
+; AVX-NEXT: # xmm15 = xmm10[3,0],mem[1,0]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm10[1,1],ymm15[2,0],ymm10[5,5],ymm15[6,4]
+; AVX-NEXT: vshufps {{.*#+}} xmm14 = xmm14[1,3,2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3,4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm14[0,1,2,3,4],ymm7[5,6,7]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm10[3,0],ymm5[1,0],ymm10[7,4],ymm5[5,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,3],ymm14[2,0],ymm5[4,7],ymm14[6,4]
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm14 # 16-byte Folded Reload
+; AVX-NEXT: # xmm14 = xmm10[0,1],mem[2,3]
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX-NEXT: vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm15 # 16-byte Folded Reload
+; AVX-NEXT: # xmm15 = xmm10[3,0],mem[1,0]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm10[1,1],ymm15[2,0],ymm10[5,5],ymm15[6,4]
+; AVX-NEXT: vshufps {{.*#+}} xmm14 = xmm14[1,3,2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3,4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2,3,4],ymm5[5,6,7]
; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm12[3,0],ymm4[1,0],ymm12[7,4],ymm4[5,4]
; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,3],ymm12[2,0],ymm4[4,7],ymm12[6,4]
-; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
-; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload
-; AVX-NEXT: # xmm12 = xmm12[0,1],mem[2,3]
-; AVX-NEXT: vmovups (%rsp), %ymm13 # 32-byte Reload
-; AVX-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
-; AVX-NEXT: # ymm13 = ymm13[3,1],mem[1,3],ymm13[7,5],mem[5,7]
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm12 # 16-byte Folded Reload
+; AVX-NEXT: # xmm12 = xmm10[0,1],mem[2,3]
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX-NEXT: vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm14 # 16-byte Folded Reload
+; AVX-NEXT: # xmm14 = xmm10[3,0],mem[1,0]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm15[1,1],ymm13[2,0],ymm15[5,5],ymm13[6,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm15[1,1],ymm14[2,0],ymm15[5,5],ymm14[6,4]
; AVX-NEXT: vshufps {{.*#+}} xmm12 = xmm12[1,3,2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3,4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm14[2,3,4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1,2,3,4],ymm4[5,6,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm10[3,0],ymm3[1,0],ymm10[7,4],ymm3[5,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,3],ymm10[2,0],ymm3[4,7],ymm10[6,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm11[3,0],ymm3[1,0],ymm11[7,4],ymm3[5,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,3],ymm11[2,0],ymm3[4,7],ymm11[6,4]
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm11 # 16-byte Folded Reload
+; AVX-NEXT: # xmm11 = xmm10[0,1],mem[2,3]
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX-NEXT: vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm12 # 16-byte Folded Reload
+; AVX-NEXT: # xmm12 = xmm10[3,0],mem[1,0]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm10[1,1],ymm12[2,0],ymm10[5,5],ymm12[6,4]
+; AVX-NEXT: vshufps {{.*#+}} xmm11 = xmm11[1,3,2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3,4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1,2,3,4],ymm3[5,6,7]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm1[3,0],ymm2[1,0],ymm1[7,4],ymm2[5,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,3],ymm10[2,0],ymm2[4,7],ymm10[6,4]
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload
; AVX-NEXT: # xmm10 = xmm10[0,1],mem[2,3]
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; AVX-NEXT: vshufps $19, (%rsp), %xmm11, %xmm11 # 16-byte Folded Reload
+; AVX-NEXT: # xmm11 = xmm11[3,0],mem[1,0]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
-; AVX-NEXT: # ymm12 = ymm12[3,1],mem[1,3],ymm12[7,5],mem[5,7]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm13[1,1],ymm12[2,0],ymm13[5,5],ymm12[6,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm12[1,1],ymm11[2,0],ymm12[5,5],ymm11[6,4]
; AVX-NEXT: vshufps {{.*#+}} xmm10 = xmm10[1,3,2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3,4,5,6,7]
-; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3,4],ymm3[5,6,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm9[3,0],ymm2[1,0],ymm9[7,4],ymm2[5,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,3],ymm9[2,0],ymm2[4,7],ymm9[6,4]
-; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
-; AVX-NEXT: # xmm9 = xmm9[0,1],mem[2,3]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
-; AVX-NEXT: # ymm10 = ymm10[3,1],mem[1,3],ymm10[7,5],mem[5,7]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm12[1,1],ymm10[2,0],ymm12[5,5],ymm10[6,4]
-; AVX-NEXT: vshufps {{.*#+}} xmm9 = xmm9[1,3,2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3,4,5,6,7]
-; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3,4],ymm2[5,6,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm8[3,0],ymm1[1,0],ymm8[7,4],ymm1[5,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,3],ymm8[2,0],ymm1[4,7],ymm8[6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3,4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1,2,3,4],ymm2[5,6,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm8[3,0],ymm13[1,0],ymm8[7,4],ymm13[5,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,3],ymm8[2,0],ymm13[4,7],ymm8[6,4]
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
; AVX-NEXT: # xmm8 = xmm8[0,1],mem[2,3]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
-; AVX-NEXT: # ymm9 = ymm9[3,1],mem[1,3],ymm9[7,5],mem[5,7]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm10[1,1],ymm9[2,0],ymm10[5,5],ymm9[6,4]
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX-NEXT: vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload
+; AVX-NEXT: # xmm10 = xmm10[3,0],mem[1,0]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm11[1,1],ymm10[2,0],ymm11[5,5],ymm10[6,4]
; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm8[1,3,2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3,4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm10[2,3,4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3,4],ymm1[5,6,7]
; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm6[3,0],ymm0[1,0],ymm6[7,4],ymm0[5,4]
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm6[2,0],ymm0[4,7],ymm6[6,4]
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
; AVX-NEXT: # xmm6 = xmm6[0,1],mem[2,3]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
-; AVX-NEXT: # ymm8 = ymm8[3,1],mem[1,3],ymm8[7,5],mem[5,7]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm9[1,1],ymm8[2,0],ymm9[5,5],ymm8[6,4]
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX-NEXT: vshufps $19, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
+; AVX-NEXT: # xmm8 = xmm8[3,0],mem[1,0]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm10[1,1],ymm8[2,0],ymm10[5,5],ymm8[6,4]
; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm6[1,3,2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3,4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5,6,7]
@@ -8972,8 +9038,8 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovaps %ymm4, 96(%rax)
; AVX-NEXT: vmovaps %ymm5, 64(%rax)
; AVX-NEXT: vmovaps %ymm7, 32(%rax)
-; AVX-NEXT: vmovaps %ymm11, (%rax)
-; AVX-NEXT: addq $2616, %rsp # imm = 0xA38
+; AVX-NEXT: vmovaps %ymm9, (%rax)
+; AVX-NEXT: addq $2472, %rsp # imm = 0x9A8
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll
index 7948141f6becd..16b50a5da694a 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll
@@ -173,9 +173,9 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FCP-NEXT: vmovsd {{.*#+}} xmm4 = [4,3,0,0]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vpermps %ymm7, %ymm4, %ymm4
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
-; AVX2-FCP-NEXT: vextractf128 $1, %ymm7, %xmm7
-; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3]
+; AVX2-FCP-NEXT: vmovaps {{.*#+}} xmm7 = [5,4,6,7]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7]
+; AVX2-FCP-NEXT: vpermps %ymm8, %ymm7, %ymm7
; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0
@@ -193,33 +193,30 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512: # %bb.0:
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vmovdqa 32(%rdi), %xmm1
-; AVX512-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2
-; AVX512-NEXT: vmovd %xmm1, %r11d
-; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX512-NEXT: vpinsrd $1, %r11d, %xmm3, %xmm3
-; AVX512-NEXT: vpbroadcastd 8(%rdi), %xmm4
-; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
-; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0]
-; AVX512-NEXT: vpermps (%rdi), %zmm1, %zmm1
-; AVX512-NEXT: vmovaps (%rdi), %ymm5
-; AVX512-NEXT: vmovaps 32(%rdi), %ymm6
-; AVX512-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
-; AVX512-NEXT: vextractf128 $1, %ymm7, %xmm7
-; AVX512-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3]
-; AVX512-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7]
-; AVX512-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7]
-; AVX512-NEXT: vextractf128 $1, %ymm5, %xmm5
-; AVX512-NEXT: vmovq %xmm2, (%rsi)
-; AVX512-NEXT: vmovq %xmm3, (%rdx)
-; AVX512-NEXT: vmovq %xmm4, (%rcx)
-; AVX512-NEXT: vmovq %xmm0, (%r8)
-; AVX512-NEXT: vmovlps %xmm1, (%r9)
-; AVX512-NEXT: vmovlps %xmm7, (%r10)
-; AVX512-NEXT: vmovlps %xmm5, (%rax)
+; AVX512-NEXT: vmovaps (%rdi), %zmm0
+; AVX512-NEXT: vmovdqa (%rdi), %xmm1
+; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512-NEXT: vpinsrd $1, 28(%rdi), %xmm1, %xmm3
+; AVX512-NEXT: vmovd %xmm2, %r11d
+; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,1,1]
+; AVX512-NEXT: vpinsrd $1, %r11d, %xmm4, %xmm4
+; AVX512-NEXT: vpbroadcastd 8(%rdi), %xmm5
+; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm2[1],xmm5[2,3]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3]
+; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,11,0,0]
+; AVX512-NEXT: vpermps %zmm0, %zmm2, %zmm2
+; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,12,14,15]
+; AVX512-NEXT: vpermps %zmm0, %zmm6, %zmm6
+; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm7 = [6,13,6,7]
+; AVX512-NEXT: vpermps %zmm0, %zmm7, %zmm0
+; AVX512-NEXT: vmovq %xmm3, (%rsi)
+; AVX512-NEXT: vmovq %xmm4, (%rdx)
+; AVX512-NEXT: vmovq %xmm5, (%rcx)
+; AVX512-NEXT: vmovq %xmm1, (%r8)
+; AVX512-NEXT: vmovlps %xmm2, (%r9)
+; AVX512-NEXT: vmovlps %xmm6, (%r10)
+; AVX512-NEXT: vmovlps %xmm0, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
@@ -257,33 +254,30 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm1
-; AVX512DQ-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2
-; AVX512DQ-NEXT: vmovd %xmm1, %r11d
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX512DQ-NEXT: vpinsrd $1, %r11d, %xmm3, %xmm3
-; AVX512DQ-NEXT: vpbroadcastd 8(%rdi), %xmm4
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0]
-; AVX512DQ-NEXT: vpermps (%rdi), %zmm1, %zmm1
-; AVX512DQ-NEXT: vmovaps (%rdi), %ymm5
-; AVX512DQ-NEXT: vmovaps 32(%rdi), %ymm6
-; AVX512DQ-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
-; AVX512DQ-NEXT: vextractf128 $1, %ymm7, %xmm7
-; AVX512DQ-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3]
-; AVX512DQ-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7]
-; AVX512DQ-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7]
-; AVX512DQ-NEXT: vextractf128 $1, %ymm5, %xmm5
-; AVX512DQ-NEXT: vmovq %xmm2, (%rsi)
-; AVX512DQ-NEXT: vmovq %xmm3, (%rdx)
-; AVX512DQ-NEXT: vmovq %xmm4, (%rcx)
-; AVX512DQ-NEXT: vmovq %xmm0, (%r8)
-; AVX512DQ-NEXT: vmovlps %xmm1, (%r9)
-; AVX512DQ-NEXT: vmovlps %xmm7, (%r10)
-; AVX512DQ-NEXT: vmovlps %xmm5, (%rax)
+; AVX512DQ-NEXT: vmovaps (%rdi), %zmm0
+; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1
+; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512DQ-NEXT: vpinsrd $1, 28(%rdi), %xmm1, %xmm3
+; AVX512DQ-NEXT: vmovd %xmm2, %r11d
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,1,1]
+; AVX512DQ-NEXT: vpinsrd $1, %r11d, %xmm4, %xmm4
+; AVX512DQ-NEXT: vpbroadcastd 8(%rdi), %xmm5
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm2[1],xmm5[2,3]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3]
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,11,0,0]
+; AVX512DQ-NEXT: vpermps %zmm0, %zmm2, %zmm2
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,12,14,15]
+; AVX512DQ-NEXT: vpermps %zmm0, %zmm6, %zmm6
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm7 = [6,13,6,7]
+; AVX512DQ-NEXT: vpermps %zmm0, %zmm7, %zmm0
+; AVX512DQ-NEXT: vmovq %xmm3, (%rsi)
+; AVX512DQ-NEXT: vmovq %xmm4, (%rdx)
+; AVX512DQ-NEXT: vmovq %xmm5, (%rcx)
+; AVX512DQ-NEXT: vmovq %xmm1, (%r8)
+; AVX512DQ-NEXT: vmovlps %xmm2, (%r9)
+; AVX512DQ-NEXT: vmovlps %xmm6, (%r10)
+; AVX512DQ-NEXT: vmovlps %xmm0, (%rax)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
@@ -321,33 +315,30 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm1
-; AVX512BW-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2
-; AVX512BW-NEXT: vmovd %xmm1, %r11d
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX512BW-NEXT: vpinsrd $1, %r11d, %xmm3, %xmm3
-; AVX512BW-NEXT: vpbroadcastd 8(%rdi), %xmm4
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
-; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0]
-; AVX512BW-NEXT: vpermps (%rdi), %zmm1, %zmm1
-; AVX512BW-NEXT: vmovaps (%rdi), %ymm5
-; AVX512BW-NEXT: vmovaps 32(%rdi), %ymm6
-; AVX512BW-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
-; AVX512BW-NEXT: vextractf128 $1, %ymm7, %xmm7
-; AVX512BW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3]
-; AVX512BW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7]
-; AVX512BW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7]
-; AVX512BW-NEXT: vextractf128 $1, %ymm5, %xmm5
-; AVX512BW-NEXT: vmovq %xmm2, (%rsi)
-; AVX512BW-NEXT: vmovq %xmm3, (%rdx)
-; AVX512BW-NEXT: vmovq %xmm4, (%rcx)
-; AVX512BW-NEXT: vmovq %xmm0, (%r8)
-; AVX512BW-NEXT: vmovlps %xmm1, (%r9)
-; AVX512BW-NEXT: vmovlps %xmm7, (%r10)
-; AVX512BW-NEXT: vmovlps %xmm5, (%rax)
+; AVX512BW-NEXT: vmovaps (%rdi), %zmm0
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm1
+; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512BW-NEXT: vpinsrd $1, 28(%rdi), %xmm1, %xmm3
+; AVX512BW-NEXT: vmovd %xmm2, %r11d
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,1,1]
+; AVX512BW-NEXT: vpinsrd $1, %r11d, %xmm4, %xmm4
+; AVX512BW-NEXT: vpbroadcastd 8(%rdi), %xmm5
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm2[1],xmm5[2,3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3]
+; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,11,0,0]
+; AVX512BW-NEXT: vpermps %zmm0, %zmm2, %zmm2
+; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,12,14,15]
+; AVX512BW-NEXT: vpermps %zmm0, %zmm6, %zmm6
+; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm7 = [6,13,6,7]
+; AVX512BW-NEXT: vpermps %zmm0, %zmm7, %zmm0
+; AVX512BW-NEXT: vmovq %xmm3, (%rsi)
+; AVX512BW-NEXT: vmovq %xmm4, (%rdx)
+; AVX512BW-NEXT: vmovq %xmm5, (%rcx)
+; AVX512BW-NEXT: vmovq %xmm1, (%r8)
+; AVX512BW-NEXT: vmovlps %xmm2, (%r9)
+; AVX512BW-NEXT: vmovlps %xmm6, (%r10)
+; AVX512BW-NEXT: vmovlps %xmm0, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
@@ -385,33 +376,30 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %xmm1
-; AVX512DQ-BW-NEXT: vpinsrd $1, 28(%rdi), %xmm0, %xmm2
-; AVX512DQ-BW-NEXT: vmovd %xmm1, %r11d
-; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
-; AVX512DQ-BW-NEXT: vpinsrd $1, %r11d, %xmm3, %xmm3
-; AVX512DQ-BW-NEXT: vpbroadcastd 8(%rdi), %xmm4
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
-; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,11,0,0]
-; AVX512DQ-BW-NEXT: vpermps (%rdi), %zmm1, %zmm1
-; AVX512DQ-BW-NEXT: vmovaps (%rdi), %ymm5
-; AVX512DQ-BW-NEXT: vmovaps 32(%rdi), %ymm6
-; AVX512DQ-BW-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7]
-; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm7, %xmm7
-; AVX512DQ-BW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0,2,3]
-; AVX512DQ-BW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7]
-; AVX512DQ-BW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7]
-; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm5, %xmm5
-; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi)
-; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rdx)
-; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rcx)
-; AVX512DQ-BW-NEXT: vmovq %xmm0, (%r8)
-; AVX512DQ-BW-NEXT: vmovlps %xmm1, (%r9)
-; AVX512DQ-BW-NEXT: vmovlps %xmm7, (%r10)
-; AVX512DQ-BW-NEXT: vmovlps %xmm5, (%rax)
+; AVX512DQ-BW-NEXT: vmovaps (%rdi), %zmm0
+; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm1
+; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512DQ-BW-NEXT: vpinsrd $1, 28(%rdi), %xmm1, %xmm3
+; AVX512DQ-BW-NEXT: vmovd %xmm2, %r11d
+; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,1,1]
+; AVX512DQ-BW-NEXT: vpinsrd $1, %r11d, %xmm4, %xmm4
+; AVX512DQ-BW-NEXT: vpbroadcastd 8(%rdi), %xmm5
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm2[1],xmm5[2,3]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
+; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3]
+; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [4,11,0,0]
+; AVX512DQ-BW-NEXT: vpermps %zmm0, %zmm2, %zmm2
+; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm6 = [5,12,14,15]
+; AVX512DQ-BW-NEXT: vpermps %zmm0, %zmm6, %zmm6
+; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm7 = [6,13,6,7]
+; AVX512DQ-BW-NEXT: vpermps %zmm0, %zmm7, %zmm0
+; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rsi)
+; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rdx)
+; AVX512DQ-BW-NEXT: vmovq %xmm5, (%rcx)
+; AVX512DQ-BW-NEXT: vmovq %xmm1, (%r8)
+; AVX512DQ-BW-NEXT: vmovlps %xmm2, (%r9)
+; AVX512DQ-BW-NEXT: vmovlps %xmm6, (%r10)
+; AVX512DQ-BW-NEXT: vmovlps %xmm0, (%rax)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
@@ -535,17 +523,18 @@ define void @load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX-NEXT: vmovaps 32(%rdi), %xmm4
; AVX-NEXT: vmovaps 64(%rdi), %xmm5
; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,3,2,3]
; AVX-NEXT: vmovaps 80(%rdi), %xmm6
-; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm6[1],xmm2[2,3]
-; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,3,2,1]
-; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm4[0],xmm3[1],xmm4[2,3]
-; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,0],mem[3,3]
-; AVX-NEXT: vinsertps {{.*#+}} xmm7 = xmm7[0,1,2],xmm6[2]
-; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm5[0,1,0,1]
-; AVX-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1,2],xmm6[3]
+; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm6[0,1,0,1]
+; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm7[3]
+; AVX-NEXT: vblendps {{.*#+}} xmm8 = xmm4[0],xmm3[1],xmm4[2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm8[1,0],mem[3,3]
+; AVX-NEXT: vinsertps {{.*#+}} xmm8 = xmm8[0,1,2],xmm6[2]
+; AVX-NEXT: vshufps {{.*#+}} xmm9 = xmm5[0,1,0,1]
+; AVX-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3]
; AVX-NEXT: vshufps {{.*#+}} xmm9 = xmm3[2,3,2,3]
; AVX-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0],xmm4[1],xmm9[2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3]
; AVX-NEXT: vmovaps 96(%rdi), %xmm9
; AVX-NEXT: vblendps {{.*#+}} xmm10 = xmm9[0],xmm5[1],xmm9[2,3]
; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3]
@@ -560,14 +549,13 @@ define void @load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm0[0,0],ymm1[1,0],ymm0[4,4],ymm1[5,4]
; AVX-NEXT: vextractf128 $1, %ymm10, %xmm10
; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm10[2,0],xmm5[3,2]
-; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,0,1]
-; AVX-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1,2],xmm9[3]
+; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1,2],xmm9[3]
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm1[2,0],ymm0[5,4],ymm1[6,4]
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm6[2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm7[2,3]
; AVX-NEXT: vmovaps %xmm2, (%rsi)
-; AVX-NEXT: vmovaps %xmm7, (%rdx)
-; AVX-NEXT: vmovaps %xmm8, (%rcx)
+; AVX-NEXT: vmovaps %xmm8, (%rdx)
+; AVX-NEXT: vmovaps %xmm6, (%rcx)
; AVX-NEXT: vmovaps %xmm3, (%r8)
; AVX-NEXT: vmovaps %xmm4, (%r9)
; AVX-NEXT: vmovaps %xmm5, (%r10)
@@ -1148,8 +1136,7 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX-LABEL: load_i32_stride7_vf8:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps 160(%rdi), %ymm4
-; AVX-NEXT: vmovaps 128(%rdi), %ymm7
-; AVX-NEXT: vmovaps 64(%rdi), %ymm10
+; AVX-NEXT: vmovaps 128(%rdi), %ymm8
; AVX-NEXT: vmovaps 32(%rdi), %ymm0
; AVX-NEXT: vmovaps (%rdi), %ymm1
; AVX-NEXT: vmovaps 96(%rdi), %ymm12
@@ -1157,86 +1144,87 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm12[0],ymm2[0],ymm12[2],ymm2[2]
; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7]
; AVX-NEXT: vextractf128 $1, %ymm5, %xmm5
-; AVX-NEXT: vmovaps (%rdi), %xmm14
-; AVX-NEXT: vmovaps 32(%rdi), %xmm9
-; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm14[0,1],xmm5[2,3]
+; AVX-NEXT: vmovaps (%rdi), %xmm13
+; AVX-NEXT: vmovaps 32(%rdi), %xmm10
+; AVX-NEXT: vmovaps 64(%rdi), %xmm7
+; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm13[0,1],xmm5[2,3]
; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,3,2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm5[0,1,2],ymm3[3,4,5,6,7]
; AVX-NEXT: vmovaps 160(%rdi), %xmm3
; AVX-NEXT: vmovaps 128(%rdi), %xmm5
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm5[1],xmm3[1]
+; AVX-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm5[1],xmm3[1]
; AVX-NEXT: vmovaps 192(%rdi), %xmm11
-; AVX-NEXT: vinsertps {{.*#+}} xmm8 = zero,xmm8[1,2],xmm11[1]
-; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8
-; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm8[5,6,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm12[1,1],ymm10[2,2],ymm12[5,5],ymm10[6,6]
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm8[2,3,2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm13 = xmm9[0],xmm14[1],xmm9[2,3]
-; AVX-NEXT: vshufps {{.*#+}} xmm13 = xmm13[1,0],mem[3,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1,2],ymm8[3,4,5,6,7]
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm7[2,3],ymm4[0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm7[0,0],ymm13[3,3],ymm7[4,4],ymm13[7,7]
-; AVX-NEXT: vextractf128 $1, %ymm13, %xmm13
-; AVX-NEXT: vinsertps {{.*#+}} xmm13 = zero,xmm13[1,2],xmm11[2]
-; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13
-; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm13[5,6,7]
-; AVX-NEXT: vshufps {{.*#+}} xmm13 = xmm14[2,3,2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0],xmm9[1],xmm13[2,3]
-; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm2[3,1],ymm10[0,3],ymm2[7,5],ymm10[4,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm12[2,1],ymm15[2,0],ymm12[6,5],ymm15[6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm15 = ymm13[0,1],ymm15[2,3,4,5,6,7]
-; AVX-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm7[0],ymm4[0],ymm7[2],ymm4[2]
-; AVX-NEXT: vextractf128 $1, %ymm13, %xmm13
-; AVX-NEXT: vblendps {{.*#+}} xmm11 = xmm13[0,1,2],xmm11[3]
-; AVX-NEXT: vmovaps 192(%rdi), %ymm13
-; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11
-; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3,4],ymm11[5,6,7]
-; AVX-NEXT: vmovaps 64(%rdi), %xmm15
-; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm10[1,0],ymm12[0,0],ymm10[5,4],ymm12[4,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm12[3,1],ymm10[0,2],ymm12[7,5],ymm10[4,6]
-; AVX-NEXT: vblendps {{.*#+}} xmm12 = xmm9[0,1,2],xmm14[3]
-; AVX-NEXT: vshufps {{.*#+}} xmm12 = xmm12[3,2,2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,3,4,5,6,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm13[0,1],ymm4[1,3],ymm13[4,5],ymm4[5,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm7[0,2],ymm12[2,0],ymm7[4,6],ymm12[6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5,6,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm13[1,0],ymm4[2,0],ymm13[5,4],ymm4[6,4]
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm7[2,3,0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm7[3,0],ymm14[0,0],ymm7[7,4],ymm14[4,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0],ymm12[2,0],ymm7[6,4],ymm12[6,4]
-; AVX-NEXT: vmovaps 96(%rdi), %xmm12
-; AVX-NEXT: vshufps {{.*#+}} xmm14 = xmm12[0,1,0,1]
-; AVX-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3]
-; AVX-NEXT: vshufps {{.*#+}} xmm9 = xmm9[2,3,2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm9 = mem[0],xmm9[1],mem[2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],xmm14[2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm13[2,1],ymm4[3,3],ymm13[6,5],ymm4[7,7]
-; AVX-NEXT: vblendps {{.*#+}} xmm9 = xmm3[0],xmm5[1],xmm3[2,3]
+; AVX-NEXT: vinsertps {{.*#+}} xmm9 = zero,xmm9[1,2],xmm11[1]
; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9
-; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm9[1,0],ymm4[2,0],ymm9[5,4],ymm4[6,4]
-; AVX-NEXT: vblendps {{.*#+}} xmm9 = xmm12[0,1,2],xmm15[3]
-; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm0[0,0],ymm1[1,0],ymm0[4,4],ymm1[5,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm9[5,6,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm12[1,1],mem[2,2],ymm12[5,5],mem[6,6]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3,2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm14 = xmm10[0],xmm13[1],xmm10[2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm14 = xmm14[1,0],mem[3,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm14[0,1,2],ymm9[3,4,5,6,7]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm8[2,3],ymm4[0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm8[0,0],ymm14[3,3],ymm8[4,4],ymm14[7,7]
; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14
-; AVX-NEXT: vshufps {{.*#+}} xmm9 = xmm14[2,0],xmm9[3,2]
-; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7]
+; AVX-NEXT: vinsertps {{.*#+}} xmm14 = zero,xmm14[1,2],xmm11[2]
+; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14
+; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm14[5,6,7]
+; AVX-NEXT: vshufps {{.*#+}} xmm14 = xmm13[2,3,2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm10[1],xmm14[2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm15 = xmm2[3,0],xmm7[0,0]
+; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm12[2,1],ymm15[2,0],ymm12[6,5],ymm15[6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3,4,5,6,7]
+; AVX-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm8[0],ymm4[0],ymm8[2],ymm4[2]
+; AVX-NEXT: vextractf128 $1, %ymm15, %xmm15
+; AVX-NEXT: vblendps {{.*#+}} xmm11 = xmm15[0,1,2],xmm11[3]
+; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11
+; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0,1,2,3,4],ymm11[5,6,7]
+; AVX-NEXT: vmovaps 64(%rdi), %xmm14
+; AVX-NEXT: vmovaps 96(%rdi), %xmm15
+; AVX-NEXT: vshufps {{.*#+}} xmm14 = xmm14[1,0],xmm15[0,0]
+; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm12[3,1],ymm14[0,2],ymm12[7,5],ymm14[4,6]
+; AVX-NEXT: vmovaps 192(%rdi), %ymm14
+; AVX-NEXT: vblendps {{.*#+}} xmm13 = xmm10[0,1,2],xmm13[3]
+; AVX-NEXT: vshufps {{.*#+}} xmm13 = xmm13[3,2,2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3,4,5,6,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm14[0,1],ymm4[1,3],ymm14[4,5],ymm4[5,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm8[0,2],ymm13[2,0],ymm8[4,6],ymm13[6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm13[5,6,7]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm8[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm8[3,0],ymm13[0,0],ymm8[7,4],ymm13[4,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm14[1,0],ymm4[2,0],ymm14[5,4],ymm4[6,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0],ymm13[2,0],ymm8[6,4],ymm13[6,4]
+; AVX-NEXT: vshufps {{.*#+}} xmm13 = xmm15[0,1,0,1]
+; AVX-NEXT: vblendps {{.*#+}} xmm13 = xmm7[0,1,2],xmm13[3]
+; AVX-NEXT: vshufps {{.*#+}} xmm10 = xmm10[2,3,2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm10 = mem[0],xmm10[1],mem[2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm13[2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm14[2,1],ymm4[3,3],ymm14[6,5],ymm4[7,7]
+; AVX-NEXT: vblendps {{.*#+}} xmm10 = xmm3[0],xmm5[1],xmm3[2,3]
+; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10
+; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm10[1,0],ymm4[2,0],ymm10[5,4],ymm4[6,4]
+; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm15[0,1,2],xmm7[3]
+; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm0[0,0],ymm1[1,0],ymm0[4,4],ymm1[5,4]
+; AVX-NEXT: vextractf128 $1, %ymm10, %xmm10
+; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm10[2,0],xmm7[3,2]
+; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7]
; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,0,1]
-; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm12[3]
+; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm15[3]
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm1[2,0],ymm0[5,4],ymm1[6,4]
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3]
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm13[2,3,0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm13[3,0],ymm1[0,0],ymm13[7,4],ymm1[4,4]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm14[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm14[3,0],ymm1[0,0],ymm14[7,4],ymm1[4,4]
; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm5[2,3,2,3]
; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3]
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,0],ymm2[4,5],ymm1[6,4]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX-NEXT: vmovaps %ymm6, (%rsi)
-; AVX-NEXT: vmovaps %ymm8, (%rdx)
+; AVX-NEXT: vmovaps %ymm9, (%rdx)
; AVX-NEXT: vmovaps %ymm11, (%rcx)
-; AVX-NEXT: vmovaps %ymm10, (%r8)
-; AVX-NEXT: vmovaps %ymm7, (%r9)
+; AVX-NEXT: vmovaps %ymm12, (%r8)
+; AVX-NEXT: vmovaps %ymm8, (%r9)
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT: vmovaps %ymm4, (%rax)
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
@@ -2326,234 +2314,233 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-LABEL: load_i32_stride7_vf16:
; AVX: # %bb.0:
; AVX-NEXT: subq $456, %rsp # imm = 0x1C8
-; AVX-NEXT: vmovaps 32(%rdi), %ymm4
+; AVX-NEXT: vmovaps 256(%rdi), %ymm4
; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps (%rdi), %ymm6
-; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 96(%rdi), %ymm15
-; AVX-NEXT: vmovaps 256(%rdi), %ymm2
+; AVX-NEXT: vmovaps 224(%rdi), %ymm5
+; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovaps 320(%rdi), %ymm15
+; AVX-NEXT: vmovaps 32(%rdi), %ymm2
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 224(%rdi), %ymm1
+; AVX-NEXT: vmovaps (%rdi), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 320(%rdi), %ymm5
-; AVX-NEXT: vmovaps 304(%rdi), %xmm0
+; AVX-NEXT: vmovaps 96(%rdi), %ymm10
+; AVX-NEXT: vmovaps 80(%rdi), %xmm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm0[0],ymm5[2],ymm0[2]
+; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],ymm0[0],ymm10[2],ymm0[2]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7]
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX-NEXT: vmovaps 224(%rdi), %xmm13
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3]
+; AVX-NEXT: vmovaps (%rdi), %xmm9
+; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3]
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
-; AVX-NEXT: vmovaps 384(%rdi), %xmm2
+; AVX-NEXT: vmovaps 160(%rdi), %xmm2
; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovaps 352(%rdi), %xmm1
+; AVX-NEXT: vmovaps 128(%rdi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
-; AVX-NEXT: vmovaps 416(%rdi), %xmm12
-; AVX-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm12[1]
+; AVX-NEXT: vmovaps 192(%rdi), %xmm3
+; AVX-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm3[1]
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 80(%rdi), %xmm0
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm0[0],ymm15[2],ymm0[2]
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm4[6],ymm6[7]
+; AVX-NEXT: vmovaps 304(%rdi), %xmm13
+; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],ymm13[0],ymm15[2],ymm13[2]
+; AVX-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7]
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX-NEXT: vmovaps (%rdi), %xmm9
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3]
+; AVX-NEXT: vmovaps 224(%rdi), %xmm7
+; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3]
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
-; AVX-NEXT: vmovaps 160(%rdi), %xmm2
+; AVX-NEXT: vmovaps 384(%rdi), %xmm2
; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovaps 128(%rdi), %xmm1
+; AVX-NEXT: vmovaps 352(%rdi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
-; AVX-NEXT: vmovaps 192(%rdi), %xmm8
-; AVX-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm8[1]
+; AVX-NEXT: vmovaps 416(%rdi), %xmm2
+; AVX-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm2[1]
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 288(%rdi), %ymm6
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm5[1,1],ymm6[2,2],ymm5[5,5],ymm6[6,6]
+; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm10[1,1],mem[2,2],ymm10[5,5],mem[6,6]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX-NEXT: vmovaps 256(%rdi), %xmm11
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0],xmm13[1],xmm11[2,3]
+; AVX-NEXT: vmovaps 32(%rdi), %xmm8
+; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0],xmm9[1],xmm8[2,3]
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
-; AVX-NEXT: vmovaps 384(%rdi), %ymm7
-; AVX-NEXT: vmovaps 352(%rdi), %ymm1
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm7[0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm1[0,0],ymm3[3,3],ymm1[4,4],ymm3[7,7]
-; AVX-NEXT: vextractf128 $1, %ymm3, %xmm3
-; AVX-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm3[1,2],xmm12[2]
-; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
+; AVX-NEXT: vmovaps 160(%rdi), %ymm5
+; AVX-NEXT: vmovaps 128(%rdi), %ymm1
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3],ymm5[0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm1[0,0],ymm4[3,3],ymm1[4,4],ymm4[7,7]
+; AVX-NEXT: vextractf128 $1, %ymm4, %xmm4
+; AVX-NEXT: vinsertps {{.*#+}} xmm4 = zero,xmm4[1,2],xmm3[2]
+; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 64(%rdi), %ymm3
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,1],ymm3[2,2],ymm15[5,5],ymm3[6,6]
+; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,1],mem[2,2],ymm15[5,5],mem[6,6]
; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX-NEXT: vmovaps 32(%rdi), %xmm10
-; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm10[0],xmm9[1],xmm10[2,3]
+; AVX-NEXT: vmovaps 256(%rdi), %xmm6
+; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0],xmm7[1],xmm6[2,3]
; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,0],mem[3,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm0[3,4,5,6,7]
-; AVX-NEXT: vmovaps 160(%rdi), %ymm4
-; AVX-NEXT: vmovaps 128(%rdi), %ymm0
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm0[2,3],ymm4[0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm0[0,0],ymm14[3,3],ymm0[4,4],ymm14[7,7]
-; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14
-; AVX-NEXT: vinsertps {{.*#+}} xmm14 = zero,xmm14[1,2],xmm8[2]
-; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14
-; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7]
-; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm13[2,3,2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm11[1],xmm2[2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm4[0,1,2],ymm0[3,4,5,6,7]
+; AVX-NEXT: vmovaps 384(%rdi), %ymm4
+; AVX-NEXT: vmovaps 352(%rdi), %ymm0
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm0[2,3],ymm4[0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm0[0,0],ymm12[3,3],ymm0[4,4],ymm12[7,7]
+; AVX-NEXT: vextractf128 $1, %ymm12, %xmm12
+; AVX-NEXT: vinsertps {{.*#+}} xmm12 = zero,xmm12[1,2],xmm2[2]
+; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12
+; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm12[5,6,7]
+; AVX-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} xmm11 = xmm9[2,3,2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0],xmm8[1],xmm11[2,3]
+; AVX-NEXT: vmovaps 64(%rdi), %xmm12
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[3,1],ymm6[0,3],ymm14[7,5],ymm6[4,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm5[2,1],ymm14[2,0],ymm5[6,5],ymm14[6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm14[2,3,4,5,6,7]
-; AVX-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm1[0],ymm7[0],ymm1[2],ymm7[2]
+; AVX-NEXT: vshufps {{.*#+}} xmm14 = xmm14[3,0],xmm12[0,0]
+; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm10[2,1],ymm14[2,0],ymm10[6,5],ymm14[6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm14[2,3,4,5,6,7]
+; AVX-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm1[0],ymm5[0],ymm1[2],ymm5[2]
; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14
-; AVX-NEXT: vblendps {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3]
-; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12
-; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm12[5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm14[0,1,2],xmm3[3]
+; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
+; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1,2,3,4],ymm3[5,6,7]
+; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm7[2,3,2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3]
+; AVX-NEXT: vmovaps 288(%rdi), %xmm11
+; AVX-NEXT: vshufps {{.*#+}} xmm14 = xmm13[3,0],xmm11[0,0]
+; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm15[2,1],ymm14[2,0],ymm15[6,5],ymm14[6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm14[2,3,4,5,6,7]
+; AVX-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm0[0],ymm4[0],ymm0[2],ymm4[2]
+; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14
+; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3]
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7]
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm9[2,3,2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm10[1],xmm2[2,3]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm12[3,1],ymm3[0,3],ymm12[7,5],ymm3[4,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm15[2,1],ymm12[2,0],ymm15[6,5],ymm12[6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm12[2,3,4,5,6,7]
-; AVX-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm4[0],ymm0[2],ymm4[2]
-; AVX-NEXT: vextractf128 $1, %ymm12, %xmm12
-; AVX-NEXT: vblendps {{.*#+}} xmm8 = xmm12[0,1,2],xmm8[3]
-; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8
-; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm8[5,6,7]
+; AVX-NEXT: vmovaps 64(%rdi), %xmm2
+; AVX-NEXT: vmovaps 96(%rdi), %xmm3
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0],xmm3[0,0]
+; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm10[3,1],ymm2[0,2],ymm10[7,5],ymm2[4,6]
+; AVX-NEXT: vblendps {{.*#+}} xmm9 = xmm8[0,1,2],xmm9[3]
+; AVX-NEXT: vshufps {{.*#+}} xmm9 = xmm9[3,2,2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1],ymm2[2,3,4,5,6,7]
+; AVX-NEXT: vmovaps 192(%rdi), %ymm9
+; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm9[0,1],ymm5[1,3],ymm9[4,5],ymm5[5,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm1[0,2],ymm10[2,0],ymm1[4,6],ymm10[6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm10[5,6,7]
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm6[1,0],ymm5[0,0],ymm6[5,4],ymm5[4,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm5[3,1],ymm2[0,2],ymm5[7,5],ymm2[4,6]
-; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm11[0,1,2],xmm13[3]
-; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm2[2,3,4,5,6,7]
-; AVX-NEXT: vmovaps 416(%rdi), %ymm2
-; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm2[0,1],ymm7[1,3],ymm2[4,5],ymm7[5,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm1[0,2],ymm6[2,0],ymm1[4,6],ymm6[6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7]
-; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,0],ymm15[0,0],ymm3[5,4],ymm15[4,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm15[3,1],ymm3[0,2],ymm15[7,5],ymm3[4,6]
-; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm10[0,1,2],xmm9[3]
-; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5,6,7]
-; AVX-NEXT: vmovaps 192(%rdi), %ymm6
-; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm6[0,1],ymm4[1,3],ymm6[4,5],ymm4[5,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm0[0,2],ymm5[2,0],ymm0[4,6],ymm5[6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5,6,7]
-; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3,0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm3[0,0],ymm1[7,4],ymm3[4,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm2[1,0],ymm7[2,0],ymm2[5,4],ymm7[6,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm3[2,0],ymm1[6,4],ymm3[6,4]
-; AVX-NEXT: vmovaps 320(%rdi), %xmm5
-; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm5[0,1,0,1]
-; AVX-NEXT: vmovaps 288(%rdi), %xmm8
-; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm8[0,1,2],xmm3[3]
-; AVX-NEXT: vshufps {{.*#+}} xmm9 = xmm11[2,3,2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm9 = mem[0],xmm9[1],mem[2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm9[0,1],xmm3[2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
-; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm3[0,0],ymm0[7,4],ymm3[4,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm6[1,0],ymm4[2,0],ymm6[5,4],ymm4[6,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm3[2,0],ymm0[6,4],ymm3[6,4]
-; AVX-NEXT: vmovaps 64(%rdi), %xmm11
-; AVX-NEXT: vmovaps 96(%rdi), %xmm9
-; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm9[0,1,0,1]
-; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm11[0,1,2],xmm3[3]
-; AVX-NEXT: vshufps {{.*#+}} xmm10 = xmm10[2,3,2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm10 = mem[0],xmm10[1],mem[2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm10[0,1],xmm3[2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
-; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm2[2,1],ymm7[3,3],ymm2[6,5],ymm7[7,7]
-; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; AVX-NEXT: vmovaps 288(%rdi), %xmm10
+; AVX-NEXT: vmovaps 320(%rdi), %xmm2
+; AVX-NEXT: vshufps {{.*#+}} xmm10 = xmm10[1,0],xmm2[0,0]
+; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm15[3,1],ymm10[0,2],ymm15[7,5],ymm10[4,6]
+; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm6[0,1,2],xmm7[3]
+; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm7[3,2,2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm10[2,3,4,5,6,7]
+; AVX-NEXT: vmovaps 416(%rdi), %ymm15
+; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm15[0,1],ymm4[1,3],ymm15[4,5],ymm4[5,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm0[0,2],ymm10[2,0],ymm0[4,6],ymm10[6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm10[5,6,7]
+; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm1[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm10[0,0],ymm1[7,4],ymm10[4,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm9[1,0],ymm5[2,0],ymm9[5,4],ymm5[6,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm10[2,0],ymm1[6,4],ymm10[6,4]
+; AVX-NEXT: vshufps {{.*#+}} xmm10 = xmm3[0,1,0,1]
+; AVX-NEXT: vblendps {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3]
+; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm8[2,3,2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm8 = mem[0],xmm8[1],mem[2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7]
+; AVX-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm0[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm8[0,0],ymm0[7,4],ymm8[4,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm15[1,0],ymm4[2,0],ymm15[5,4],ymm4[6,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm8[2,0],ymm0[6,4],ymm8[6,4]
+; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm2[0,1,0,1]
+; AVX-NEXT: vblendps {{.*#+}} xmm8 = xmm11[0,1,2],xmm8[3]
+; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm6[2,3,2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm6 = mem[0],xmm6[1],mem[2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm9[2,1],ymm5[3,3],ymm9[6,5],ymm5[7,7]
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm14[0],xmm0[1],xmm14[2,3]
-; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7
-; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm7[1,0],ymm3[2,0],ymm7[5,4],ymm3[6,4]
-; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm5[0,1,2],xmm8[3]
+; AVX-NEXT: vblendps {{.*#+}} xmm6 = xmm1[0],xmm0[1],xmm1[2,3]
+; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
+; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm6[1,0],ymm5[2,0],ymm6[5,4],ymm5[6,4]
+; AVX-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1,2],xmm12[3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm13[0,0],ymm12[1,0],ymm13[4,4],ymm12[5,4]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm14[0,0],ymm12[1,0],ymm14[4,4],ymm12[5,4]
; AVX-NEXT: vextractf128 $1, %ymm8, %xmm8
-; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm8[2,0],xmm7[3,2]
-; AVX-NEXT: vblendps {{.*#+}} ymm15 = ymm7[0,1,2,3],ymm3[4,5,6,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm6[2,1],ymm4[3,3],ymm6[6,5],ymm4[7,7]
-; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm3[0],xmm1[1],xmm3[2,3]
-; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7
-; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm7[1,0],ymm4[2,0],ymm7[5,4],ymm4[6,4]
-; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm9[0,1,2],xmm11[3]
+; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm8[2,0],xmm6[3,2]
+; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm15[2,1],ymm4[3,3],ymm15[6,5],ymm4[7,7]
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0],xmm5[1],xmm7[2,3]
+; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
+; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm6[1,0],ymm4[2,0],ymm6[5,4],ymm4[6,4]
+; AVX-NEXT: vblendps {{.*#+}} xmm6 = xmm2[0,1,2],xmm11[3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm10[0,0],ymm11[1,0],ymm10[4,4],ymm11[5,4]
; AVX-NEXT: vextractf128 $1, %ymm8, %xmm8
-; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm8[2,0],xmm7[3,2]
-; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7]
-; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
-; AVX-NEXT: # xmm7 = mem[0,1,0,1]
-; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3]
-; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm13[1,0],ymm12[2,0],ymm13[5,4],ymm12[6,4]
-; AVX-NEXT: vextractf128 $1, %ymm7, %xmm7
-; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm7[2,0],xmm5[2,3]
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm2[2,3,0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,0],ymm7[0,0],ymm2[7,4],ymm7[4,4]
-; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm0[2,3,2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm14[1],xmm7[2,3]
-; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7
-; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,0],ymm7[4,5],ymm2[6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
-; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
-; AVX-NEXT: # xmm5 = mem[0,1,0,1]
-; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm9[3]
-; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm10[1,0],ymm11[2,0],ymm10[5,4],ymm11[6,4]
-; AVX-NEXT: vextractf128 $1, %ymm7, %xmm7
-; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm7[2,0],xmm5[2,3]
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm6[2,3,0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm6[3,0],ymm7[0,0],ymm6[7,4],ymm7[4,4]
-; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm1[2,3,2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm3[1],xmm7[2,3]
-; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7
-; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,0],ymm7[4,5],ymm6[6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
+; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm8[2,0],xmm6[3,2]
+; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7]
+; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
+; AVX-NEXT: # xmm6 = mem[0,1,0,1]
+; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3]
+; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm14[1,0],ymm12[2,0],ymm14[5,4],ymm12[6,4]
+; AVX-NEXT: vextractf128 $1, %ymm6, %xmm6
+; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm6[2,0],xmm3[2,3]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm9[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm9[3,0],ymm6[0,0],ymm9[7,4],ymm6[4,4]
+; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm0[2,3,2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm1[1],xmm8[2,3]
+; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8
+; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,0],ymm8[4,5],ymm6[6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7]
+; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
+; AVX-NEXT: # xmm6 = mem[0,1,0,1]
+; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm6[0,1,2],xmm2[3]
+; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm10[1,0],ymm11[2,0],ymm10[5,4],ymm11[6,4]
+; AVX-NEXT: vextractf128 $1, %ymm6, %xmm6
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm6[2,0],xmm2[2,3]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm15[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm15[3,0],ymm6[0,0],ymm15[7,4],ymm6[4,4]
+; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm5[2,3,2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm7[1],xmm8[2,3]
+; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8
+; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,0],ymm8[4,5],ymm6[6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm0, (%rsi)
+; AVX-NEXT: vmovaps %ymm0, 32(%rsi)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm6, 32(%rsi)
+; AVX-NEXT: vmovaps %ymm6, (%rsi)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm0, 32(%rdx)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm0, (%rdx)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm0, 32(%rdx)
+; AVX-NEXT: vmovaps %ymm0, 32(%rcx)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm0, (%rcx)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm0, 32(%rcx)
+; AVX-NEXT: vmovaps %ymm0, 32(%r8)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm0, (%r8)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm0, 32(%r8)
+; AVX-NEXT: vmovaps %ymm0, 32(%r9)
; AVX-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm0, (%r9)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm0, 32(%r9)
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: vmovaps %ymm4, (%rax)
-; AVX-NEXT: vmovaps %ymm15, 32(%rax)
+; AVX-NEXT: vmovaps %ymm4, 32(%rax)
+; AVX-NEXT: vmovaps %ymm13, (%rax)
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: vmovaps %ymm5, (%rax)
; AVX-NEXT: vmovaps %ymm2, 32(%rax)
+; AVX-NEXT: vmovaps %ymm3, (%rax)
; AVX-NEXT: addq $456, %rsp # imm = 0x1C8
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
@@ -4881,560 +4868,561 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
;
; AVX-LABEL: load_i32_stride7_vf32:
; AVX: # %bb.0:
-; AVX-NEXT: subq $1432, %rsp # imm = 0x598
+; AVX-NEXT: subq $1512, %rsp # imm = 0x5E8
; AVX-NEXT: vmovaps 480(%rdi), %ymm4
; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 448(%rdi), %ymm3
; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 544(%rdi), %ymm5
+; AVX-NEXT: vmovaps 544(%rdi), %ymm6
; AVX-NEXT: vmovaps 32(%rdi), %ymm2
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps (%rdi), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 96(%rdi), %ymm12
+; AVX-NEXT: vmovaps 96(%rdi), %ymm11
; AVX-NEXT: vmovaps 80(%rdi), %xmm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm0[0],ymm12[2],ymm0[2]
-; AVX-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2]
+; AVX-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7]
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX-NEXT: vmovaps (%rdi), %xmm8
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3]
-; AVX-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovaps (%rdi), %xmm13
+; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3]
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
-; AVX-NEXT: vmovaps 160(%rdi), %xmm2
+; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3,4,5,6,7]
+; AVX-NEXT: vmovaps 160(%rdi), %xmm0
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovaps 128(%rdi), %xmm2
+; AVX-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill
+; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
+; AVX-NEXT: vmovaps 192(%rdi), %xmm12
+; AVX-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1,2],xmm12[1]
+; AVX-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
+; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
+; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovaps 528(%rdi), %xmm1
+; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],ymm1[0],ymm6[2],ymm1[2]
+; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm4[6],ymm3[7]
+; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX-NEXT: vmovaps 448(%rdi), %xmm8
+; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm8[0,1],xmm2[2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,3,2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
+; AVX-NEXT: vmovaps 608(%rdi), %xmm3
+; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovaps 576(%rdi), %xmm2
; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovaps 128(%rdi), %xmm1
-; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
-; AVX-NEXT: vmovaps 192(%rdi), %xmm7
-; AVX-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm7[1]
-; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 528(%rdi), %xmm0
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm0[0],ymm5[2],ymm0[2]
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm4[6],ymm3[7]
+; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
+; AVX-NEXT: vmovaps 640(%rdi), %xmm5
+; AVX-NEXT: vinsertps {{.*#+}} xmm2 = zero,xmm2[1,2],xmm5[1]
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
+; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
+; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovaps 256(%rdi), %ymm2
+; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovaps 224(%rdi), %ymm1
+; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7]
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX-NEXT: vmovaps 448(%rdi), %xmm10
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3]
+; AVX-NEXT: vmovaps 224(%rdi), %xmm14
+; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3]
+; AVX-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
-; AVX-NEXT: vmovaps 608(%rdi), %xmm2
+; AVX-NEXT: vmovaps 320(%rdi), %ymm10
+; AVX-NEXT: vmovaps 304(%rdi), %xmm2
+; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm10[0],ymm2[0],ymm10[2],ymm2[2]
+; AVX-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
+; AVX-NEXT: vmovaps 384(%rdi), %xmm2
; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovaps 576(%rdi), %xmm1
-; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
-; AVX-NEXT: vmovaps 640(%rdi), %xmm9
-; AVX-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm9[1]
+; AVX-NEXT: vmovaps 352(%rdi), %xmm3
+; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm2[1]
+; AVX-NEXT: vmovaps 416(%rdi), %xmm9
+; AVX-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm3[1,2],xmm9[1]
; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 256(%rdi), %ymm1
+; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
+; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7]
+; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovaps 704(%rdi), %ymm3
+; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovaps 672(%rdi), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 224(%rdi), %ymm0
+; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6],ymm1[7]
+; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX-NEXT: vmovaps 672(%rdi), %xmm0
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3]
+; AVX-NEXT: vmovaps 768(%rdi), %ymm15
+; AVX-NEXT: vmovaps 752(%rdi), %xmm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7]
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT: vmovaps 224(%rdi), %xmm11
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm11[0,1],xmm0[2,3]
+; AVX-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm15[0],ymm0[0],ymm15[2],ymm0[2]
+; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7]
+; AVX-NEXT: vmovaps 832(%rdi), %xmm4
+; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovaps 800(%rdi), %xmm3
+; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm4[1]
+; AVX-NEXT: vmovaps 864(%rdi), %xmm2
+; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm3[1,2],xmm2[1]
+; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
+; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7]
+; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm11[1,1],mem[2,2],ymm11[5,5],mem[6,6]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3]
+; AVX-NEXT: vmovaps 32(%rdi), %xmm11
+; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm11[0],xmm13[1],xmm11[2,3]
+; AVX-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3]
-; AVX-NEXT: vmovaps 320(%rdi), %ymm4
-; AVX-NEXT: vmovaps 304(%rdi), %xmm1
+; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,0],mem[3,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7]
+; AVX-NEXT: vmovaps 160(%rdi), %ymm3
+; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovaps 128(%rdi), %ymm2
+; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm2[2,3],ymm3[0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm2[0,0],ymm3[3,3],ymm2[4,4],ymm3[7,7]
+; AVX-NEXT: vextractf128 $1, %ymm3, %xmm3
+; AVX-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm3[1,2],xmm12[2]
+; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
+; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7]
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[2],ymm1[2]
-; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX-NEXT: vmovaps 384(%rdi), %xmm1
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm6[1,1],mem[2,2],ymm6[5,5],mem[6,6]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3]
+; AVX-NEXT: vmovaps 480(%rdi), %xmm7
+; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm7[0],xmm8[1],xmm7[2,3]
+; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovaps %xmm8, %xmm2
+; AVX-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,0],mem[3,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm1[3,4,5,6,7]
+; AVX-NEXT: vmovaps 608(%rdi), %ymm12
+; AVX-NEXT: vmovaps 576(%rdi), %ymm6
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm6[2,3],ymm12[0,1]
+; AVX-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm6[0,0],ymm4[3,3],ymm6[4,4],ymm4[7,7]
+; AVX-NEXT: vextractf128 $1, %ymm4, %xmm4
+; AVX-NEXT: vinsertps {{.*#+}} xmm4 = zero,xmm4[1,2],xmm5[2]
+; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
+; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7]
+; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm10[1,1],mem[2,2],ymm10[5,5],mem[6,6]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,2,3]
+; AVX-NEXT: vmovaps 256(%rdi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovaps 352(%rdi), %xmm2
-; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1]
-; AVX-NEXT: vmovaps 416(%rdi), %xmm3
-; AVX-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm3[1]
-; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 704(%rdi), %ymm1
+; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0],xmm14[1],xmm1[2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,0],mem[3,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7]
+; AVX-NEXT: vmovaps 384(%rdi), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 672(%rdi), %ymm0
+; AVX-NEXT: vmovaps 352(%rdi), %ymm4
+; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm4[2,3],ymm1[0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm4[0,0],ymm8[3,3],ymm4[4,4],ymm8[7,7]
+; AVX-NEXT: vextractf128 $1, %ymm8, %xmm8
+; AVX-NEXT: vinsertps {{.*#+}} xmm8 = zero,xmm8[1,2],xmm9[2]
+; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8
+; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm8[5,6,7]
+; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm15[1,1],mem[2,2],ymm15[5,5],mem[6,6]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,2,3]
+; AVX-NEXT: vmovaps 704(%rdi), %xmm1
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; AVX-NEXT: vblendps {{.*#+}} xmm8 = xmm1[0],xmm14[1],xmm1[2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm8[1,0],mem[3,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm3[3,4,5,6,7]
+; AVX-NEXT: vmovaps 832(%rdi), %ymm10
+; AVX-NEXT: vmovaps 800(%rdi), %ymm3
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm3[2,3],ymm10[0,1]
+; AVX-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm3[0,0],ymm9[3,3],ymm3[4,4],ymm9[7,7]
+; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vextractf128 $1, %ymm9, %xmm9
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX-NEXT: vinsertps {{.*#+}} xmm9 = zero,xmm9[1,2],xmm4[2]
+; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9
+; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5,6,7]
+; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm13[2,3,2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm11[1],xmm8[2,3]
+; AVX-NEXT: vmovaps 64(%rdi), %xmm0
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} xmm9 = xmm9[3,0],xmm0[0,0]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm0[2,1],ymm9[2,0],ymm0[6,5],ymm9[6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3,4,5,6,7]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm11[0],ymm13[0],ymm11[2],ymm13[2]
+; AVX-NEXT: vextractf128 $1, %ymm9, %xmm9
+; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm9[0,1,2],mem[3]
+; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm0[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7]
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT: vmovaps 672(%rdi), %xmm1
-; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3]
-; AVX-NEXT: vmovaps 768(%rdi), %ymm14
-; AVX-NEXT: vmovaps 752(%rdi), %xmm1
-; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm14[0],ymm1[0],ymm14[2],ymm1[2]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX-NEXT: vmovaps 832(%rdi), %xmm2
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3]
+; AVX-NEXT: vmovaps 512(%rdi), %xmm2
; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovaps 800(%rdi), %xmm1
-; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
-; AVX-NEXT: vmovaps 864(%rdi), %xmm6
-; AVX-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm6[1]
-; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 64(%rdi), %ymm0
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm8[3,0],xmm2[0,0]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm2[2,1],ymm8[2,0],ymm2[6,5],ymm8[6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3,4,5,6,7]
+; AVX-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm6[0],ymm12[0],ymm6[2],ymm12[2]
+; AVX-NEXT: vextractf128 $1, %ymm8, %xmm8
+; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3]
+; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm12[1,1],ymm0[2,2],ymm12[5,5],ymm0[6,6]
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX-NEXT: vmovaps 32(%rdi), %xmm1
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm14[2,3,2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX-NEXT: vmovaps %xmm1, %xmm14
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm8[1],xmm1[2,3]
-; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
-; AVX-NEXT: vmovaps 160(%rdi), %ymm1
-; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 128(%rdi), %ymm15
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm15[2,3],ymm1[0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm15[0,0],ymm1[3,3],ymm15[4,4],ymm1[7,7]
-; AVX-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm7[2]
-; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 512(%rdi), %ymm0
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm5[1,1],ymm0[2,2],ymm5[5,5],ymm0[6,6]
-; AVX-NEXT: vmovaps %ymm5, %ymm7
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX-NEXT: vmovaps 480(%rdi), %xmm1
+; AVX-NEXT: vmovaps 736(%rdi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0],xmm10[1],xmm1[2,3]
-; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0],mem[3,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm0[3,4,5,6,7]
-; AVX-NEXT: vmovaps 608(%rdi), %ymm0
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 576(%rdi), %ymm12
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm12[2,3],ymm0[0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm12[0,0],ymm5[3,3],ymm12[4,4],ymm5[7,7]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,0],xmm1[0,0]
+; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm15[2,1],ymm5[2,0],ymm15[6,5],ymm5[6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3,4,5,6,7]
+; AVX-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm3[0],ymm10[0],ymm3[2],ymm10[2]
; AVX-NEXT: vextractf128 $1, %ymm5, %xmm5
-; AVX-NEXT: vinsertps {{.*#+}} xmm5 = zero,xmm5[1,2],xmm9[2]
+; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm4[3]
; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
-; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7]
-; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 288(%rdi), %ymm0
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm4[1,1],ymm0[2,2],ymm4[5,5],ymm0[6,6]
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3]
-; AVX-NEXT: vmovaps 256(%rdi), %xmm0
-; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm0[0],xmm11[1],xmm0[2,3]
-; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,0],mem[3,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3,4,5,6,7]
-; AVX-NEXT: vmovaps 384(%rdi), %ymm1
-; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 352(%rdi), %ymm0
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm0[0,0],ymm5[3,3],ymm0[4,4],ymm5[7,7]
-; AVX-NEXT: vextractf128 $1, %ymm5, %xmm5
-; AVX-NEXT: vinsertps {{.*#+}} xmm5 = zero,xmm5[1,2],xmm3[2]
-; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
-; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7]
-; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 736(%rdi), %ymm5
-; AVX-NEXT: vmovaps %ymm14, %ymm3
-; AVX-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm14[1,1],ymm5[2,2],ymm14[5,5],ymm5[6,6]
-; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3]
-; AVX-NEXT: vmovaps 704(%rdi), %xmm4
-; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT: vblendps {{.*#+}} xmm11 = xmm4[0],xmm1[1],xmm4[2,3]
-; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} xmm11 = xmm11[1,0],mem[3,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm2[3,4,5,6,7]
-; AVX-NEXT: vmovaps 832(%rdi), %ymm13
-; AVX-NEXT: vmovaps 800(%rdi), %ymm2
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm2[2,3],ymm13[0,1]
-; AVX-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm2[0,0],ymm14[3,3],ymm2[4,4],ymm14[7,7]
-; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14
-; AVX-NEXT: vinsertps {{.*#+}} xmm14 = zero,xmm14[1,2],xmm6[2]
-; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14
-; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5,6,7]
-; AVX-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
-; AVX-NEXT: # xmm11 = mem[2,3,2,3]
-; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0],xmm0[1],xmm11[2,3]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[3,1],ymm9[0,3],ymm14[7,5],ymm9[4,7]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm8[2,1],ymm14[2,0],ymm8[6,5],ymm14[6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm14[2,3,4,5,6,7]
-; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm14 # 32-byte Folded Reload
-; AVX-NEXT: # ymm14 = ymm15[0],mem[0],ymm15[2],mem[2]
-; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14
-; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
-; AVX-NEXT: # xmm14 = xmm14[0,1,2],mem[3]
-; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14
-; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5,6,7]
-; AVX-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} xmm11 = xmm10[2,3,2,3]
-; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; AVX-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0],xmm15[1],xmm11[2,3]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[3,1],ymm10[0,3],ymm14[7,5],ymm10[4,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm7[2,1],ymm14[2,0],ymm7[6,5],ymm14[6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm14[2,3,4,5,6,7]
-; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm14 # 32-byte Folded Reload
-; AVX-NEXT: # ymm14 = ymm12[0],mem[0],ymm12[2],mem[2]
-; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14
-; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
-; AVX-NEXT: # xmm14 = xmm14[0,1,2],mem[3]
-; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14
-; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5,6,7]
-; AVX-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} xmm11 = xmm1[2,3,2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0],xmm4[1],xmm11[2,3]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[3,1],ymm5[0,3],ymm14[7,5],ymm5[4,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm3[2,1],ymm14[2,0],ymm3[6,5],ymm14[6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm14[2,3,4,5,6,7]
-; AVX-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm2[0],ymm13[0],ymm2[2],ymm13[2]
-; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14
-; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm14[0,1,2],xmm6[3]
-; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
-; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1,2,3,4],ymm4[5,6,7]
-; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm6[2,3,2,3]
-; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm11[3,1],ymm3[0,3],ymm11[7,5],ymm3[4,7]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm2[2,1],ymm11[2,0],ymm2[6,5],ymm11[6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm11[2,3,4,5,6,7]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm7[2,3,2,3]
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm9[1],xmm0[2,3]
+; AVX-NEXT: vmovaps 288(%rdi), %xmm1
+; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm13[0],ymm5[0],ymm13[2],ymm5[2]
-; AVX-NEXT: vextractf128 $1, %ymm11, %xmm11
-; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload
-; AVX-NEXT: # xmm11 = xmm11[0,1,2],mem[3]
-; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11
-; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm11[5,6,7]
-; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm9[1,0],ymm8[0,0],ymm9[5,4],ymm8[4,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm8[3,1],ymm4[0,2],ymm8[7,5],ymm4[4,6]
-; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload
-; AVX-NEXT: # xmm9 = xmm0[0,1,2],mem[3]
-; AVX-NEXT: vshufps {{.*#+}} xmm9 = xmm9[3,2,2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1],ymm4[2,3,4,5,6,7]
-; AVX-NEXT: vmovaps 192(%rdi), %ymm0
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm0[0,1],ymm14[1,3],ymm0[4,5],ymm14[5,7]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm0[0,2],ymm11[2,0],ymm0[4,6],ymm11[6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm11[5,6,7]
-; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm10[1,0],ymm7[0,0],ymm10[5,4],ymm7[4,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm7[3,1],ymm4[0,2],ymm7[7,5],ymm4[4,6]
-; AVX-NEXT: vmovaps %xmm15, %xmm10
-; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm8 # 16-byte Folded Reload
-; AVX-NEXT: # xmm8 = xmm15[0,1,2],mem[3]
-; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm8[3,2,2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm4[2,3,4,5,6,7]
-; AVX-NEXT: vmovaps 640(%rdi), %ymm0
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,0],xmm1[0,0]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm0[0,1],ymm4[1,3],ymm0[4,5],ymm4[5,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm12[0,2],ymm11[2,0],ymm12[4,6],ymm11[6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm11[5,6,7]
-; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm3[1,0],ymm2[0,0],ymm3[5,4],ymm2[4,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm2[3,1],ymm7[0,2],ymm2[7,5],ymm7[4,6]
-; AVX-NEXT: vblendps {{.*#+}} xmm8 = xmm1[0,1,2],xmm6[3]
-; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm8[3,2,2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5,6,7]
-; AVX-NEXT: vmovaps 416(%rdi), %ymm15
-; AVX-NEXT: vmovaps %ymm5, %ymm8
-; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm15[0,1],ymm5[1,3],ymm15[4,5],ymm5[5,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm13[0,2],ymm11[2,0],ymm13[4,6],ymm11[6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm11[5,6,7]
-; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm1[1,0],ymm0[0,0],ymm1[5,4],ymm0[4,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm0[3,1],ymm5[0,2],ymm0[7,5],ymm5[4,6]
-; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
-; AVX-NEXT: # xmm6 = mem[0,1,2],xmm0[3]
-; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm6[3,2,2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm5[2,3,4,5,6,7]
-; AVX-NEXT: vmovaps 864(%rdi), %ymm0
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm0[0,1],ymm9[1,3],ymm0[4,5],ymm9[5,7]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm0[0,2],ymm7[2,0],ymm0[4,6],ymm7[6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm7[5,6,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm4[2,1],ymm5[2,0],ymm4[6,5],ymm5[6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3,4,5,6,7]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm8[0],ymm12[0],ymm8[2],ymm12[2]
+; AVX-NEXT: vextractf128 $1, %ymm5, %xmm5
+; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm2 # 16-byte Folded Reload
+; AVX-NEXT: # xmm2 = xmm5[0,1,2],mem[3]
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm12[2,3,0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm12[3,0],ymm6[0,0],ymm12[7,4],ymm6[4,4]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm5[1,0],ymm4[2,0],ymm5[5,4],ymm4[6,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm6[2,0],ymm0[6,4],ymm6[6,4]
-; AVX-NEXT: vmovaps 544(%rdi), %xmm1
+; AVX-NEXT: vmovaps 64(%rdi), %xmm0
+; AVX-NEXT: vmovaps 96(%rdi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} xmm11 = xmm1[0,1,0,1]
-; AVX-NEXT: vmovaps 512(%rdi), %xmm7
-; AVX-NEXT: vblendps {{.*#+}} xmm11 = xmm7[0,1,2],xmm11[3]
-; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm10[2,3,2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = mem[0],xmm1[1],mem[2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm11[2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,1],ymm0[0,2],ymm1[7,5],ymm0[4,6]
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload
+; AVX-NEXT: # xmm2 = xmm1[0,1,2],mem[3]
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,2,2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
+; AVX-NEXT: vmovaps 192(%rdi), %ymm1
+; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,1],ymm13[1,3],ymm1[4,5],ymm13[5,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,2],ymm2[2,0],ymm11[4,6],ymm2[6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovaps 512(%rdi), %xmm0
+; AVX-NEXT: vmovaps 544(%rdi), %xmm11
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0],xmm11[0,0]
+; AVX-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[0,0],ymm1[7,4],ymm0[4,4]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm10[1,0],ymm14[2,0],ymm10[5,4],ymm14[6,4]
-; AVX-NEXT: vmovaps %ymm14, %ymm6
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4]
-; AVX-NEXT: vmovaps 64(%rdi), %xmm1
-; AVX-NEXT: vmovaps 96(%rdi), %xmm4
-; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm4[0,1,0,1]
-; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1,2],xmm3[3]
-; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
-; AVX-NEXT: # xmm14 = mem[2,3,2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm14 = mem[0],xmm14[1],mem[2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm14[0,1],xmm3[2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,1],ymm0[0,2],ymm1[7,5],ymm0[4,6]
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm2 # 16-byte Folded Reload
+; AVX-NEXT: # xmm2 = xmm10[0,1,2],mem[3]
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,2,2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7]
+; AVX-NEXT: vmovaps 640(%rdi), %ymm3
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,1],ymm2[1,3],ymm3[4,5],ymm2[5,7]
+; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm6[0,2],ymm5[2,0],ymm6[4,6],ymm5[6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3,0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm13[3,0],ymm0[0,0],ymm13[7,4],ymm0[4,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm15[1,0],ymm8[2,0],ymm15[5,4],ymm8[6,4]
-; AVX-NEXT: vmovaps %ymm15, %ymm11
-; AVX-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm3[2,0],ymm0[6,4],ymm3[6,4]
-; AVX-NEXT: vmovaps 320(%rdi), %xmm2
-; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} xmm14 = xmm2[0,1,0,1]
-; AVX-NEXT: vmovaps 288(%rdi), %xmm3
-; AVX-NEXT: vblendps {{.*#+}} xmm14 = xmm3[0,1,2],xmm14[3]
-; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
-; AVX-NEXT: # xmm15 = mem[2,3,2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm15 = mem[0],xmm15[1],mem[2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT: vmovaps 288(%rdi), %xmm0
+; AVX-NEXT: vmovaps 320(%rdi), %xmm1
+; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
+; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm4[3,1],ymm0[0,2],ymm4[7,5],ymm0[4,6]
+; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm9[0,1,2],xmm7[3]
+; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3,4,5,6,7]
+; AVX-NEXT: vmovaps 416(%rdi), %ymm1
+; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm1[0,1],ymm12[1,3],ymm1[4,5],ymm12[5,7]
+; AVX-NEXT: vmovaps %ymm12, %ymm7
+; AVX-NEXT: vmovaps %ymm1, %ymm12
+; AVX-NEXT: vmovaps %ymm8, %ymm4
+; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm8[0,2],ymm5[2,0],ymm8[4,6],ymm5[6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3,0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm2[3,0],ymm0[0,0],ymm2[7,4],ymm0[4,4]
+; AVX-NEXT: vmovaps 736(%rdi), %xmm0
+; AVX-NEXT: vmovaps 768(%rdi), %xmm1
+; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
+; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm15[3,1],ymm0[0,2],ymm15[7,5],ymm0[4,6]
+; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm15 # 16-byte Folded Reload
+; AVX-NEXT: # xmm15 = xmm14[0,1,2],mem[3]
+; AVX-NEXT: vshufps {{.*#+}} xmm15 = xmm15[3,2,2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1],ymm0[2,3,4,5,6,7]
+; AVX-NEXT: vmovaps 864(%rdi), %ymm13
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm13[0,1],ymm9[1,3],ymm13[4,5],ymm9[5,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm8[1,0],ymm9[2,0],ymm8[5,4],ymm9[6,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm0[2,0],ymm2[2,0],ymm0[6,4],ymm2[6,4]
-; AVX-NEXT: vmovaps 768(%rdi), %xmm0
-; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} xmm15 = xmm0[0,1,0,1]
-; AVX-NEXT: vmovaps 736(%rdi), %xmm2
-; AVX-NEXT: vblendps {{.*#+}} xmm15 = xmm2[0,1,2],xmm15[3]
-; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
-; AVX-NEXT: # xmm13 = mem[2,3,2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm13 = mem[0],xmm13[1],mem[2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm14[4,5,6,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm8[0,2],ymm14[2,0],ymm8[4,6],ymm14[6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm14[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm10[2,1],ymm6[3,3],ymm10[6,5],ymm6[7,7]
-; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload
-; AVX-NEXT: # xmm14 = xmm0[0],mem[1],xmm0[2,3]
-; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14
-; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm14[1,0],ymm13[2,0],ymm14[5,4],ymm13[6,4]
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
-; AVX-NEXT: # ymm14 = ymm0[0,0],mem[1,0],ymm0[4,4],mem[5,4]
-; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14
-; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm14[2,0],xmm1[3,2]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm13[4,5,6,7]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm6[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,0],ymm14[0,0],ymm6[7,4],ymm14[4,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm3[1,0],ymm2[2,0],ymm3[5,4],ymm2[6,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm14[2,0],ymm1[6,4],ymm14[6,4]
+; AVX-NEXT: vshufps {{.*#+}} xmm14 = xmm11[0,1,0,1]
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX-NEXT: vblendps {{.*#+}} xmm14 = xmm2[0,1,2],xmm14[3]
+; AVX-NEXT: vshufps {{.*#+}} xmm11 = xmm10[2,3,2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm11 = mem[0],xmm11[1],mem[2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm14[2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm1[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload
-; AVX-NEXT: # ymm13 = ymm5[2,1],mem[3,3],ymm5[6,5],mem[7,7]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm0[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm0[3,0],ymm11[0,0],ymm0[7,4],ymm11[4,4]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm1[1,0],ymm0[2,0],ymm1[5,4],ymm0[6,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0],ymm11[2,0],ymm6[6,4],ymm11[6,4]
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} xmm11 = xmm5[0,1,0,1]
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; AVX-NEXT: vblendps {{.*#+}} xmm11 = xmm15[0,1,2],xmm11[3]
+; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
+; AVX-NEXT: # xmm14 = mem[2,3,2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm14 = mem[0],xmm14[1],mem[2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm11 = xmm14[0,1],xmm11[2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7]
+; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm4[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,0],ymm11[0,0],ymm4[7,4],ymm11[4,4]
+; AVX-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm12[1,0],ymm7[2,0],ymm12[5,4],ymm7[6,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0],ymm11[2,0],ymm4[6,4],ymm11[6,4]
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} xmm11 = xmm7[0,1,0,1]
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; AVX-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0],xmm10[1],xmm15[2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm11 = xmm10[0,1,2],xmm11[3]
+; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
+; AVX-NEXT: # xmm14 = mem[2,3,2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm14 = mem[0],xmm14[1],mem[2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm11 = xmm14[0,1],xmm11[2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1,2,3],ymm4[4,5,6,7]
+; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm8[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm8[3,0],ymm11[0,0],ymm8[7,4],ymm11[4,4]
+; AVX-NEXT: vmovaps %ymm9, %ymm4
+; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm13[1,0],ymm9[2,0],ymm13[5,4],ymm9[6,4]
+; AVX-NEXT: vmovaps %ymm13, %ymm9
+; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0],ymm11[2,0],ymm3[6,4],ymm11[6,4]
+; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
+; AVX-NEXT: # xmm11 = mem[0,1,0,1]
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX-NEXT: vblendps {{.*#+}} xmm11 = xmm8[0,1,2],xmm11[3]
+; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
+; AVX-NEXT: # xmm14 = mem[2,3,2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm14 = mem[0],xmm14[1],mem[2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm11 = xmm14[0,1],xmm11[2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1,2,3],ymm3[4,5,6,7]
+; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm1[2,1],ymm0[3,3],ymm1[6,5],ymm0[7,7]
+; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
+; AVX-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm14 # 16-byte Folded Reload
+; AVX-NEXT: # xmm14 = mem[0],xmm1[1],mem[2,3]
; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14
-; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm14[1,0],ymm13[2,0],ymm14[5,4],ymm13[6,4]
-; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm6[0,1,2],xmm7[3]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm14[1,0],ymm11[2,0],ymm14[5,4],ymm11[6,4]
+; AVX-NEXT: vblendps {{.*#+}} xmm14 = xmm5[0,1,2],xmm15[3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm0[0,0],ymm4[1,0],ymm0[4,4],ymm4[5,4]
-; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14
-; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm14[2,0],xmm7[3,2]
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm13[4,5,6,7]
-; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm13 # 32-byte Folded Reload
-; AVX-NEXT: # ymm13 = ymm11[2,1],mem[3,3],ymm11[6,5],mem[7,7]
-; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm14 # 16-byte Folded Reload
-; AVX-NEXT: # xmm14 = mem[0],xmm7[1],mem[2,3]
-; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14
-; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm14[1,0],ymm13[2,0],ymm14[5,4],ymm13[6,4]
-; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
-; AVX-NEXT: # xmm3 = mem[0,1,2],xmm3[3]
+; AVX-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload
+; AVX-NEXT: # ymm15 = ymm0[0,0],mem[1,0],ymm0[4,4],mem[5,4]
+; AVX-NEXT: vextractf128 $1, %ymm15, %xmm15
+; AVX-NEXT: vshufps {{.*#+}} xmm14 = xmm15[2,0],xmm14[3,2]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm11[4,5,6,7]
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm14 # 32-byte Folded Reload
-; AVX-NEXT: # ymm14 = ymm11[0,0],mem[1,0],ymm11[4,4],mem[5,4]
-; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14
-; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm14[2,0],xmm3[3,2]
-; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm3[0,1,2,3],ymm13[4,5,6,7]
-; AVX-NEXT: vmovaps %ymm8, %ymm7
-; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm8[2,1],ymm9[3,3],ymm8[6,5],ymm9[7,7]
-; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX-NEXT: vblendps {{.*#+}} xmm12 = xmm9[0],xmm8[1],xmm9[2,3]
+; AVX-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm14 # 32-byte Folded Reload
+; AVX-NEXT: # ymm14 = ymm11[2,1],mem[3,3],ymm11[6,5],mem[7,7]
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm15 # 16-byte Folded Reload
+; AVX-NEXT: # xmm15 = mem[0],xmm13[1],mem[2,3]
+; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15
+; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm15[1,0],ymm14[2,0],ymm15[5,4],ymm14[6,4]
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX-NEXT: vblendps {{.*#+}} xmm15 = xmm6[0,1,2],xmm2[3]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm5[1,0],ymm2[4,4],ymm5[5,4]
+; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm15[3,2]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm14[4,5,6,7]
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload
+; AVX-NEXT: # ymm1 = ymm12[2,1],mem[3,3],ymm12[6,5],mem[7,7]
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload
+; AVX-NEXT: # xmm12 = mem[0],xmm0[1],mem[2,3]
; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12
-; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm12[1,0],ymm3[2,0],ymm12[5,4],ymm3[6,4]
-; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3]
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,0],ymm1[2,0],ymm12[5,4],ymm1[6,4]
+; AVX-NEXT: vblendps {{.*#+}} xmm12 = xmm7[0,1,2],xmm10[3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm1[0,0],ymm14[1,0],ymm1[4,4],ymm14[5,4]
-; AVX-NEXT: vextractf128 $1, %ymm12, %xmm12
-; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm12[2,0],xmm2[3,2]
-; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
-; AVX-NEXT: # xmm2 = mem[0,1,0,1]
-; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm6[3]
-; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm0[1,0],ymm4[2,0],ymm0[5,4],ymm4[6,4]
-; AVX-NEXT: vextractf128 $1, %ymm3, %xmm3
-; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm3[2,0],xmm2[2,3]
+; AVX-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm15 # 32-byte Folded Reload
+; AVX-NEXT: # ymm15 = ymm14[0,0],mem[1,0],ymm14[4,4],mem[5,4]
+; AVX-NEXT: vextractf128 $1, %ymm15, %xmm15
+; AVX-NEXT: vshufps {{.*#+}} xmm12 = xmm15[2,0],xmm12[3,2]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm1[4,5,6,7]
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm9[2,1],ymm4[3,3],ymm9[6,5],ymm4[7,7]
+; AVX-NEXT: vmovaps %ymm9, %ymm12
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX-NEXT: vblendps {{.*#+}} xmm10 = xmm9[0],xmm7[1],xmm9[2,3]
+; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm10[1,0],ymm1[2,0],ymm10[5,4],ymm1[6,4]
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX-NEXT: vblendps {{.*#+}} xmm10 = xmm4[0,1,2],xmm8[3]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm0[3,0],ymm3[0,0],ymm0[7,4],ymm3[4,4]
-; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm10[2,3,2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm15[1],xmm4[2,3]
-; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
-; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,0],ymm4[4,5],ymm3[6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
-; AVX-NEXT: # xmm2 = mem[0,1,0,1]
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1,2],xmm5[3]
-; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm1[1,0],ymm14[2,0],ymm1[5,4],ymm14[6,4]
-; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm2[2,0],xmm0[2,3]
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm7[2,3,0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm7[3,0],ymm2[0,0],ymm7[7,4],ymm2[4,4]
-; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm8[2,3,2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm9[1],xmm4[2,3]
-; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
-; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,0],ymm4[4,5],ymm2[6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
-; AVX-NEXT: # xmm2 = mem[0,1,0,1]
-; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
-; AVX-NEXT: # xmm2 = xmm2[0,1,2],mem[3]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload
-; AVX-NEXT: # ymm4 = ymm1[1,0],mem[2,0],ymm1[5,4],mem[6,4]
-; AVX-NEXT: vextractf128 $1, %ymm4, %xmm4
-; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm4[2,0],xmm2[2,3]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3,0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm1[3,0],ymm4[0,0],ymm1[7,4],ymm4[4,4]
-; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
-; AVX-NEXT: # xmm5 = mem[2,3,2,3]
-; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
-; AVX-NEXT: # xmm5 = xmm5[0],mem[1],xmm5[2,3]
+; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm0[0,0],ymm3[1,0],ymm0[4,4],ymm3[5,4]
+; AVX-NEXT: vextractf128 $1, %ymm15, %xmm15
+; AVX-NEXT: vshufps {{.*#+}} xmm10 = xmm15[2,0],xmm10[3,2]
+; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm1[4,5,6,7]
+; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; AVX-NEXT: # xmm1 = mem[0,1,0,1]
+; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm6[3]
+; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm2[1,0],ymm5[2,0],ymm2[5,4],ymm5[6,4]
+; AVX-NEXT: vextractf128 $1, %ymm8, %xmm8
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm8[2,0],xmm1[2,3]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm11[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm11[3,0],ymm8[0,0],ymm11[7,4],ymm8[4,4]
+; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm13[2,3,2,3]
+; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
+; AVX-NEXT: # xmm8 = xmm8[0],mem[1],xmm8[2,3]
+; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8
+; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,1],ymm2[2,0],ymm8[4,5],ymm2[6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; AVX-NEXT: # xmm1 = mem[0,1,0,1]
+; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
+; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm0[1,0],ymm3[2,0],ymm0[5,4],ymm3[6,4]
+; AVX-NEXT: vextractf128 $1, %ymm5, %xmm5
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm5[2,0],xmm1[2,3]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm12[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm12[3,0],ymm5[0,0],ymm12[7,4],ymm5[4,4]
+; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm7[2,3,2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm9[1],xmm5[2,3]
; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
-; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,0],ymm5[4,5],ymm4[6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
-; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
-; AVX-NEXT: # xmm4 = mem[0,1,0,1]
-; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
-; AVX-NEXT: # xmm4 = xmm4[0,1,2],mem[3]
-; AVX-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload
-; AVX-NEXT: # ymm5 = ymm11[1,0],mem[2,0],ymm11[5,4],mem[6,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,0],ymm5[4,5],ymm0[6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; AVX-NEXT: # xmm1 = mem[0,1,0,1]
+; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
+; AVX-NEXT: # xmm1 = xmm1[0,1,2],mem[3]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload
+; AVX-NEXT: # ymm5 = ymm3[1,0],mem[2,0],ymm3[5,4],mem[6,4]
; AVX-NEXT: vextractf128 $1, %ymm5, %xmm5
-; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm5[2,0],xmm4[2,3]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[2,3,0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm1[3,0],ymm5[0,0],ymm1[7,4],ymm5[4,4]
-; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
-; AVX-NEXT: # xmm6 = mem[2,3,2,3]
-; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
-; AVX-NEXT: # xmm6 = xmm6[0],mem[1],xmm6[2,3]
-; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
-; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,0],ymm6[4,5],ymm5[6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm1, 96(%rsi)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm1, 32(%rsi)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm5, 64(%rsi)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm5, (%rsi)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm1, 96(%rdx)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm1, 32(%rdx)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm1, 64(%rdx)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm1, (%rdx)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm1, 32(%rcx)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm1, 96(%rcx)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm1, 64(%rcx)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm1, (%rcx)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm1, 96(%r8)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm1, 32(%r8)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm1, 64(%r8)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm1, (%r8)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm1, 96(%r9)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm1, 32(%r9)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm1, (%r9)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm1, 64(%r9)
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm5[2,0],xmm1[2,3]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm3[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm3[3,0],ymm5[0,0],ymm3[7,4],ymm5[4,4]
+; AVX-NEXT: vpermilps $238, (%rsp), %xmm8 # 16-byte Folded Reload
+; AVX-NEXT: # xmm8 = mem[2,3,2,3]
+; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
+; AVX-NEXT: # xmm8 = xmm8[0],mem[1],xmm8[2,3]
+; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8
+; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,0],ymm8[4,5],ymm5[6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5,6,7]
+; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; AVX-NEXT: # xmm1 = mem[0,1,0,1]
+; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
+; AVX-NEXT: # xmm1 = xmm1[0,1,2],mem[3]
+; AVX-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm8 # 32-byte Folded Reload
+; AVX-NEXT: # ymm8 = ymm14[1,0],mem[2,0],ymm14[5,4],mem[6,4]
+; AVX-NEXT: vextractf128 $1, %ymm8, %xmm8
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm8[2,0],xmm1[2,3]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm3[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm3[3,0],ymm8[0,0],ymm3[7,4],ymm8[4,4]
+; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
+; AVX-NEXT: # xmm8 = mem[2,3,2,3]
+; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
+; AVX-NEXT: # xmm8 = xmm8[0],mem[1],xmm8[2,3]
+; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8
+; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,0],ymm8[4,5],ymm7[6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm3, 96(%rsi)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm3, 32(%rsi)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm3, 64(%rsi)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm7, (%rsi)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm3, 96(%rdx)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm3, 32(%rdx)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm3, 64(%rdx)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm3, (%rdx)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm3, 32(%rcx)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm3, 96(%rcx)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm3, 64(%rcx)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm3, (%rcx)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm3, 96(%r8)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm3, 32(%r8)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm3, 64(%r8)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm3, (%r8)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm3, 96(%r9)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm3, 32(%r9)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm3, (%r9)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm3, 64(%r9)
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: vmovaps %ymm12, 96(%rax)
-; AVX-NEXT: vmovaps %ymm13, 32(%rax)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm1, 64(%rax)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm1, (%rax)
+; AVX-NEXT: vmovaps %ymm10, 96(%rax)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm3, 32(%rax)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm3, 64(%rax)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm3, (%rax)
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: vmovaps %ymm4, 32(%rax)
-; AVX-NEXT: vmovaps %ymm2, (%rax)
+; AVX-NEXT: vmovaps %ymm1, 32(%rax)
+; AVX-NEXT: vmovaps %ymm5, (%rax)
; AVX-NEXT: vmovaps %ymm0, 96(%rax)
-; AVX-NEXT: vmovaps %ymm3, 64(%rax)
-; AVX-NEXT: addq $1432, %rsp # imm = 0x598
+; AVX-NEXT: vmovaps %ymm2, 64(%rax)
+; AVX-NEXT: addq $1512, %rsp # imm = 0x5E8
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
@@ -9927,25 +9915,27 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
;
; AVX-LABEL: load_i32_stride7_vf64:
; AVX: # %bb.0:
-; AVX-NEXT: subq $3176, %rsp # imm = 0xC68
+; AVX-NEXT: subq $3224, %rsp # imm = 0xC98
; AVX-NEXT: vmovaps 704(%rdi), %ymm2
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 672(%rdi), %ymm3
; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 768(%rdi), %ymm11
+; AVX-NEXT: vmovaps 768(%rdi), %ymm5
+; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 256(%rdi), %ymm4
; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 224(%rdi), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 320(%rdi), %ymm6
+; AVX-NEXT: vmovaps 320(%rdi), %ymm8
; AVX-NEXT: vmovaps 304(%rdi), %xmm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[2],ymm0[2]
-; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm8[0],ymm0[0],ymm8[2],ymm0[2]
+; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6],ymm1[7]
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX-NEXT: vmovaps 224(%rdi), %xmm10
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3]
+; AVX-NEXT: vmovaps 224(%rdi), %xmm4
+; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
; AVX-NEXT: vmovaps 384(%rdi), %xmm1
@@ -9961,12 +9951,12 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 752(%rdi), %xmm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],ymm0[0],ymm11[2],ymm0[2]
-; AVX-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],ymm0[0],ymm5[2],ymm0[2]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7]
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX-NEXT: vmovaps 672(%rdi), %xmm15
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3]
+; AVX-NEXT: vmovaps 672(%rdi), %xmm7
+; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3]
+; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
; AVX-NEXT: vmovaps 832(%rdi), %xmm1
@@ -9974,8 +9964,9 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovaps 800(%rdi), %xmm2
; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1]
-; AVX-NEXT: vmovaps 864(%rdi), %xmm13
-; AVX-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm13[1]
+; AVX-NEXT: vmovaps 864(%rdi), %xmm2
+; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm2[1]
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -10016,11 +10007,11 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3]
-; AVX-NEXT: vmovaps 1664(%rdi), %ymm2
-; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovaps 1664(%rdi), %ymm15
; AVX-NEXT: vmovaps 1648(%rdi), %xmm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
+; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm15[0],ymm1[0],ymm15[2],ymm1[2]
+; AVX-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX-NEXT: vmovaps 1728(%rdi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -10039,24 +10030,23 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7]
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT: vmovaps (%rdi), %xmm1
-; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX-NEXT: vmovaps (%rdi), %xmm14
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm14[0,1],xmm0[2,3]
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3]
-; AVX-NEXT: vmovaps 96(%rdi), %ymm14
+; AVX-NEXT: vmovaps 96(%rdi), %ymm5
; AVX-NEXT: vmovaps 80(%rdi), %xmm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm14[0],ymm1[0],ymm14[2],ymm1[2]
-; AVX-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],ymm1[0],ymm5[2],ymm1[2]
+; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX-NEXT: vmovaps 160(%rdi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps 128(%rdi), %xmm2
; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1]
-; AVX-NEXT: vmovaps 192(%rdi), %xmm2
-; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm2[1]
+; AVX-NEXT: vmovaps 192(%rdi), %xmm13
+; AVX-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm13[1]
+; AVX-NEXT: vmovaps %xmm13, (%rsp) # 16-byte Spill
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -10066,9 +10056,9 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7]
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT: vmovaps 448(%rdi), %xmm1
-; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX-NEXT: vmovaps 448(%rdi), %xmm6
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm6[0,1],xmm0[2,3]
+; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3]
; AVX-NEXT: vmovaps 544(%rdi), %ymm2
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -10081,9 +10071,9 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovaps 576(%rdi), %xmm2
; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1]
-; AVX-NEXT: vmovaps 640(%rdi), %xmm2
-; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm2[1]
+; AVX-NEXT: vmovaps 640(%rdi), %xmm10
+; AVX-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm10[1]
+; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -10093,732 +10083,663 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7]
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT: vmovaps 896(%rdi), %xmm12
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm12[0,1],xmm0[2,3]
+; AVX-NEXT: vmovaps 896(%rdi), %xmm1
+; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3]
-; AVX-NEXT: vmovaps 992(%rdi), %ymm5
+; AVX-NEXT: vmovaps 992(%rdi), %ymm2
+; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 976(%rdi), %xmm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],ymm1[0],ymm5[2],ymm1[2]
-; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX-NEXT: vmovaps 1056(%rdi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps 1024(%rdi), %xmm2
; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1]
-; AVX-NEXT: vmovaps 1088(%rdi), %xmm8
-; AVX-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm8[1]
-; AVX-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
+; AVX-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm2[1],xmm1[1]
+; AVX-NEXT: vmovaps 1088(%rdi), %xmm12
+; AVX-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm3[1,2],xmm12[1]
+; AVX-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 1376(%rdi), %ymm1
-; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovaps 1376(%rdi), %ymm2
+; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 1344(%rdi), %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6],ymm0[7]
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT: vmovaps 1344(%rdi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3]
-; AVX-NEXT: vmovaps 1440(%rdi), %ymm4
-; AVX-NEXT: vmovaps 1424(%rdi), %xmm1
-; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm4[0],ymm1[0],ymm4[2],ymm1[2]
-; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
-; AVX-NEXT: vmovaps 1504(%rdi), %xmm1
-; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovaps 1440(%rdi), %ymm11
+; AVX-NEXT: vmovaps 1424(%rdi), %xmm2
+; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm11[0],ymm2[0],ymm11[2],ymm2[2]
+; AVX-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3,4,5,6,7]
+; AVX-NEXT: vmovaps 1504(%rdi), %xmm0
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps 1472(%rdi), %xmm2
; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1]
-; AVX-NEXT: vmovaps 1536(%rdi), %xmm2
-; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm2[1]
-; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 288(%rdi), %ymm0
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm6[1,1],ymm0[2,2],ymm6[5,5],ymm0[6,6]
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX-NEXT: vmovaps 256(%rdi), %xmm1
-; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm10[1],xmm1[2,3]
-; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
+; AVX-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm2[1],xmm0[1]
+; AVX-NEXT: vmovaps 1536(%rdi), %xmm0
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vinsertps {{.*#+}} xmm4 = zero,xmm4[1,2],xmm0[1]
+; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm4[5,6,7]
+; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm8[1,1],mem[2,2],ymm8[5,5],mem[6,6]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,2,3]
+; AVX-NEXT: vmovaps 256(%rdi), %xmm8
+; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm4 # 16-byte Folded Reload
+; AVX-NEXT: # xmm4 = xmm8[0],mem[1],xmm8[2,3]
+; AVX-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,0],mem[3,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7]
; AVX-NEXT: vmovaps 384(%rdi), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 352(%rdi), %ymm2
-; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[3,3],ymm2[4,4],ymm1[7,7]
-; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX-NEXT: vinsertps $49, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; AVX-NEXT: # xmm1 = zero,xmm1[1,2],mem[0]
-; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 736(%rdi), %ymm0
+; AVX-NEXT: vmovaps 352(%rdi), %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm11[1,1],ymm0[2,2],ymm11[5,5],ymm0[6,6]
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX-NEXT: vmovaps 704(%rdi), %xmm1
-; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm15[1],xmm1[2,3]
-; AVX-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],ymm1[0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm0[0,0],ymm4[3,3],ymm0[4,4],ymm4[7,7]
+; AVX-NEXT: vextractf128 $1, %ymm4, %xmm4
+; AVX-NEXT: vinsertps $49, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX-NEXT: # xmm4 = zero,xmm4[1,2],mem[0]
+; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm4[5,6,7]
+; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm0[1,1],mem[2,2],ymm0[5,5],mem[6,6]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,2,3]
+; AVX-NEXT: vmovaps 704(%rdi), %xmm0
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0],xmm7[1],xmm0[2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,0],mem[3,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7]
; AVX-NEXT: vmovaps 832(%rdi), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 800(%rdi), %ymm2
-; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[3,3],ymm2[4,4],ymm1[7,7]
-; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm13[2]
-; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
+; AVX-NEXT: vmovaps 800(%rdi), %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 1184(%rdi), %ymm0
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm9[1,1],ymm0[2,2],ymm9[5,5],ymm0[6,6]
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX-NEXT: vmovaps 1152(%rdi), %xmm1
-; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm11[1],xmm1[2,3]
-; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
-; AVX-NEXT: vmovaps 1280(%rdi), %ymm1
-; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 1248(%rdi), %ymm2
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],ymm1[0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm0[0,0],ymm4[3,3],ymm0[4,4],ymm4[7,7]
+; AVX-NEXT: vextractf128 $1, %ymm4, %xmm4
+; AVX-NEXT: vinsertps $49, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX-NEXT: # xmm4 = zero,xmm4[1,2],mem[0]
+; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm4[5,6,7]
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[3,3],ymm2[4,4],ymm1[7,7]
-; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm9[1,1],mem[2,2],ymm9[5,5],mem[6,6]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,2,3]
+; AVX-NEXT: vmovaps 1152(%rdi), %xmm0
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1,2],xmm9[2]
-; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 1632(%rdi), %ymm0
+; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0],xmm9[1],xmm0[2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,0],mem[3,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7]
+; AVX-NEXT: vmovaps 1280(%rdi), %ymm1
+; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovaps 1248(%rdi), %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1],ymm0[2,2],ymm1[5,5],ymm0[6,6]
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX-NEXT: vmovaps 1600(%rdi), %xmm1
-; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],ymm1[0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm0[0,0],ymm4[3,3],ymm0[4,4],ymm4[7,7]
+; AVX-NEXT: vextractf128 $1, %ymm4, %xmm4
+; AVX-NEXT: vinsertps $49, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX-NEXT: # xmm4 = zero,xmm4[1,2],mem[0]
+; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm4[5,6,7]
+; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm15[1,1],mem[2,2],ymm15[5,5],mem[6,6]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,2,3]
+; AVX-NEXT: vmovaps 1600(%rdi), %xmm0
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm7[1],xmm1[2,3]
-; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0],xmm7[1],xmm0[2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,0],mem[3,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7]
; AVX-NEXT: vmovaps 1728(%rdi), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 1696(%rdi), %ymm2
-; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[3,3],ymm2[4,4],ymm1[7,7]
-; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX-NEXT: vinsertps $49, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; AVX-NEXT: # xmm1 = zero,xmm1[1,2],mem[0]
-; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 64(%rdi), %ymm0
+; AVX-NEXT: vmovaps 1696(%rdi), %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,1],ymm0[2,2],ymm14[5,5],ymm0[6,6]
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX-NEXT: vmovaps 32(%rdi), %xmm1
-; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; AVX-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3]
-; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,0],mem[3,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3,4,5,6,7]
-; AVX-NEXT: vmovaps 160(%rdi), %ymm2
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],ymm1[0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm0[0,0],ymm4[3,3],ymm0[4,4],ymm4[7,7]
+; AVX-NEXT: vextractf128 $1, %ymm4, %xmm4
+; AVX-NEXT: vinsertps $49, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX-NEXT: # xmm4 = zero,xmm4[1,2],mem[0]
+; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm4[5,6,7]
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 128(%rdi), %ymm0
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm2[0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,0],ymm2[3,3],ymm0[4,4],ymm2[7,7]
-; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2
-; AVX-NEXT: vinsertps $49, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
-; AVX-NEXT: # xmm2 = zero,xmm2[1,2],mem[0]
-; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm5[1,1],mem[2,2],ymm5[5,5],mem[6,6]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,2,3]
+; AVX-NEXT: vmovaps 32(%rdi), %xmm0
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0],xmm14[1],xmm0[2,3]
+; AVX-NEXT: vmovaps %xmm14, %xmm15
+; AVX-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,0],mem[3,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm3[3,4,5,6,7]
+; AVX-NEXT: vmovaps 160(%rdi), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 512(%rdi), %ymm0
+; AVX-NEXT: vmovaps 128(%rdi), %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm0[2,2],ymm1[5,5],ymm0[6,6]
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm0[0,0],ymm5[3,3],ymm0[4,4],ymm5[7,7]
+; AVX-NEXT: vextractf128 $1, %ymm5, %xmm5
+; AVX-NEXT: vinsertps {{.*#+}} xmm5 = zero,xmm5[1,2],xmm13[2]
+; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm5[5,6,7]
+; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm13[1,1],mem[2,2],ymm13[5,5],mem[6,6]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3]
; AVX-NEXT: vmovaps 480(%rdi), %xmm0
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0],xmm14[1],xmm0[2,3]
-; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0],mem[3,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm1[3,4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm0[0],xmm6[1],xmm0[2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,0],mem[3,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm4[3,4,5,6,7]
; AVX-NEXT: vmovaps 608(%rdi), %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 576(%rdi), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3],ymm0[0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm1[0,0],ymm3[3,3],ymm1[4,4],ymm3[7,7]
-; AVX-NEXT: vextractf128 $1, %ymm3, %xmm3
-; AVX-NEXT: vinsertps $49, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
-; AVX-NEXT: # xmm3 = zero,xmm3[1,2],mem[0]
-; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
-; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm1[2,3],ymm0[0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm1[0,0],ymm6[3,3],ymm1[4,4],ymm6[7,7]
+; AVX-NEXT: vextractf128 $1, %ymm6, %xmm6
+; AVX-NEXT: vinsertps {{.*#+}} xmm6 = zero,xmm6[1,2],xmm10[2]
+; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm6[5,6,7]
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 960(%rdi), %ymm0
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm5[1,1],ymm0[2,2],ymm5[5,5],ymm0[6,6]
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm10[1,1],mem[2,2],ymm10[5,5],mem[6,6]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,2,3]
; AVX-NEXT: vmovaps 928(%rdi), %xmm0
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0],xmm12[1],xmm0[2,3]
-; AVX-NEXT: vmovaps %xmm12, %xmm6
-; AVX-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,0],mem[3,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7]
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX-NEXT: vblendps {{.*#+}} xmm6 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm6[1,0],mem[3,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm5[3,4,5,6,7]
; AVX-NEXT: vmovaps 1056(%rdi), %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 1024(%rdi), %ymm1
-; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[2,3],ymm0[0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm1[0,0],ymm5[3,3],ymm1[4,4],ymm5[7,7]
-; AVX-NEXT: vextractf128 $1, %ymm5, %xmm5
-; AVX-NEXT: vinsertps {{.*#+}} xmm5 = zero,xmm5[1,2],xmm8[2]
-; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
-; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7]
+; AVX-NEXT: vmovaps 1024(%rdi), %ymm2
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 1408(%rdi), %ymm0
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm4[1,1],ymm0[2,2],ymm4[5,5],ymm0[6,6]
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,2,3]
-; AVX-NEXT: vmovaps 1376(%rdi), %xmm4
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm2[2,3],ymm0[0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm2[0,0],ymm14[3,3],ymm2[4,4],ymm14[7,7]
+; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14
+; AVX-NEXT: vinsertps {{.*#+}} xmm14 = zero,xmm14[1,2],xmm12[2]
+; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3,4],ymm14[5,6,7]
+; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm11[1,1],mem[2,2],ymm11[5,5],mem[6,6]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[2,3,2,3]
+; AVX-NEXT: vmovaps 1376(%rdi), %xmm5
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX-NEXT: vblendps {{.*#+}} xmm12 = xmm4[0],xmm3[1],xmm4[2,3]
-; AVX-NEXT: vshufps {{.*#+}} xmm12 = xmm12[1,0],mem[3,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2],ymm5[3,4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} xmm14 = xmm5[0],xmm3[1],xmm5[2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm14 = xmm14[1,0],mem[3,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2],ymm6[3,4,5,6,7]
; AVX-NEXT: vmovaps 1504(%rdi), %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 1472(%rdi), %ymm1
-; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm1[2,3],ymm0[0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm1[0,0],ymm8[3,3],ymm1[4,4],ymm8[7,7]
-; AVX-NEXT: vextractf128 $1, %ymm8, %xmm8
-; AVX-NEXT: vinsertps $49, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
-; AVX-NEXT: # xmm8 = zero,xmm8[1,2],mem[0]
-; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8
-; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1,2,3,4],ymm8[5,6,7]
-; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm10[2,3,2,3]
-; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
-; AVX-NEXT: # xmm8 = xmm8[0],mem[1],xmm8[2,3]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
-; AVX-NEXT: # ymm12 = ymm12[3,1],mem[0,3],ymm12[7,5],mem[4,7]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm0[2,1],ymm12[2,0],ymm0[6,5],ymm12[6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3,4,5,6,7]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm10[0],ymm0[2],ymm10[2]
-; AVX-NEXT: vextractf128 $1, %ymm12, %xmm12
-; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload
-; AVX-NEXT: # xmm12 = xmm12[0,1,2],mem[3]
-; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12
-; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm12[5,6,7]
-; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm15[2,3,2,3]
-; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
-; AVX-NEXT: # xmm8 = xmm8[0],mem[1],xmm8[2,3]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
-; AVX-NEXT: # ymm12 = ymm12[3,1],mem[0,3],ymm12[7,5],mem[4,7]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm0[2,1],ymm12[2,0],ymm0[6,5],ymm12[6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3,4,5,6,7]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
-; AVX-NEXT: vextractf128 $1, %ymm12, %xmm12
-; AVX-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1,2],xmm13[3]
-; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12
-; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm12[5,6,7]
-; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm11[2,3,2,3]
-; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
-; AVX-NEXT: # xmm8 = xmm8[0],mem[1],xmm8[2,3]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
-; AVX-NEXT: # ymm12 = ymm12[3,1],mem[0,3],ymm12[7,5],mem[4,7]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm0[2,1],ymm12[2,0],ymm0[6,5],ymm12[6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3,4,5,6,7]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; AVX-NEXT: vextractf128 $1, %ymm12, %xmm12
-; AVX-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1,2],xmm9[3]
-; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12
-; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm12[5,6,7]
-; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm7[2,3,2,3]
-; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm13[1],xmm8[2,3]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
-; AVX-NEXT: # ymm12 = ymm12[3,1],mem[0,3],ymm12[7,5],mem[4,7]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm0[2,1],ymm12[2,0],ymm0[6,5],ymm12[6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3,4,5,6,7]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm15[0],ymm11[0],ymm15[2],ymm11[2]
-; AVX-NEXT: vextractf128 $1, %ymm12, %xmm12
-; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload
-; AVX-NEXT: # xmm12 = xmm12[0,1,2],mem[3]
-; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12
-; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm12[5,6,7]
-; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
-; AVX-NEXT: # xmm8 = mem[2,3,2,3]
-; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
-; AVX-NEXT: # xmm8 = xmm8[0],mem[1],xmm8[2,3]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
-; AVX-NEXT: # ymm12 = ymm12[3,1],mem[0,3],ymm12[7,5],mem[4,7]
+; AVX-NEXT: vmovaps 1472(%rdi), %ymm4
+; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm4[2,3],ymm0[0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm4[0,0],ymm2[3,3],ymm4[4,4],ymm2[7,7]
+; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX-NEXT: vinsertps $49, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
+; AVX-NEXT: # xmm2 = zero,xmm2[1,2],mem[0]
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1,2,3,4],ymm2[5,6,7]
+; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; AVX-NEXT: # xmm2 = mem[2,3,2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3]
+; AVX-NEXT: vmovaps 288(%rdi), %xmm0
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} xmm14 = xmm14[3,0],xmm0[0,0]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm0[2,1],ymm12[2,0],ymm0[6,5],ymm12[6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3,4,5,6,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm0[2,1],ymm14[2,0],ymm0[6,5],ymm14[6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm14[2,3,4,5,6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload
-; AVX-NEXT: # ymm12 = ymm0[0],mem[0],ymm0[2],mem[2]
-; AVX-NEXT: vextractf128 $1, %ymm12, %xmm12
-; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload
-; AVX-NEXT: # xmm12 = xmm12[0,1,2],mem[3]
-; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12
-; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm12[5,6,7]
-; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm14[2,3,2,3]
-; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
-; AVX-NEXT: # xmm8 = xmm8[0],mem[1],xmm8[2,3]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
-; AVX-NEXT: # ymm12 = ymm12[3,1],mem[0,3],ymm12[7,5],mem[4,7]
+; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
+; AVX-NEXT: # ymm14 = ymm0[0],mem[0],ymm0[2],mem[2]
+; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14
+; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
+; AVX-NEXT: # xmm14 = xmm14[0,1,2],mem[3]
+; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7]
+; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; AVX-NEXT: # xmm2 = mem[2,3,2,3]
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3]
+; AVX-NEXT: vmovaps 736(%rdi), %xmm0
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} xmm14 = xmm14[3,0],xmm0[0,0]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm0[2,1],ymm12[2,0],ymm0[6,5],ymm12[6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3,4,5,6,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm0[2,1],ymm14[2,0],ymm0[6,5],ymm14[6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm14[2,3,4,5,6,7]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload
-; AVX-NEXT: # ymm12 = ymm0[0],mem[0],ymm0[2],mem[2]
-; AVX-NEXT: vextractf128 $1, %ymm12, %xmm12
-; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload
-; AVX-NEXT: # xmm12 = xmm12[0,1,2],mem[3]
-; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12
-; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm12[5,6,7]
-; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm6[2,3,2,3]
-; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
-; AVX-NEXT: # xmm8 = xmm8[0],mem[1],xmm8[2,3]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
-; AVX-NEXT: # ymm12 = ymm12[3,1],mem[0,3],ymm12[7,5],mem[4,7]
+; AVX-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm0[0],ymm6[0],ymm0[2],ymm6[2]
+; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14
+; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
+; AVX-NEXT: # xmm14 = xmm14[0,1,2],mem[3]
+; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7]
+; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm9[2,3,2,3]
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm12[1],xmm2[2,3]
+; AVX-NEXT: vmovaps 1184(%rdi), %xmm0
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm14[2,1],ymm12[2,0],ymm14[6,5],ymm12[6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3,4,5,6,7]
+; AVX-NEXT: vshufps {{.*#+}} xmm14 = xmm14[3,0],xmm0[0,0]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload
-; AVX-NEXT: # ymm12 = ymm0[0],mem[0],ymm0[2],mem[2]
-; AVX-NEXT: vextractf128 $1, %ymm12, %xmm12
-; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload
-; AVX-NEXT: # xmm12 = xmm12[0,1,2],mem[3]
-; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12
-; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm12[5,6,7]
-; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps %xmm3, %xmm9
-; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm3[2,3,2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm4[1],xmm8[2,3]
-; AVX-NEXT: vmovaps %xmm4, %xmm6
-; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm12[3,1],ymm5[0,3],ymm12[7,5],ymm5[4,7]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm4[2,1],ymm12[2,0],ymm4[6,5],ymm12[6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3,4,5,6,7]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm7[0],ymm3[0],ymm7[2],ymm3[2]
-; AVX-NEXT: vextractf128 $1, %ymm12, %xmm12
-; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload
-; AVX-NEXT: # xmm12 = xmm12[0,1,2],mem[3]
-; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12
-; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm12[5,6,7]
-; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm0[2,1],ymm14[2,0],ymm0[6,5],ymm14[6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm14[2,3,4,5,6,7]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm14 # 32-byte Folded Reload
+; AVX-NEXT: # ymm14 = ymm9[0],mem[0],ymm9[2],mem[2]
+; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14
+; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
+; AVX-NEXT: # xmm14 = xmm14[0,1,2],mem[3]
+; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7]
+; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm7[2,3,2,3]
+; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
+; AVX-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3]
+; AVX-NEXT: vmovaps 1632(%rdi), %xmm0
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} xmm14 = xmm14[3,0],xmm0[0,0]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm0[1,0],ymm12[0,0],ymm0[5,4],ymm12[4,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm12[3,1],ymm8[0,2],ymm12[7,5],ymm8[4,6]
-; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload
-; AVX-NEXT: # xmm12 = mem[0,1,2],xmm0[3]
-; AVX-NEXT: vshufps {{.*#+}} xmm12 = xmm12[3,2,2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1],ymm8[2,3,4,5,6,7]
-; AVX-NEXT: vmovaps 416(%rdi), %ymm0
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm0[0,1],ymm10[1,3],ymm0[4,5],ymm10[5,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm0[2,1],ymm14[2,0],ymm0[6,5],ymm14[6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm14[2,3,4,5,6,7]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm0[0,2],ymm12[2,0],ymm0[4,6],ymm12[6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm12[5,6,7]
-; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm0[0],ymm11[0],ymm0[2],ymm11[2]
+; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14
+; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm7 # 16-byte Folded Reload
+; AVX-NEXT: # xmm7 = xmm14[0,1,2],mem[3]
+; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7]
+; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm15[2,3,2,3]
+; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
+; AVX-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3]
+; AVX-NEXT: vmovaps 64(%rdi), %xmm0
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm7[3,0],xmm0[0,0]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm0[1,0],ymm12[0,0],ymm0[5,4],ymm12[4,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm12[3,1],ymm8[0,2],ymm12[7,5],ymm8[4,6]
-; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload
-; AVX-NEXT: # xmm12 = mem[0,1,2],xmm0[3]
-; AVX-NEXT: vshufps {{.*#+}} xmm12 = xmm12[3,2,2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1],ymm8[2,3,4,5,6,7]
-; AVX-NEXT: vmovaps 864(%rdi), %ymm0
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm0[0,1],ymm2[1,3],ymm0[4,5],ymm2[5,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm0[2,1],ymm7[2,0],ymm0[6,5],ymm7[6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm7[2,3,4,5,6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm0[0,2],ymm12[2,0],ymm0[4,6],ymm12[6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1,2,3,4],ymm12[5,6,7]
+; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
+; AVX-NEXT: # ymm7 = ymm0[0],mem[0],ymm0[2],mem[2]
+; AVX-NEXT: vextractf128 $1, %ymm7, %xmm7
+; AVX-NEXT: vblendps $8, (%rsp), %xmm7, %xmm7 # 16-byte Folded Reload
+; AVX-NEXT: # xmm7 = xmm7[0,1,2],mem[3]
+; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7]
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm0[1,0],ymm2[0,0],ymm0[5,4],ymm2[4,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm2[3,1],ymm8[0,2],ymm2[7,5],ymm8[4,6]
-; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload
-; AVX-NEXT: # xmm12 = mem[0,1,2],xmm0[3]
-; AVX-NEXT: vshufps {{.*#+}} xmm12 = xmm12[3,2,2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1],ymm8[2,3,4,5,6,7]
-; AVX-NEXT: vmovaps 1312(%rdi), %ymm0
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm0[0,1],ymm1[1,3],ymm0[4,5],ymm1[5,7]
+; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; AVX-NEXT: # xmm2 = mem[2,3,2,3]
+; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
+; AVX-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3]
+; AVX-NEXT: vmovaps 512(%rdi), %xmm0
+; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm7[3,0],xmm0[0,0]
+; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm13[2,1],ymm7[2,0],ymm13[6,5],ymm7[6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm7[2,3,4,5,6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm0[0,2],ymm12[2,0],ymm0[4,6],ymm12[6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3,4],ymm12[5,6,7]
+; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
+; AVX-NEXT: # ymm7 = ymm0[0],mem[0],ymm0[2],mem[2]
+; AVX-NEXT: vextractf128 $1, %ymm7, %xmm7
+; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload
+; AVX-NEXT: # xmm7 = xmm7[0,1,2],mem[3]
+; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7]
+; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm14[1],xmm2[2,3]
+; AVX-NEXT: vmovaps 960(%rdi), %xmm0
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm7[3,0],xmm0[0,0]
+; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm10[2,1],ymm7[2,0],ymm10[6,5],ymm7[6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm7[2,3,4,5,6,7]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm4[0],ymm15[0],ymm4[2],ymm15[2]
+; AVX-NEXT: vextractf128 $1, %ymm7, %xmm7
+; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload
+; AVX-NEXT: # xmm1 = xmm7[0,1,2],mem[3]
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7]
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm0[1,0],ymm1[0,0],ymm0[5,4],ymm1[4,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm1[3,1],ymm8[0,2],ymm1[7,5],ymm8[4,6]
-; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm12 # 16-byte Folded Reload
-; AVX-NEXT: # xmm12 = xmm13[0,1,2],mem[3]
-; AVX-NEXT: vshufps {{.*#+}} xmm12 = xmm12[3,2,2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1],ymm8[2,3,4,5,6,7]
-; AVX-NEXT: vmovaps 1760(%rdi), %ymm0
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm0[0,1],ymm11[1,3],ymm0[4,5],ymm11[5,7]
-; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm15[0,2],ymm12[2,0],ymm15[4,6],ymm12[6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm12[5,6,7]
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm5[1,0],ymm4[0,0],ymm5[5,4],ymm4[4,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm4[3,1],ymm2[0,2],ymm4[7,5],ymm2[4,6]
-; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1,2],xmm9[3]
-; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5,6,7]
-; AVX-NEXT: vmovaps 1536(%rdi), %ymm0
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm0[0,1],ymm3[1,3],ymm0[4,5],ymm3[5,7]
-; AVX-NEXT: vmovaps %ymm3, %ymm15
-; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm7[0,2],ymm4[2,0],ymm7[4,6],ymm4[6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm4[5,6,7]
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm1[1,0],ymm14[0,0],ymm1[5,4],ymm14[4,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm14[3,1],ymm2[0,2],ymm14[7,5],ymm2[4,6]
-; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
-; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm4 # 16-byte Folded Reload
-; AVX-NEXT: # xmm4 = xmm12[0,1,2],mem[3]
-; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5,6,7]
-; AVX-NEXT: vmovaps 1088(%rdi), %ymm11
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm11[0,1],ymm10[1,3],ymm11[4,5],ymm10[5,7]
-; AVX-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm14[0,2],ymm4[2,0],ymm14[4,6],ymm4[6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm4[5,6,7]
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,3,2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3]
+; AVX-NEXT: vmovaps %xmm5, %xmm7
+; AVX-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovaps 1408(%rdi), %xmm2
+; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm1[1,0],ymm0[0,0],ymm1[5,4],ymm0[4,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm0[3,1],ymm2[0,2],ymm0[7,5],ymm2[4,6]
-; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm4 # 16-byte Folded Reload
-; AVX-NEXT: # xmm4 = xmm9[0,1,2],mem[3]
-; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5,6,7]
-; AVX-NEXT: vmovaps 640(%rdi), %ymm8
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,0],xmm2[0,0]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm8[0,1],ymm5[1,3],ymm8[4,5],ymm5[5,7]
-; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm5[2,1],ymm2[2,0],ymm5[6,5],ymm2[6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm3[0,2],ymm4[2,0],ymm3[4,6],ymm4[6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm4[5,6,7]
+; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm3[0],ymm13[0],ymm3[2],ymm13[2]
+; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm2[0,1,2],mem[3]
+; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm1[1,0],ymm0[0,0],ymm1[5,4],ymm0[4,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm0[3,1],ymm2[0,2],ymm0[7,5],ymm2[4,6]
-; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm4 # 16-byte Folded Reload
-; AVX-NEXT: # xmm4 = xmm6[0,1,2],mem[3]
-; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm2[2,3,4,5,6,7]
-; AVX-NEXT: vmovaps 192(%rdi), %ymm13
-; AVX-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovaps 288(%rdi), %xmm0
+; AVX-NEXT: vmovaps 320(%rdi), %xmm1
+; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm13[0,1],ymm1[1,3],ymm13[4,5],ymm1[5,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,1],ymm0[0,2],ymm1[7,5],ymm0[4,6]
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
+; AVX-NEXT: # xmm1 = mem[0,1,2],xmm1[3]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; AVX-NEXT: vmovaps 416(%rdi), %ymm1
+; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1],ymm10[1,3],ymm1[4,5],ymm10[5,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm2[0,2],ymm7[2,0],ymm2[4,6],ymm7[6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm7[5,6,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2],ymm1[2,0],ymm2[4,6],ymm1[6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm2[2,3,0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm2[3,0],ymm4[0,0],ymm2[7,4],ymm4[4,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm13[1,0],ymm1[2,0],ymm13[5,4],ymm1[6,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm4[2,0],ymm0[6,4],ymm4[6,4]
-; AVX-NEXT: vmovaps 64(%rdi), %xmm2
-; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovaps 96(%rdi), %xmm1
+; AVX-NEXT: vmovaps 736(%rdi), %xmm0
+; AVX-NEXT: vmovaps 768(%rdi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm1[0,1,0,1]
-; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3]
-; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm6[2,3,2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm6 = mem[0],xmm6[1],mem[2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[0,0],ymm1[7,4],ymm0[4,4]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm13[1,0],ymm7[2,0],ymm13[5,4],ymm7[6,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm4[2,0],ymm0[6,4],ymm4[6,4]
-; AVX-NEXT: vmovaps 320(%rdi), %xmm1
-; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm1[0,1,0,1]
-; AVX-NEXT: vmovaps 288(%rdi), %xmm1
-; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0,1,2],xmm4[3]
-; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
-; AVX-NEXT: # xmm6 = mem[2,3,2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm6 = mem[0],xmm6[1],mem[2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,1],ymm0[0,2],ymm1[7,5],ymm0[4,6]
+; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm1 # 16-byte Folded Reload
+; AVX-NEXT: # xmm1 = xmm8[0,1,2],mem[3]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; AVX-NEXT: vmovaps 864(%rdi), %ymm1
+; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1],ymm6[1,3],ymm1[4,5],ymm6[5,7]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2],ymm1[2,0],ymm2[4,6],ymm1[6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm3[2,3,0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm3[3,0],ymm0[0,0],ymm3[7,4],ymm0[4,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,0],ymm5[2,0],ymm8[5,4],ymm5[6,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4]
-; AVX-NEXT: vmovaps 544(%rdi), %xmm1
+; AVX-NEXT: vmovaps 1184(%rdi), %xmm0
+; AVX-NEXT: vmovaps 1216(%rdi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; AVX-NEXT: vmovaps 512(%rdi), %xmm6
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1,2],xmm1[3]
-; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm9[2,3,2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm4 = mem[0],xmm4[1],mem[2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[0,0],ymm1[7,4],ymm0[4,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,1],ymm0[0,2],ymm1[7,5],ymm0[4,6]
+; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm1 # 16-byte Folded Reload
+; AVX-NEXT: # xmm1 = xmm12[0,1,2],mem[3]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; AVX-NEXT: vmovaps 1312(%rdi), %ymm1
+; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufps $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX-NEXT: # ymm1 = ymm1[0,1],mem[1,3],ymm1[4,5],mem[5,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,2],ymm1[2,0],ymm9[4,6],ymm1[6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovaps 1632(%rdi), %xmm0
+; AVX-NEXT: vmovaps 1664(%rdi), %xmm1
+; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX-NEXT: # ymm1 = ymm1[1,0],mem[2,0],ymm1[5,4],mem[6,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4]
-; AVX-NEXT: vmovaps 768(%rdi), %xmm1
+; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,1],ymm0[0,2],ymm1[7,5],ymm0[4,6]
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
+; AVX-NEXT: # xmm1 = mem[0,1,2],xmm1[3]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; AVX-NEXT: vmovaps 1760(%rdi), %ymm1
+; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1],ymm11[1,3],ymm1[4,5],ymm11[5,7]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2],ymm1[2,0],ymm2[4,6],ymm1[6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovaps 1408(%rdi), %xmm0
+; AVX-NEXT: vmovaps 1440(%rdi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; AVX-NEXT: vmovaps 736(%rdi), %xmm4
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3]
-; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
-; AVX-NEXT: # xmm9 = mem[2,3,2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm9 = mem[0],xmm9[1],mem[2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
+; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm5[3,1],ymm0[0,2],ymm5[7,5],ymm0[4,6]
+; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload
+; AVX-NEXT: # xmm1 = xmm7[0,1,2],mem[3]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; AVX-NEXT: vmovaps 1536(%rdi), %ymm1
+; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1],ymm13[1,3],ymm1[4,5],ymm13[5,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,2],ymm1[2,0],ymm3[4,6],ymm1[6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm14[2,3,0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm14[3,0],ymm0[0,0],ymm14[7,4],ymm0[4,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm11[1,0],ymm10[2,0],ymm11[5,4],ymm10[6,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4]
+; AVX-NEXT: vmovaps 960(%rdi), %xmm0
; AVX-NEXT: vmovaps 992(%rdi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; AVX-NEXT: vmovaps 960(%rdi), %xmm3
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX-NEXT: vshufps {{.*#+}} xmm9 = xmm12[2,3,2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm9 = mem[0],xmm9[1],mem[2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,1],ymm0[0,2],ymm1[7,5],ymm0[4,6]
+; AVX-NEXT: vmovaps %xmm14, %xmm12
+; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload
+; AVX-NEXT: # xmm1 = xmm14[0,1,2],mem[3]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; AVX-NEXT: vmovaps 1088(%rdi), %ymm1
+; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1],ymm15[1,3],ymm1[4,5],ymm15[5,7]
+; AVX-NEXT: vmovaps %ymm15, %ymm11
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,2],ymm1[2,0],ymm4[4,6],ymm1[6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovaps 512(%rdi), %xmm0
+; AVX-NEXT: vmovaps 544(%rdi), %xmm8
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0],xmm8[0,0]
+; AVX-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[0,0],ymm1[7,4],ymm0[4,4]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload
-; AVX-NEXT: # ymm1 = ymm11[1,0],mem[2,0],ymm11[5,4],mem[6,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4]
-; AVX-NEXT: vmovaps 1216(%rdi), %xmm1
-; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; AVX-NEXT: vmovaps 1184(%rdi), %xmm2
-; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
+; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,1],ymm0[0,2],ymm1[7,5],ymm0[4,6]
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm1 # 16-byte Folded Reload
+; AVX-NEXT: # xmm1 = xmm6[0,1,2],mem[3]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; AVX-NEXT: vmovaps 640(%rdi), %ymm13
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm13[0,1],ymm1[1,3],ymm13[4,5],ymm1[5,7]
+; AVX-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm5[0,2],ymm2[2,0],ymm5[4,6],ymm2[6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovaps 64(%rdi), %xmm0
+; AVX-NEXT: vmovaps 96(%rdi), %xmm9
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0],xmm9[0,0]
+; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm2[3,1],ymm0[0,2],ymm2[7,5],ymm0[4,6]
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm15 # 16-byte Folded Reload
+; AVX-NEXT: # xmm15 = xmm4[0,1,2],mem[3]
+; AVX-NEXT: vshufps {{.*#+}} xmm15 = xmm15[3,2,2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1],ymm0[2,3,4,5,6,7]
+; AVX-NEXT: vmovaps 192(%rdi), %ymm7
+; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm7[0,1],ymm0[1,3],ymm7[4,5],ymm0[5,7]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm3[0,2],ymm14[2,0],ymm3[4,6],ymm14[6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1,2,3,4],ymm14[5,6,7]
+; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm3[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,0],ymm14[0,0],ymm3[7,4],ymm14[4,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm7[1,0],ymm0[2,0],ymm7[5,4],ymm0[6,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0],ymm14[2,0],ymm3[6,4],ymm14[6,4]
+; AVX-NEXT: vshufps {{.*#+}} xmm14 = xmm9[0,1,0,1]
+; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
+; AVX-NEXT: # xmm14 = mem[0,1,2],xmm14[3]
+; AVX-NEXT: vshufps {{.*#+}} xmm15 = xmm4[2,3,2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm15 = mem[0],xmm15[1],mem[2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm3[4,5,6,7]
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm0[3,0],ymm3[0,0],ymm0[7,4],ymm3[4,4]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm0[1,0],ymm10[2,0],ymm0[5,4],ymm10[6,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0],ymm9[2,0],ymm3[6,4],ymm9[6,4]
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} xmm9 = xmm10[0,1,0,1]
+; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
+; AVX-NEXT: # xmm9 = mem[0,1,2],xmm9[3]
; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
; AVX-NEXT: # xmm14 = mem[2,3,2,3]
; AVX-NEXT: vblendps {{.*#+}} xmm14 = mem[0],xmm14[1],mem[2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm3[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[0,0],ymm1[7,4],ymm0[4,4]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm15[2,0],ymm1[5,4],ymm15[6,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4]
-; AVX-NEXT: vmovaps 1440(%rdi), %xmm1
-; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} xmm14 = xmm1[0,1,0,1]
-; AVX-NEXT: vmovaps 1408(%rdi), %xmm1
-; AVX-NEXT: vblendps {{.*#+}} xmm14 = xmm1[0,1,2],xmm14[3]
-; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
-; AVX-NEXT: # xmm10 = mem[2,3,2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm10 = mem[0],xmm10[1],mem[2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm14[2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm5[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm5[3,0],ymm3[0,0],ymm5[7,4],ymm3[4,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm13[1,0],ymm1[2,0],ymm13[5,4],ymm1[6,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0],ymm4[2,0],ymm3[6,4],ymm4[6,4]
+; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm8[0,1,0,1]
+; AVX-NEXT: vblendps $7, (%rsp), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX-NEXT: # xmm4 = mem[0,1,2],xmm4[3]
+; AVX-NEXT: vshufps {{.*#+}} xmm9 = xmm6[2,3,2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm9 = mem[0],xmm9[1],mem[2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1],xmm4[2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm2[2,3,0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm2[3,0],ymm0[0,0],ymm2[7,4],ymm0[4,4]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm5[1,0],ymm15[2,0],ymm5[5,4],ymm15[6,4]
-; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm0[2,0],ymm10[2,0],ymm0[6,4],ymm10[6,4]
-; AVX-NEXT: vmovaps 1664(%rdi), %xmm0
-; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} xmm14 = xmm0[0,1,0,1]
-; AVX-NEXT: vmovaps 1632(%rdi), %xmm0
-; AVX-NEXT: vblendps {{.*#+}} xmm14 = xmm0[0,1,2],xmm14[3]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm0[3,0],ymm3[0,0],ymm0[7,4],ymm3[4,4]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm4 # 32-byte Folded Reload
+; AVX-NEXT: # ymm4 = ymm8[1,0],mem[2,0],ymm8[5,4],mem[6,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0],ymm4[2,0],ymm3[6,4],ymm4[6,4]
+; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
+; AVX-NEXT: # xmm4 = mem[0,1,0,1]
+; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX-NEXT: # xmm4 = mem[0,1,2],xmm4[3]
; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
; AVX-NEXT: # xmm9 = mem[2,3,2,3]
; AVX-NEXT: vblendps {{.*#+}} xmm9 = mem[0],xmm9[1],mem[2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],xmm14[2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm10[4,5,6,7]
-; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm9 # 32-byte Folded Reload
-; AVX-NEXT: # ymm9 = ymm2[2,1],mem[3,3],ymm2[6,5],mem[7,7]
-; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm10 # 16-byte Folded Reload
-; AVX-NEXT: # xmm10 = mem[0],xmm2[1],mem[2,3]
-; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10
-; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm10[1,0],ymm9[2,0],ymm10[5,4],ymm9[6,4]
-; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm8 # 16-byte Folded Reload
-; AVX-NEXT: # xmm8 = xmm14[0,1,2],mem[3]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm2[0,0],ymm12[1,0],ymm2[4,4],ymm12[5,4]
-; AVX-NEXT: vextractf128 $1, %ymm10, %xmm10
-; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm10[2,0],xmm8[3,2]
-; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
-; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm13[2,1],ymm7[3,3],ymm13[6,5],ymm7[7,7]
-; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
-; AVX-NEXT: # xmm9 = mem[0],xmm9[1],mem[2,3]
-; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9
-; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm9[1,0],ymm8[2,0],ymm9[5,4],ymm8[6,4]
-; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload
-; AVX-NEXT: # xmm7 = mem[0,1,2],xmm7[3]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm10[0,0],ymm13[1,0],ymm10[4,4],ymm13[5,4]
-; AVX-NEXT: vextractf128 $1, %ymm9, %xmm9
-; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm9[2,0],xmm7[3,2]
-; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
-; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1],xmm4[2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm0[3,0],ymm3[0,0],ymm0[7,4],ymm3[4,4]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
-; AVX-NEXT: # ymm7 = ymm7[2,1],mem[3,3],ymm7[6,5],mem[7,7]
-; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
-; AVX-NEXT: # xmm8 = mem[0],xmm8[1],mem[2,3]
-; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8
-; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm8[1,0],ymm7[2,0],ymm8[5,4],ymm7[6,4]
-; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
-; AVX-NEXT: # xmm6 = mem[0,1,2],xmm6[3]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm8 # 32-byte Folded Reload
-; AVX-NEXT: # ymm8 = ymm9[0,0],mem[1,0],ymm9[4,4],mem[5,4]
-; AVX-NEXT: vextractf128 $1, %ymm8, %xmm8
-; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm8[2,0],xmm6[3,2]
-; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
-; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm7[1,0],ymm11[2,0],ymm7[5,4],ymm11[6,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0],ymm4[2,0],ymm3[6,4],ymm4[6,4]
+; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
+; AVX-NEXT: # xmm4 = mem[0,1,0,1]
+; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX-NEXT: # xmm4 = mem[0,1,2],xmm4[3]
+; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm12[2,3,2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm5 = mem[0],xmm5[1],mem[2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm0[3,0],ymm3[0,0],ymm0[7,4],ymm3[4,4]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
-; AVX-NEXT: # ymm6 = ymm6[2,1],mem[3,3],ymm6[6,5],mem[7,7]
-; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload
-; AVX-NEXT: # xmm7 = mem[0],xmm7[1],mem[2,3]
-; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7
-; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm7[1,0],ymm6[2,0],ymm7[5,4],ymm6[6,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm6[1,0],ymm14[2,0],ymm6[5,4],ymm14[6,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0],ymm4[2,0],ymm3[6,4],ymm4[6,4]
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm15[0,1,0,1]
+; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX-NEXT: # xmm4 = mem[0,1,2],xmm4[3]
+; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
+; AVX-NEXT: # xmm5 = mem[2,3,2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm5 = mem[0],xmm5[1],mem[2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm0[3,0],ymm3[0,0],ymm0[7,4],ymm3[4,4]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
+; AVX-NEXT: # ymm4 = ymm0[1,0],mem[2,0],ymm0[5,4],mem[6,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0],ymm4[2,0],ymm3[6,4],ymm4[6,4]
+; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
+; AVX-NEXT: # xmm4 = mem[0,1,0,1]
+; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX-NEXT: # xmm4 = mem[0,1,2],xmm4[3]
+; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
+; AVX-NEXT: # xmm5 = mem[2,3,2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm5 = mem[0],xmm5[1],mem[2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm0[3,0],ymm3[0,0],ymm0[7,4],ymm3[4,4]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm1[1,0],ymm12[2,0],ymm1[5,4],ymm12[6,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0],ymm4[2,0],ymm3[6,4],ymm4[6,4]
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm2[0,1,0,1]
; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
; AVX-NEXT: # xmm4 = mem[0,1,2],xmm4[3]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm7 # 32-byte Folded Reload
-; AVX-NEXT: # ymm7 = ymm8[0,0],mem[1,0],ymm8[4,4],mem[5,4]
-; AVX-NEXT: vextractf128 $1, %ymm7, %xmm7
-; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm7[2,0],xmm4[3,2]
-; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
-; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
-; AVX-NEXT: # ymm4 = ymm4[2,1],mem[3,3],ymm4[6,5],mem[7,7]
-; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
-; AVX-NEXT: # xmm6 = mem[0],xmm6[1],mem[2,3]
-; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
-; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm6[1,0],ymm4[2,0],ymm6[5,4],ymm4[6,4]
-; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
-; AVX-NEXT: # xmm3 = mem[0,1,2],xmm3[3]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm6 # 32-byte Folded Reload
-; AVX-NEXT: # ymm6 = ymm7[0,0],mem[1,0],ymm7[4,4],mem[5,4]
-; AVX-NEXT: vextractf128 $1, %ymm6, %xmm6
-; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm6[2,0],xmm3[3,2]
-; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
+; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
+; AVX-NEXT: # xmm5 = mem[2,3,2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm5 = mem[0],xmm5[1],mem[2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
+; AVX-NEXT: # ymm3 = ymm0[2,1],mem[3,3],ymm0[6,5],mem[7,7]
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
+; AVX-NEXT: # xmm4 = mem[0],xmm0[1],mem[2,3]
+; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
+; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,0],ymm3[2,0],ymm4[5,4],ymm3[6,4]
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
+; AVX-NEXT: # xmm4 = xmm0[0,1,2],mem[3]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm0[0,0],ymm9[1,0],ymm0[4,4],ymm9[5,4]
+; AVX-NEXT: vextractf128 $1, %ymm5, %xmm5
+; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm5[2,0],xmm4[3,2]
+; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload
-; AVX-NEXT: # ymm3 = ymm11[2,1],mem[3,3],ymm11[6,5],mem[7,7]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
+; AVX-NEXT: # ymm3 = ymm3[2,1],mem[3,3],ymm3[6,5],mem[7,7]
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
; AVX-NEXT: # xmm4 = mem[0],xmm4[1],mem[2,3]
; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,0],ymm3[2,0],ymm4[5,4],ymm3[6,4]
-; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm4 # 16-byte Folded Reload
-; AVX-NEXT: # xmm4 = xmm11[0,1,2],mem[3]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
-; AVX-NEXT: # ymm6 = ymm6[0,0],mem[1,0],ymm6[4,4],mem[5,4]
-; AVX-NEXT: vextractf128 $1, %ymm6, %xmm6
-; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm6[2,0],xmm4[3,2]
+; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm4 # 16-byte Folded Reload
+; AVX-NEXT: # xmm4 = xmm10[0,1,2],mem[3]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm11[0,0],ymm13[1,0],ymm11[4,4],ymm13[5,4]
+; AVX-NEXT: vextractf128 $1, %ymm5, %xmm5
+; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm5[2,0],xmm4[3,2]
; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
@@ -10829,278 +10750,347 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: # xmm4 = mem[0],xmm4[1],mem[2,3]
; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,0],ymm3[2,0],ymm4[5,4],ymm3[6,4]
-; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; AVX-NEXT: # xmm1 = mem[0,1,2],xmm1[3]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
-; AVX-NEXT: # ymm4 = ymm4[0,0],mem[1,0],ymm4[4,4],mem[5,4]
-; AVX-NEXT: vextractf128 $1, %ymm4, %xmm4
-; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,0],xmm1[3,2]
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
-; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm5[2,1],ymm15[3,3],ymm5[6,5],ymm15[7,7]
-; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
-; AVX-NEXT: # xmm3 = mem[0],xmm3[1],mem[2,3]
-; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,0],ymm1[2,0],ymm3[5,4],ymm1[6,4]
-; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm3 # 32-byte Folded Reload
-; AVX-NEXT: # ymm3 = ymm6[0,0],mem[1,0],ymm6[4,4],mem[5,4]
-; AVX-NEXT: vextractf128 $1, %ymm3, %xmm3
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm3[2,0],xmm0[3,2]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = mem[0,1,0,1]
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm14[3]
-; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm2[1,0],ymm12[2,0],ymm2[5,4],ymm12[6,4]
-; AVX-NEXT: vextractf128 $1, %ymm3, %xmm3
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm3[2,0],xmm0[2,3]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3,0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm1[3,0],ymm3[0,0],ymm1[7,4],ymm3[4,4]
-; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
-; AVX-NEXT: # xmm3 = mem[2,3,2,3]
-; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
-; AVX-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3]
-; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
-; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,0],ymm3[4,5],ymm2[6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = mem[0,1,0,1]
-; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = xmm0[0,1,2],mem[3]
-; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm10[1,0],ymm13[2,0],ymm10[5,4],ymm13[6,4]
-; AVX-NEXT: vextractf128 $1, %ymm3, %xmm3
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm3[2,0],xmm0[2,3]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm1[2,3,0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm1[3,0],ymm3[0,0],ymm1[7,4],ymm3[4,4]
-; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
-; AVX-NEXT: # xmm4 = mem[2,3,2,3]
-; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
-; AVX-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3]
-; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
-; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,0],ymm4[4,5],ymm3[6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm3[4,5,6,7]
-; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = mem[0,1,0,1]
-; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = xmm0[0,1,2],mem[3]
-; AVX-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload
-; AVX-NEXT: # ymm3 = ymm9[1,0],mem[2,0],ymm9[5,4],mem[6,4]
-; AVX-NEXT: vextractf128 $1, %ymm3, %xmm3
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm3[2,0],xmm0[2,3]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3,0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,0],ymm3[0,0],ymm4[7,4],ymm3[4,4]
-; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
-; AVX-NEXT: # xmm4 = mem[2,3,2,3]
-; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
-; AVX-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3]
-; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
-; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,0],ymm4[4,5],ymm3[6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
-; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
-; AVX-NEXT: # xmm3 = mem[0,1,0,1]
-; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
-; AVX-NEXT: # xmm3 = xmm3[0,1,2],mem[3]
-; AVX-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm4 # 32-byte Folded Reload
-; AVX-NEXT: # ymm4 = ymm8[1,0],mem[2,0],ymm8[5,4],mem[6,4]
-; AVX-NEXT: vextractf128 $1, %ymm4, %xmm4
-; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm4[2,0],xmm3[2,3]
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX-NEXT: vblendps $8, (%rsp), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX-NEXT: # xmm4 = xmm4[0,1,2],mem[3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm5[2,3,0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm5[3,0],ymm4[0,0],ymm5[7,4],ymm4[4,4]
-; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
-; AVX-NEXT: # xmm8 = mem[2,3,2,3]
-; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
-; AVX-NEXT: # xmm8 = xmm8[0],mem[1],xmm8[2,3]
-; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8
-; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,0],ymm8[4,5],ymm4[6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
-; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
-; AVX-NEXT: # xmm4 = mem[0,1,0,1]
+; AVX-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX-NEXT: # ymm5 = ymm5[0,0],mem[1,0],ymm5[4,4],mem[5,4]
+; AVX-NEXT: vextractf128 $1, %ymm5, %xmm5
+; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm5[2,0],xmm4[3,2]
+; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm3 # 32-byte Folded Reload
+; AVX-NEXT: # ymm3 = ymm8[2,1],mem[3,3],ymm8[6,5],mem[7,7]
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX-NEXT: # xmm4 = mem[0],xmm4[1],mem[2,3]
+; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
+; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,0],ymm3[2,0],ymm4[5,4],ymm3[6,4]
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
; AVX-NEXT: # xmm4 = xmm4[0,1,2],mem[3]
-; AVX-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload
-; AVX-NEXT: # ymm8 = ymm7[1,0],mem[2,0],ymm7[5,4],mem[6,4]
-; AVX-NEXT: vextractf128 $1, %ymm8, %xmm8
-; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm8[2,0],xmm4[2,3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm5[2,3,0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm5[3,0],ymm8[0,0],ymm5[7,4],ymm8[4,4]
-; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
-; AVX-NEXT: # xmm9 = mem[2,3,2,3]
-; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
-; AVX-NEXT: # xmm9 = xmm9[0],mem[1],xmm9[2,3]
-; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9
-; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,0],ymm9[4,5],ymm8[6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm8[4,5,6,7]
-; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
-; AVX-NEXT: # xmm4 = mem[0,1,0,1]
-; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1,2],xmm11[3]
+; AVX-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX-NEXT: # ymm5 = ymm5[0,0],mem[1,0],ymm5[4,4],mem[5,4]
+; AVX-NEXT: vextractf128 $1, %ymm5, %xmm5
+; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm5[2,0],xmm4[3,2]
+; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm3 # 32-byte Folded Reload
+; AVX-NEXT: # ymm3 = ymm7[2,1],mem[3,3],ymm7[6,5],mem[7,7]
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX-NEXT: # xmm4 = mem[0],xmm4[1],mem[2,3]
+; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
+; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,0],ymm3[2,0],ymm4[5,4],ymm3[6,4]
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm4 # 16-byte Folded Reload
+; AVX-NEXT: # xmm4 = xmm7[0,1,2],mem[3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm9 # 32-byte Folded Reload
-; AVX-NEXT: # ymm9 = ymm5[1,0],mem[2,0],ymm5[5,4],mem[6,4]
-; AVX-NEXT: vextractf128 $1, %ymm9, %xmm9
-; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm9[2,0],xmm4[2,3]
+; AVX-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX-NEXT: # ymm5 = ymm5[0,0],mem[1,0],ymm5[4,4],mem[5,4]
+; AVX-NEXT: vextractf128 $1, %ymm5, %xmm5
+; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm5[2,0],xmm4[3,2]
+; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm6[2,1],ymm14[3,3],ymm6[6,5],ymm14[7,7]
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX-NEXT: # xmm4 = mem[0],xmm4[1],mem[2,3]
+; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
+; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,0],ymm3[2,0],ymm4[5,4],ymm3[6,4]
+; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm4 # 16-byte Folded Reload
+; AVX-NEXT: # xmm4 = xmm15[0,1,2],mem[3]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX-NEXT: # ymm5 = ymm5[0,0],mem[1,0],ymm5[4,4],mem[5,4]
+; AVX-NEXT: vextractf128 $1, %ymm5, %xmm5
+; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm5[2,0],xmm4[3,2]
+; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT: vshufps $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
+; AVX-NEXT: # ymm3 = ymm3[2,1],mem[3,3],ymm3[6,5],mem[7,7]
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX-NEXT: # xmm4 = mem[0],xmm4[1],mem[2,3]
+; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
+; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,0],ymm3[2,0],ymm4[5,4],ymm3[6,4]
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm4 # 16-byte Folded Reload
+; AVX-NEXT: # xmm4 = xmm15[0,1,2],mem[3]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm14 # 32-byte Folded Reload
+; AVX-NEXT: # ymm14 = ymm5[0,0],mem[1,0],ymm5[4,4],mem[5,4]
+; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14
+; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm14[2,0],xmm4[3,2]
+; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm1[2,1],ymm12[3,3],ymm1[6,5],ymm12[7,7]
+; AVX-NEXT: vmovaps %ymm1, %ymm12
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX-NEXT: vblendps $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm14 # 16-byte Folded Reload
+; AVX-NEXT: # xmm14 = mem[0],xmm4[1],mem[2,3]
+; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14
+; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm14[1,0],ymm3[2,0],ymm14[5,4],ymm3[6,4]
+; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm14 # 16-byte Folded Reload
+; AVX-NEXT: # xmm14 = xmm2[0,1,2],mem[3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm5[2,3,0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm5[3,0],ymm9[0,0],ymm5[7,4],ymm9[4,4]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm4[0,0],ymm5[1,0],ymm4[4,4],ymm5[5,4]
+; AVX-NEXT: vextractf128 $1, %ymm10, %xmm10
+; AVX-NEXT: vshufps {{.*#+}} xmm10 = xmm10[2,0],xmm14[3,2]
+; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3],ymm3[4,5,6,7]
+; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
+; AVX-NEXT: # xmm10 = mem[0,1,0,1]
+; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm2 # 16-byte Folded Reload
+; AVX-NEXT: # xmm2 = xmm10[0,1,2],mem[3]
+; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm0[1,0],ymm9[2,0],ymm0[5,4],ymm9[6,4]
+; AVX-NEXT: vextractf128 $1, %ymm10, %xmm10
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm10[2,0],xmm2[2,3]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm0[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm10[0,0],ymm0[7,4],ymm10[4,4]
; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
; AVX-NEXT: # xmm10 = mem[2,3,2,3]
; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload
; AVX-NEXT: # xmm10 = xmm10[0],mem[1],xmm10[2,3]
; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10
-; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,0],ymm10[4,5],ymm9[6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm9[4,5,6,7]
-; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
-; AVX-NEXT: # xmm9 = mem[0,1,0,1]
-; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm5 # 16-byte Folded Reload
-; AVX-NEXT: # xmm5 = xmm9[0,1,2],mem[3]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload
-; AVX-NEXT: # ymm9 = ymm7[1,0],mem[2,0],ymm7[5,4],mem[6,4]
-; AVX-NEXT: vextractf128 $1, %ymm9, %xmm9
-; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm9[2,0],xmm5[2,3]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm7[2,3,0,1]
-; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm7[3,0],ymm9[0,0],ymm7[7,4],ymm9[4,4]
+; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,0],ymm10[4,5],ymm0[6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; AVX-NEXT: # xmm2 = mem[0,1,0,1]
+; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
+; AVX-NEXT: # xmm2 = xmm2[0,1,2],mem[3]
+; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm11[1,0],ymm13[2,0],ymm11[5,4],ymm13[6,4]
+; AVX-NEXT: vextractf128 $1, %ymm10, %xmm10
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm10[2,0],xmm2[2,3]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm1[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm1[3,0],ymm10[0,0],ymm1[7,4],ymm10[4,4]
+; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
+; AVX-NEXT: # xmm11 = mem[2,3,2,3]
+; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload
+; AVX-NEXT: # xmm11 = xmm11[0],mem[1],xmm11[2,3]
+; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11
+; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,0],ymm11[4,5],ymm10[6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7]
+; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
+; AVX-NEXT: # xmm10 = mem[0,1,0,1]
+; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm8 # 16-byte Folded Reload
+; AVX-NEXT: # xmm8 = xmm10[0,1,2],mem[3]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload
+; AVX-NEXT: # ymm10 = ymm1[1,0],mem[2,0],ymm1[5,4],mem[6,4]
+; AVX-NEXT: vextractf128 $1, %ymm10, %xmm10
+; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm10[2,0],xmm8[2,3]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm1[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm10[0,0],ymm1[7,4],ymm10[4,4]
; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
; AVX-NEXT: # xmm10 = mem[2,3,2,3]
; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload
; AVX-NEXT: # xmm10 = xmm10[0],mem[1],xmm10[2,3]
; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10
-; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,0],ymm10[4,5],ymm9[6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7]
-; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
-; AVX-NEXT: # xmm9 = mem[0,1,0,1]
-; AVX-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1,2],xmm15[3]
+; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,1],ymm1[2,0],ymm10[4,5],ymm1[6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7]
+; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
+; AVX-NEXT: # xmm8 = mem[0,1,0,1]
+; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
+; AVX-NEXT: # xmm8 = xmm8[0,1,2],mem[3]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm10 # 32-byte Folded Reload
; AVX-NEXT: # ymm10 = ymm6[1,0],mem[2,0],ymm6[5,4],mem[6,4]
; AVX-NEXT: vextractf128 $1, %ymm10, %xmm10
-; AVX-NEXT: vshufps {{.*#+}} xmm9 = xmm10[2,0],xmm9[2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm10[2,0],xmm8[2,3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm6[2,3,0,1]
; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm6[3,0],ymm10[0,0],ymm6[7,4],ymm10[4,4]
-; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
-; AVX-NEXT: # xmm12 = mem[2,3,2,3]
-; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload
-; AVX-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3]
-; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12
-; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,0],ymm12[4,5],ymm10[6,4]
-; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm6, 192(%rsi)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm6, 128(%rsi)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm6, 64(%rsi)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm6, (%rsi)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm6, 224(%rsi)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm10, 160(%rsi)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm10, 96(%rsi)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm10, 32(%rsi)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm6, 192(%rdx)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm6, 128(%rdx)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm6, 64(%rdx)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm6, (%rdx)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm6, 224(%rdx)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm6, 160(%rdx)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm6, 96(%rdx)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm6, 32(%rdx)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm6, 192(%rcx)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm6, 128(%rcx)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm6, 64(%rcx)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm6, (%rcx)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm6, 224(%rcx)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm6, 160(%rcx)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm6, 96(%rcx)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm6, 32(%rcx)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm6, (%r8)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm6, 64(%r8)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm6, 128(%r8)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm6, 192(%r8)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm6, 224(%r8)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm6, 160(%r8)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm6, 96(%r8)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm6, 32(%r8)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm6, 224(%r9)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm6, 192(%r9)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm6, 160(%r9)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm6, 128(%r9)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm6, 96(%r9)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm6, 64(%r9)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm6, 32(%r9)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm6, (%r9)
-; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm6, 224(%rax)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm6, 192(%rax)
+; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
+; AVX-NEXT: # xmm11 = mem[2,3,2,3]
+; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload
+; AVX-NEXT: # xmm11 = xmm11[0],mem[1],xmm11[2,3]
+; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11
+; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,0],ymm11[4,5],ymm10[6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
+; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
+; AVX-NEXT: # xmm10 = mem[0,1,0,1]
+; AVX-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1,2],xmm7[3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm6, 160(%rax)
+; AVX-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm11 # 32-byte Folded Reload
+; AVX-NEXT: # ymm11 = ymm6[1,0],mem[2,0],ymm6[5,4],mem[6,4]
+; AVX-NEXT: vextractf128 $1, %ymm11, %xmm11
+; AVX-NEXT: vshufps {{.*#+}} xmm10 = xmm11[2,0],xmm10[2,3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm6, 128(%rax)
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm6[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm6[3,0],ymm11[0,0],ymm6[7,4],ymm11[4,4]
+; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
+; AVX-NEXT: # xmm11 = mem[2,3,2,3]
+; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload
+; AVX-NEXT: # xmm11 = xmm11[0],mem[1],xmm11[2,3]
+; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11
+; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm11[0,1],ymm7[2,0],ymm11[4,5],ymm7[6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3],ymm7[4,5,6,7]
+; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
+; AVX-NEXT: # xmm10 = mem[0,1,0,1]
+; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload
+; AVX-NEXT: # xmm10 = xmm10[0,1,2],mem[3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm6, 96(%rax)
+; AVX-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm11 # 32-byte Folded Reload
+; AVX-NEXT: # ymm11 = ymm6[1,0],mem[2,0],ymm6[5,4],mem[6,4]
+; AVX-NEXT: vextractf128 $1, %ymm11, %xmm11
+; AVX-NEXT: vshufps {{.*#+}} xmm10 = xmm11[2,0],xmm10[2,3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm6, 64(%rax)
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm6[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm6[3,0],ymm11[0,0],ymm6[7,4],ymm11[4,4]
+; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
+; AVX-NEXT: # xmm14 = mem[2,3,2,3]
+; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
+; AVX-NEXT: # xmm14 = xmm14[0],mem[1],xmm14[2,3]
+; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14
+; AVX-NEXT: vshufps {{.*#+}} ymm11 = ymm14[0,1],ymm11[2,0],ymm14[4,5],ymm11[6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm10[0,1,2,3],ymm11[4,5,6,7]
+; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
+; AVX-NEXT: # xmm10 = mem[0,1,0,1]
+; AVX-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1,2],xmm15[3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm6, 32(%rax)
+; AVX-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm14 # 32-byte Folded Reload
+; AVX-NEXT: # ymm14 = ymm6[1,0],mem[2,0],ymm6[5,4],mem[6,4]
+; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14
+; AVX-NEXT: vshufps {{.*#+}} xmm10 = xmm14[2,0],xmm10[2,3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm6, (%rax)
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm6[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm6[3,0],ymm14[0,0],ymm6[7,4],ymm14[4,4]
+; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
+; AVX-NEXT: # xmm14 = mem[2,3,2,3]
+; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
+; AVX-NEXT: # xmm14 = xmm14[0],mem[1],xmm14[2,3]
+; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14
+; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,0],ymm14[4,5],ymm13[6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm10[0,1,2,3],ymm13[4,5,6,7]
+; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
+; AVX-NEXT: # xmm10 = mem[0,1,0,1]
+; AVX-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload
+; AVX-NEXT: # xmm10 = xmm10[0,1,2],mem[3]
+; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm4[1,0],ymm5[2,0],ymm4[5,4],ymm5[6,4]
+; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14
+; AVX-NEXT: vshufps {{.*#+}} xmm10 = xmm14[2,0],xmm10[2,3]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm12[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm12[3,0],ymm14[0,0],ymm12[7,4],ymm14[4,4]
+; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
+; AVX-NEXT: # xmm9 = mem[2,3,2,3]
+; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
+; AVX-NEXT: # xmm9 = xmm9[0],mem[1],xmm9[2,3]
+; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9
+; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,1],ymm14[2,0],ymm9[4,5],ymm14[6,4]
+; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm10[0,1,2,3],ymm9[4,5,6,7]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm4, 192(%rsi)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm4, 128(%rsi)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm4, 64(%rsi)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm4, (%rsi)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm4, 224(%rsi)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm9, 160(%rsi)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm9, 96(%rsi)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm9, 32(%rsi)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm4, 192(%rdx)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm4, 128(%rdx)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm4, 64(%rdx)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm4, (%rdx)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm4, 224(%rdx)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm4, 160(%rdx)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm4, 96(%rdx)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm4, 32(%rdx)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm4, 192(%rcx)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm4, 128(%rcx)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm4, 64(%rcx)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm4, (%rcx)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm4, 224(%rcx)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm4, 160(%rcx)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm4, 96(%rcx)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm4, 32(%rcx)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm4, (%r8)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm4, 64(%r8)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm4, 128(%r8)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm4, 192(%r8)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm4, 224(%r8)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm4, 160(%r8)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm4, 96(%r8)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm4, 32(%r8)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm4, 224(%r9)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm4, 192(%r9)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm4, 160(%r9)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm4, 128(%r9)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm4, 96(%r9)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm4, 64(%r9)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm4, 32(%r9)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm4, (%r9)
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: vmovaps %ymm9, 224(%rax)
-; AVX-NEXT: vmovaps %ymm5, 192(%rax)
-; AVX-NEXT: vmovaps %ymm4, 160(%rax)
-; AVX-NEXT: vmovaps %ymm8, 128(%rax)
+; AVX-NEXT: vmovaps %ymm3, 224(%rax)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm3, 192(%rax)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm3, 160(%rax)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm3, 128(%rax)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm3, 96(%rax)
-; AVX-NEXT: vmovaps %ymm0, 64(%rax)
-; AVX-NEXT: vmovaps %ymm1, 32(%rax)
-; AVX-NEXT: vmovaps %ymm2, (%rax)
-; AVX-NEXT: addq $3176, %rsp # imm = 0xC68
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm3, 64(%rax)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm3, 32(%rax)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm3, (%rax)
+; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: vmovaps %ymm14, 224(%rax)
+; AVX-NEXT: vmovaps %ymm13, 192(%rax)
+; AVX-NEXT: vmovaps %ymm11, 160(%rax)
+; AVX-NEXT: vmovaps %ymm7, 128(%rax)
+; AVX-NEXT: vmovaps %ymm8, 96(%rax)
+; AVX-NEXT: vmovaps %ymm1, 64(%rax)
+; AVX-NEXT: vmovaps %ymm2, 32(%rax)
+; AVX-NEXT: vmovaps %ymm0, (%rax)
+; AVX-NEXT: addq $3224, %rsp # imm = 0xC98
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll
index 93a84e30412d6..0e3dab6580318 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll
@@ -84,105 +84,113 @@ define void @load_i64_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;
; AVX512-LABEL: load_i64_stride3_vf2:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpermpd {{.*#+}} zmm0 = mem[0,3,2,3,4,7,6,7]
-; AVX512-NEXT: vmovdqa 32(%rdi), %xmm1
-; AVX512-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
-; AVX512-NEXT: vmovaps %xmm0, (%rsi)
-; AVX512-NEXT: vmovdqa %xmm2, (%rdx)
-; AVX512-NEXT: vmovdqa %xmm1, (%rcx)
+; AVX512-NEXT: vmovaps (%rdi), %zmm0
+; AVX512-NEXT: vpermpd {{.*#+}} zmm1 = zmm0[0,3,2,3,4,7,6,7]
+; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm2 = [1,4]
+; AVX512-NEXT: vpermpd %zmm0, %zmm2, %zmm0
+; AVX512-NEXT: vmovaps 16(%rdi), %xmm2
+; AVX512-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
+; AVX512-NEXT: vmovaps %xmm1, (%rsi)
+; AVX512-NEXT: vmovaps %xmm0, (%rdx)
+; AVX512-NEXT: vmovaps %xmm2, (%rcx)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: load_i64_stride3_vf2:
; AVX512-FCP: # %bb.0:
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [1,4]
-; AVX512-FCP-NEXT: vmovaps (%rdi), %zmm1
-; AVX512-FCP-NEXT: vpermpd %zmm1, %zmm0, %zmm0
-; AVX512-FCP-NEXT: vpermpd {{.*#+}} zmm1 = zmm1[0,3,2,3,4,7,6,7]
-; AVX512-FCP-NEXT: vmovaps 16(%rdi), %xmm2
-; AVX512-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
+; AVX512-FCP-NEXT: vmovaps (%rdi), %zmm0
+; AVX512-FCP-NEXT: vpermpd {{.*#+}} zmm1 = zmm0[0,3,2,3,4,7,6,7]
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm2 = [1,4]
+; AVX512-FCP-NEXT: vpermpd %zmm0, %zmm2, %zmm2
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [2,5]
+; AVX512-FCP-NEXT: vpermpd %zmm0, %zmm3, %zmm0
; AVX512-FCP-NEXT: vmovaps %xmm1, (%rsi)
-; AVX512-FCP-NEXT: vmovaps %xmm0, (%rdx)
-; AVX512-FCP-NEXT: vmovaps %xmm2, (%rcx)
+; AVX512-FCP-NEXT: vmovaps %xmm2, (%rdx)
+; AVX512-FCP-NEXT: vmovaps %xmm0, (%rcx)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: load_i64_stride3_vf2:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpermpd {{.*#+}} zmm0 = mem[0,3,2,3,4,7,6,7]
-; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm1
-; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
-; AVX512DQ-NEXT: vmovaps %xmm0, (%rsi)
-; AVX512DQ-NEXT: vmovdqa %xmm2, (%rdx)
-; AVX512DQ-NEXT: vmovdqa %xmm1, (%rcx)
+; AVX512DQ-NEXT: vmovaps (%rdi), %zmm0
+; AVX512DQ-NEXT: vpermpd {{.*#+}} zmm1 = zmm0[0,3,2,3,4,7,6,7]
+; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm2 = [1,4]
+; AVX512DQ-NEXT: vpermpd %zmm0, %zmm2, %zmm0
+; AVX512DQ-NEXT: vmovaps 16(%rdi), %xmm2
+; AVX512DQ-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
+; AVX512DQ-NEXT: vmovaps %xmm1, (%rsi)
+; AVX512DQ-NEXT: vmovaps %xmm0, (%rdx)
+; AVX512DQ-NEXT: vmovaps %xmm2, (%rcx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: load_i64_stride3_vf2:
; AVX512DQ-FCP: # %bb.0:
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [1,4]
-; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %zmm1
-; AVX512DQ-FCP-NEXT: vpermpd %zmm1, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT: vpermpd {{.*#+}} zmm1 = zmm1[0,3,2,3,4,7,6,7]
-; AVX512DQ-FCP-NEXT: vmovaps 16(%rdi), %xmm2
-; AVX512DQ-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
+; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %zmm0
+; AVX512DQ-FCP-NEXT: vpermpd {{.*#+}} zmm1 = zmm0[0,3,2,3,4,7,6,7]
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm2 = [1,4]
+; AVX512DQ-FCP-NEXT: vpermpd %zmm0, %zmm2, %zmm2
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [2,5]
+; AVX512DQ-FCP-NEXT: vpermpd %zmm0, %zmm3, %zmm0
; AVX512DQ-FCP-NEXT: vmovaps %xmm1, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovaps %xmm0, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovaps %xmm2, (%rcx)
+; AVX512DQ-FCP-NEXT: vmovaps %xmm2, (%rdx)
+; AVX512DQ-FCP-NEXT: vmovaps %xmm0, (%rcx)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
; AVX512BW-LABEL: load_i64_stride3_vf2:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpermpd {{.*#+}} zmm0 = mem[0,3,2,3,4,7,6,7]
-; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm1
-; AVX512BW-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
-; AVX512BW-NEXT: vmovaps %xmm0, (%rsi)
-; AVX512BW-NEXT: vmovdqa %xmm2, (%rdx)
-; AVX512BW-NEXT: vmovdqa %xmm1, (%rcx)
+; AVX512BW-NEXT: vmovaps (%rdi), %zmm0
+; AVX512BW-NEXT: vpermpd {{.*#+}} zmm1 = zmm0[0,3,2,3,4,7,6,7]
+; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm2 = [1,4]
+; AVX512BW-NEXT: vpermpd %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT: vmovaps 16(%rdi), %xmm2
+; AVX512BW-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
+; AVX512BW-NEXT: vmovaps %xmm1, (%rsi)
+; AVX512BW-NEXT: vmovaps %xmm0, (%rdx)
+; AVX512BW-NEXT: vmovaps %xmm2, (%rcx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BW-FCP-LABEL: load_i64_stride3_vf2:
; AVX512BW-FCP: # %bb.0:
-; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [1,4]
-; AVX512BW-FCP-NEXT: vmovaps (%rdi), %zmm1
-; AVX512BW-FCP-NEXT: vpermpd %zmm1, %zmm0, %zmm0
-; AVX512BW-FCP-NEXT: vpermpd {{.*#+}} zmm1 = zmm1[0,3,2,3,4,7,6,7]
-; AVX512BW-FCP-NEXT: vmovaps 16(%rdi), %xmm2
-; AVX512BW-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
+; AVX512BW-FCP-NEXT: vmovaps (%rdi), %zmm0
+; AVX512BW-FCP-NEXT: vpermpd {{.*#+}} zmm1 = zmm0[0,3,2,3,4,7,6,7]
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm2 = [1,4]
+; AVX512BW-FCP-NEXT: vpermpd %zmm0, %zmm2, %zmm2
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [2,5]
+; AVX512BW-FCP-NEXT: vpermpd %zmm0, %zmm3, %zmm0
; AVX512BW-FCP-NEXT: vmovaps %xmm1, (%rsi)
-; AVX512BW-FCP-NEXT: vmovaps %xmm0, (%rdx)
-; AVX512BW-FCP-NEXT: vmovaps %xmm2, (%rcx)
+; AVX512BW-FCP-NEXT: vmovaps %xmm2, (%rdx)
+; AVX512BW-FCP-NEXT: vmovaps %xmm0, (%rcx)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: load_i64_stride3_vf2:
; AVX512DQ-BW: # %bb.0:
-; AVX512DQ-BW-NEXT: vpermpd {{.*#+}} zmm0 = mem[0,3,2,3,4,7,6,7]
-; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %xmm1
-; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
-; AVX512DQ-BW-NEXT: vmovaps %xmm0, (%rsi)
-; AVX512DQ-BW-NEXT: vmovdqa %xmm2, (%rdx)
-; AVX512DQ-BW-NEXT: vmovdqa %xmm1, (%rcx)
+; AVX512DQ-BW-NEXT: vmovaps (%rdi), %zmm0
+; AVX512DQ-BW-NEXT: vpermpd {{.*#+}} zmm1 = zmm0[0,3,2,3,4,7,6,7]
+; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm2 = [1,4]
+; AVX512DQ-BW-NEXT: vpermpd %zmm0, %zmm2, %zmm0
+; AVX512DQ-BW-NEXT: vmovaps 16(%rdi), %xmm2
+; AVX512DQ-BW-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
+; AVX512DQ-BW-NEXT: vmovaps %xmm1, (%rsi)
+; AVX512DQ-BW-NEXT: vmovaps %xmm0, (%rdx)
+; AVX512DQ-BW-NEXT: vmovaps %xmm2, (%rcx)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: load_i64_stride3_vf2:
; AVX512DQ-BW-FCP: # %bb.0:
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [1,4]
-; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %zmm1
-; AVX512DQ-BW-FCP-NEXT: vpermpd %zmm1, %zmm0, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vpermpd {{.*#+}} zmm1 = zmm1[0,3,2,3,4,7,6,7]
-; AVX512DQ-BW-FCP-NEXT: vmovaps 16(%rdi), %xmm2
-; AVX512DQ-BW-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
+; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %zmm0
+; AVX512DQ-BW-FCP-NEXT: vpermpd {{.*#+}} zmm1 = zmm0[0,3,2,3,4,7,6,7]
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm2 = [1,4]
+; AVX512DQ-BW-FCP-NEXT: vpermpd %zmm0, %zmm2, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [2,5]
+; AVX512DQ-BW-FCP-NEXT: vpermpd %zmm0, %zmm3, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm1, (%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm0, (%rdx)
-; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm2, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm2, (%rdx)
+; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm0, (%rcx)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%wide.vec = load <6 x i64>, ptr %in.vec, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll
index 0c7c3f4b16646..e6fff38ade14b 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-4.ll
@@ -103,18 +103,18 @@ define void @load_i64_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;
; AVX512-LABEL: load_i64_stride4_vf2:
; AVX512: # %bb.0:
-; AVX512-NEXT: vmovaps (%rdi), %xmm0
-; AVX512-NEXT: vmovaps 32(%rdi), %xmm1
-; AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
-; AVX512-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX512-NEXT: vmovaps 32(%rdi), %ymm1
-; AVX512-NEXT: vmovaps (%rdi), %ymm3
-; AVX512-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
-; AVX512-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
-; AVX512-NEXT: vmovaps %xmm2, (%rsi)
-; AVX512-NEXT: vmovaps %xmm0, (%rdx)
+; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm0 = [0,4]
+; AVX512-NEXT: vpermpd (%rdi), %zmm0, %zmm0
+; AVX512-NEXT: vmovaps (%rdi), %xmm1
+; AVX512-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1]
+; AVX512-NEXT: vmovaps (%rdi), %ymm2
+; AVX512-NEXT: vmovaps 32(%rdi), %ymm3
+; AVX512-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
+; AVX512-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
+; AVX512-NEXT: vmovaps %xmm0, (%rsi)
+; AVX512-NEXT: vmovaps %xmm1, (%rdx)
; AVX512-NEXT: vextractf128 $1, %ymm4, (%rcx)
-; AVX512-NEXT: vextractf128 $1, %ymm1, (%r8)
+; AVX512-NEXT: vextractf128 $1, %ymm2, (%r8)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
@@ -137,18 +137,18 @@ define void @load_i64_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;
; AVX512DQ-LABEL: load_i64_stride4_vf2:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0
-; AVX512DQ-NEXT: vmovaps 32(%rdi), %xmm1
-; AVX512DQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
-; AVX512DQ-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX512DQ-NEXT: vmovaps 32(%rdi), %ymm1
-; AVX512DQ-NEXT: vmovaps (%rdi), %ymm3
-; AVX512DQ-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
-; AVX512DQ-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
-; AVX512DQ-NEXT: vmovaps %xmm2, (%rsi)
-; AVX512DQ-NEXT: vmovaps %xmm0, (%rdx)
+; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm0 = [0,4]
+; AVX512DQ-NEXT: vpermpd (%rdi), %zmm0, %zmm0
+; AVX512DQ-NEXT: vmovaps (%rdi), %xmm1
+; AVX512DQ-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1]
+; AVX512DQ-NEXT: vmovaps (%rdi), %ymm2
+; AVX512DQ-NEXT: vmovaps 32(%rdi), %ymm3
+; AVX512DQ-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
+; AVX512DQ-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
+; AVX512DQ-NEXT: vmovaps %xmm0, (%rsi)
+; AVX512DQ-NEXT: vmovaps %xmm1, (%rdx)
; AVX512DQ-NEXT: vextractf128 $1, %ymm4, (%rcx)
-; AVX512DQ-NEXT: vextractf128 $1, %ymm1, (%r8)
+; AVX512DQ-NEXT: vextractf128 $1, %ymm2, (%r8)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
@@ -171,18 +171,18 @@ define void @load_i64_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;
; AVX512BW-LABEL: load_i64_stride4_vf2:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovaps (%rdi), %xmm0
-; AVX512BW-NEXT: vmovaps 32(%rdi), %xmm1
-; AVX512BW-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
-; AVX512BW-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX512BW-NEXT: vmovaps 32(%rdi), %ymm1
-; AVX512BW-NEXT: vmovaps (%rdi), %ymm3
-; AVX512BW-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
-; AVX512BW-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
-; AVX512BW-NEXT: vmovaps %xmm2, (%rsi)
-; AVX512BW-NEXT: vmovaps %xmm0, (%rdx)
+; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm0 = [0,4]
+; AVX512BW-NEXT: vpermpd (%rdi), %zmm0, %zmm0
+; AVX512BW-NEXT: vmovaps (%rdi), %xmm1
+; AVX512BW-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1]
+; AVX512BW-NEXT: vmovaps (%rdi), %ymm2
+; AVX512BW-NEXT: vmovaps 32(%rdi), %ymm3
+; AVX512BW-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
+; AVX512BW-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
+; AVX512BW-NEXT: vmovaps %xmm0, (%rsi)
+; AVX512BW-NEXT: vmovaps %xmm1, (%rdx)
; AVX512BW-NEXT: vextractf128 $1, %ymm4, (%rcx)
-; AVX512BW-NEXT: vextractf128 $1, %ymm1, (%r8)
+; AVX512BW-NEXT: vextractf128 $1, %ymm2, (%r8)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
@@ -205,18 +205,18 @@ define void @load_i64_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;
; AVX512DQ-BW-LABEL: load_i64_stride4_vf2:
; AVX512DQ-BW: # %bb.0:
-; AVX512DQ-BW-NEXT: vmovaps (%rdi), %xmm0
-; AVX512DQ-BW-NEXT: vmovaps 32(%rdi), %xmm1
-; AVX512DQ-BW-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
-; AVX512DQ-BW-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX512DQ-BW-NEXT: vmovaps 32(%rdi), %ymm1
-; AVX512DQ-BW-NEXT: vmovaps (%rdi), %ymm3
-; AVX512DQ-BW-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm1[0],ymm3[2],ymm1[2]
-; AVX512DQ-BW-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
-; AVX512DQ-BW-NEXT: vmovaps %xmm2, (%rsi)
-; AVX512DQ-BW-NEXT: vmovaps %xmm0, (%rdx)
+; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm0 = [0,4]
+; AVX512DQ-BW-NEXT: vpermpd (%rdi), %zmm0, %zmm0
+; AVX512DQ-BW-NEXT: vmovaps (%rdi), %xmm1
+; AVX512DQ-BW-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1]
+; AVX512DQ-BW-NEXT: vmovaps (%rdi), %ymm2
+; AVX512DQ-BW-NEXT: vmovaps 32(%rdi), %ymm3
+; AVX512DQ-BW-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
+; AVX512DQ-BW-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
+; AVX512DQ-BW-NEXT: vmovaps %xmm0, (%rsi)
+; AVX512DQ-BW-NEXT: vmovaps %xmm1, (%rdx)
; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm4, (%rcx)
-; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm1, (%r8)
+; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm2, (%r8)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll
index 07988a416bac4..3abf5bbaea27b 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll
@@ -114,154 +114,178 @@ define void @load_i64_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;
; AVX512-LABEL: load_i64_stride5_vf2:
; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm0
+; AVX512-NEXT: vmovdqa64 (%rdi), %zmm1
+; AVX512-NEXT: vmovdqa (%rdi), %xmm2
; AVX512-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm2[2,3]
-; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3]
-; AVX512-NEXT: vmovdqa 64(%rdi), %xmm5
-; AVX512-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1],mem[2,3]
+; AVX512-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
+; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm3 = [2,7]
+; AVX512-NEXT: vpermq %zmm1, %zmm3, %zmm3
+; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm5 = [3,8]
+; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm5
+; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm6 = [4,9]
+; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm6
; AVX512-NEXT: vmovdqa %xmm4, (%rsi)
-; AVX512-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX512-NEXT: vmovdqa %xmm2, (%rdx)
; AVX512-NEXT: vmovdqa %xmm3, (%rcx)
-; AVX512-NEXT: vmovdqa %xmm1, (%r8)
-; AVX512-NEXT: vmovdqa %xmm2, (%r9)
+; AVX512-NEXT: vmovdqa %xmm5, (%r8)
+; AVX512-NEXT: vmovdqa %xmm6, (%r9)
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: load_i64_stride5_vf2:
; AVX512-FCP: # %bb.0:
-; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512-FCP-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm2[2,3]
-; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3]
-; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm5
-; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3]
-; AVX512-FCP-NEXT: vmovdqa %xmm4, (%rsi)
-; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX512-FCP-NEXT: vmovdqa %xmm3, (%rcx)
-; AVX512-FCP-NEXT: vmovdqa %xmm1, (%r8)
-; AVX512-FCP-NEXT: vmovdqa %xmm2, (%r9)
+; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
+; AVX512-FCP-NEXT: vmovaps (%rdi), %xmm2
+; AVX512-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [1,6]
+; AVX512-FCP-NEXT: vpermq %zmm0, %zmm3, %zmm3
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm4 = [2,7]
+; AVX512-FCP-NEXT: vpermq %zmm0, %zmm4, %zmm4
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm5 = [3,8]
+; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm5
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm6 = [4,9]
+; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm6
+; AVX512-FCP-NEXT: vmovaps %xmm2, (%rsi)
+; AVX512-FCP-NEXT: vmovdqa %xmm3, (%rdx)
+; AVX512-FCP-NEXT: vmovdqa %xmm4, (%rcx)
+; AVX512-FCP-NEXT: vmovdqa %xmm5, (%r8)
+; AVX512-FCP-NEXT: vmovdqa %xmm6, (%r9)
+; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: load_i64_stride5_vf2:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm0
+; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm1
+; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm2
; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm2[2,3]
-; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3]
-; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm5
-; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1],mem[2,3]
+; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm3 = [2,7]
+; AVX512DQ-NEXT: vpermq %zmm1, %zmm3, %zmm3
+; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm5 = [3,8]
+; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm1, %zmm5
+; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm6 = [4,9]
+; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm1, %zmm6
; AVX512DQ-NEXT: vmovdqa %xmm4, (%rsi)
-; AVX512DQ-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX512DQ-NEXT: vmovdqa %xmm2, (%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm3, (%rcx)
-; AVX512DQ-NEXT: vmovdqa %xmm1, (%r8)
-; AVX512DQ-NEXT: vmovdqa %xmm2, (%r9)
+; AVX512DQ-NEXT: vmovdqa %xmm5, (%r8)
+; AVX512DQ-NEXT: vmovdqa %xmm6, (%r9)
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: load_i64_stride5_vf2:
; AVX512DQ-FCP: # %bb.0:
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512DQ-FCP-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm2[2,3]
-; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3]
-; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm5
-; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3]
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, (%rcx)
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, (%r8)
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, (%r9)
+; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
+; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %xmm2
+; AVX512DQ-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [1,6]
+; AVX512DQ-FCP-NEXT: vpermq %zmm0, %zmm3, %zmm3
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm4 = [2,7]
+; AVX512DQ-FCP-NEXT: vpermq %zmm0, %zmm4, %zmm4
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm5 = [3,8]
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm5
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm6 = [4,9]
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm6
+; AVX512DQ-FCP-NEXT: vmovaps %xmm2, (%rsi)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, (%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, (%rcx)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, (%r8)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, (%r9)
+; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
; AVX512BW-LABEL: load_i64_stride5_vf2:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm0
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm2
; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm2[2,3]
-; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3]
-; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm5
-; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1],mem[2,3]
+; AVX512BW-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
+; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [2,7]
+; AVX512BW-NEXT: vpermq %zmm1, %zmm3, %zmm3
+; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm5 = [3,8]
+; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm5
+; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm6 = [4,9]
+; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm6
; AVX512BW-NEXT: vmovdqa %xmm4, (%rsi)
-; AVX512BW-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX512BW-NEXT: vmovdqa %xmm2, (%rdx)
; AVX512BW-NEXT: vmovdqa %xmm3, (%rcx)
-; AVX512BW-NEXT: vmovdqa %xmm1, (%r8)
-; AVX512BW-NEXT: vmovdqa %xmm2, (%r9)
+; AVX512BW-NEXT: vmovdqa %xmm5, (%r8)
+; AVX512BW-NEXT: vmovdqa %xmm6, (%r9)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BW-FCP-LABEL: load_i64_stride5_vf2:
; AVX512BW-FCP: # %bb.0:
-; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512BW-FCP-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm2[2,3]
-; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3]
-; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm5
-; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3]
-; AVX512BW-FCP-NEXT: vmovdqa %xmm4, (%rsi)
-; AVX512BW-FCP-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX512BW-FCP-NEXT: vmovdqa %xmm3, (%rcx)
-; AVX512BW-FCP-NEXT: vmovdqa %xmm1, (%r8)
-; AVX512BW-FCP-NEXT: vmovdqa %xmm2, (%r9)
+; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
+; AVX512BW-FCP-NEXT: vmovaps (%rdi), %xmm2
+; AVX512BW-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [1,6]
+; AVX512BW-FCP-NEXT: vpermq %zmm0, %zmm3, %zmm3
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm4 = [2,7]
+; AVX512BW-FCP-NEXT: vpermq %zmm0, %zmm4, %zmm4
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm5 = [3,8]
+; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm5
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm6 = [4,9]
+; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm6
+; AVX512BW-FCP-NEXT: vmovaps %xmm2, (%rsi)
+; AVX512BW-FCP-NEXT: vmovdqa %xmm3, (%rdx)
+; AVX512BW-FCP-NEXT: vmovdqa %xmm4, (%rcx)
+; AVX512BW-FCP-NEXT: vmovdqa %xmm5, (%r8)
+; AVX512BW-FCP-NEXT: vmovdqa %xmm6, (%r9)
+; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: load_i64_stride5_vf2:
; AVX512DQ-BW: # %bb.0:
-; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %xmm2
+; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm0
+; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1
+; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm2
; AVX512DQ-BW-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm2[2,3]
-; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3]
-; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %xmm5
-; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1],mem[2,3]
+; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
+; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [2,7]
+; AVX512DQ-BW-NEXT: vpermq %zmm1, %zmm3, %zmm3
+; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm5 = [3,8]
+; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm5
+; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm6 = [4,9]
+; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm6
; AVX512DQ-BW-NEXT: vmovdqa %xmm4, (%rsi)
-; AVX512DQ-BW-NEXT: vmovdqa %xmm0, (%rdx)
+; AVX512DQ-BW-NEXT: vmovdqa %xmm2, (%rdx)
; AVX512DQ-BW-NEXT: vmovdqa %xmm3, (%rcx)
-; AVX512DQ-BW-NEXT: vmovdqa %xmm1, (%r8)
-; AVX512DQ-BW-NEXT: vmovdqa %xmm2, (%r9)
+; AVX512DQ-BW-NEXT: vmovdqa %xmm5, (%r8)
+; AVX512DQ-BW-NEXT: vmovdqa %xmm6, (%r9)
+; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: load_i64_stride5_vf2:
; AVX512DQ-BW-FCP: # %bb.0:
-; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm2[2,3]
-; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm5
-; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm4, (%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, (%rcx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm1, (%r8)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, (%r9)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
+; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %xmm2
+; AVX512DQ-BW-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [1,6]
+; AVX512DQ-BW-FCP-NEXT: vpermq %zmm0, %zmm3, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm4 = [2,7]
+; AVX512DQ-BW-FCP-NEXT: vpermq %zmm0, %zmm4, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm5 = [3,8]
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm5
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm6 = [4,9]
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm6
+; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm2, (%rsi)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, (%rdx)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm4, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm5, (%r8)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm6, (%r9)
+; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%wide.vec = load <10 x i64>, ptr %in.vec, align 64
%strided.vec0 = shufflevector <10 x i64> %wide.vec, <10 x i64> poison, <2 x i32> <i32 0, i32 5>
@@ -501,24 +525,24 @@ define void @load_i64_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,6,11,0]
; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,1,2,4]
-; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm5
-; AVX512-FCP-NEXT: vpermi2q %ymm5, %ymm3, %ymm4
-; AVX512-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm3
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,7,12,0]
-; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm6
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7]
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [11,0,5,0]
+; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %xmm4
+; AVX512-FCP-NEXT: vpbroadcastq %xmm4, %ymm5
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [2,7,12,0]
+; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm5
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [11,0,5,0]
+; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm5
+; AVX512-FCP-NEXT: vpbroadcastq 144(%rdi), %ymm6
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7]
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [12,1,6,0]
; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm6
-; AVX512-FCP-NEXT: vpbroadcastq 144(%rdi), %ymm7
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7]
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [12,1,6,0]
-; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm7
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm5[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],mem[6,7]
; AVX512-FCP-NEXT: vmovdqa %ymm2, (%rsi)
-; AVX512-FCP-NEXT: vmovdqa %ymm4, (%rdx)
-; AVX512-FCP-NEXT: vmovdqa %ymm3, (%rcx)
-; AVX512-FCP-NEXT: vmovdqa %ymm6, (%r8)
+; AVX512-FCP-NEXT: vmovdqa %ymm3, (%rdx)
+; AVX512-FCP-NEXT: vmovdqa %ymm4, (%rcx)
+; AVX512-FCP-NEXT: vmovdqa %ymm5, (%r8)
; AVX512-FCP-NEXT: vmovdqa %ymm0, (%r9)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
@@ -561,24 +585,24 @@ define void @load_i64_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,6,11,0]
; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,1,2,4]
-; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm5
-; AVX512DQ-FCP-NEXT: vpermi2q %ymm5, %ymm3, %ymm4
-; AVX512DQ-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm3
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,7,12,0]
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm6
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7]
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [11,0,5,0]
+; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %xmm4
+; AVX512DQ-FCP-NEXT: vpbroadcastq %xmm4, %ymm5
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [2,7,12,0]
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm5
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [11,0,5,0]
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm5
+; AVX512DQ-FCP-NEXT: vpbroadcastq 144(%rdi), %ymm6
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7]
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [12,1,6,0]
; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm6
-; AVX512DQ-FCP-NEXT: vpbroadcastq 144(%rdi), %ymm7
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7]
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [12,1,6,0]
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm7
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm5[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],mem[6,7]
; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, (%rcx)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, (%r8)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, (%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, (%rcx)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, (%r8)
; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%r9)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
@@ -621,24 +645,24 @@ define void @load_i64_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,6,11,0]
; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3
-; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,1,2,4]
-; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm5
-; AVX512BW-FCP-NEXT: vpermi2q %ymm5, %ymm3, %ymm4
-; AVX512BW-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm3
-; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,7,12,0]
-; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm6
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7]
-; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [11,0,5,0]
+; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %xmm4
+; AVX512BW-FCP-NEXT: vpbroadcastq %xmm4, %ymm5
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
+; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [2,7,12,0]
+; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm5
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [11,0,5,0]
+; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm5
+; AVX512BW-FCP-NEXT: vpbroadcastq 144(%rdi), %ymm6
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7]
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [12,1,6,0]
; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm6
-; AVX512BW-FCP-NEXT: vpbroadcastq 144(%rdi), %ymm7
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7]
-; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [12,1,6,0]
-; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm7
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm5[6,7]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],mem[6,7]
; AVX512BW-FCP-NEXT: vmovdqa %ymm2, (%rsi)
-; AVX512BW-FCP-NEXT: vmovdqa %ymm4, (%rdx)
-; AVX512BW-FCP-NEXT: vmovdqa %ymm3, (%rcx)
-; AVX512BW-FCP-NEXT: vmovdqa %ymm6, (%r8)
+; AVX512BW-FCP-NEXT: vmovdqa %ymm3, (%rdx)
+; AVX512BW-FCP-NEXT: vmovdqa %ymm4, (%rcx)
+; AVX512BW-FCP-NEXT: vmovdqa %ymm5, (%r8)
; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%r9)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
@@ -681,24 +705,24 @@ define void @load_i64_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,6,11,0]
; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,1,2,4]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm5
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm5, %ymm3, %ymm4
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm3
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [2,7,12,0]
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm6
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [11,0,5,0]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %xmm4
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastq %xmm4, %ymm5
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [2,7,12,0]
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm5
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [11,0,5,0]
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm5
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastq 144(%rdi), %ymm6
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [12,1,6,0]
; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm6
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastq 144(%rdi), %ymm7
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [12,1,6,0]
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm7
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm5[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],mem[6,7]
; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, (%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm4, (%rdx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm3, (%rcx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm6, (%r8)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm3, (%rdx)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm4, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm5, (%r8)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%r9)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll
index 7d3209397c3df..55be6ea0c5722 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll
@@ -587,7 +587,7 @@ define void @load_i64_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm8
; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
; AVX512-NEXT: vpbroadcastq 136(%rdi), %ymm8
-; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm8[1],ymm7[1],ymm8[3],ymm7[3]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7]
; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm8 = [5,11]
; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm8
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm7[4,5,6,7]
@@ -614,27 +614,26 @@ define void @load_i64_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [10,0,6,0]
; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm4
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,1,2,4]
-; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm6
-; AVX512-FCP-NEXT: vpermi2q %ymm6, %ymm4, %ymm5
-; AVX512-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm4
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,1,7,0]
-; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm7
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7]
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,4,0,6]
-; AVX512-FCP-NEXT: vpermq 128(%rdi), %zmm7, %zmm7
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [4,10]
-; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
-; AVX512-FCP-NEXT: vpbroadcastq 136(%rdi), %ymm8
-; AVX512-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3]
+; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %xmm5
+; AVX512-FCP-NEXT: vpbroadcastq %xmm5, %ymm6
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7]
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [11,1,7,0]
+; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm6
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,1,8,14]
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm7 = [4,10]
+; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm7
+; AVX512-FCP-NEXT: vpermt2q 128(%rdi), %zmm6, %zmm7
+; AVX512-FCP-NEXT: vpbroadcastq 136(%rdi), %ymm6
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7]
; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [5,11]
; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm6[4,5,6,7]
; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rsi)
; AVX512-FCP-NEXT: vmovdqa %ymm1, (%rdx)
-; AVX512-FCP-NEXT: vmovdqa %ymm5, (%rcx)
-; AVX512-FCP-NEXT: vmovdqa %ymm4, (%r8)
+; AVX512-FCP-NEXT: vmovdqa %ymm4, (%rcx)
+; AVX512-FCP-NEXT: vmovdqa %ymm5, (%r8)
; AVX512-FCP-NEXT: vmovdqa %ymm7, (%r9)
; AVX512-FCP-NEXT: vmovdqa %ymm2, (%rax)
; AVX512-FCP-NEXT: vzeroupper
@@ -669,7 +668,7 @@ define void @load_i64_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm8
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
; AVX512DQ-NEXT: vpbroadcastq 136(%rdi), %ymm8
-; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm8[1],ymm7[1],ymm8[3],ymm7[3]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7]
; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm8 = [5,11]
; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm2, %zmm8
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm7[4,5,6,7]
@@ -696,27 +695,26 @@ define void @load_i64_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [10,0,6,0]
; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm4
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,1,2,4]
-; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm6
-; AVX512DQ-FCP-NEXT: vpermi2q %ymm6, %ymm4, %ymm5
-; AVX512DQ-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm4
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,1,7,0]
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm7
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7]
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,4,0,6]
-; AVX512DQ-FCP-NEXT: vpermq 128(%rdi), %zmm7, %zmm7
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [4,10]
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpbroadcastq 136(%rdi), %ymm8
-; AVX512DQ-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3]
+; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %xmm5
+; AVX512DQ-FCP-NEXT: vpbroadcastq %xmm5, %ymm6
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7]
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [11,1,7,0]
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm6
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,1,8,14]
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm7 = [4,10]
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm7
+; AVX512DQ-FCP-NEXT: vpermt2q 128(%rdi), %zmm6, %zmm7
+; AVX512DQ-FCP-NEXT: vpbroadcastq 136(%rdi), %ymm6
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7]
; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [5,11]
; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm6[4,5,6,7]
; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rsi)
; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, (%rcx)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, (%r8)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, (%rcx)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, (%r8)
; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, (%r9)
; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, (%rax)
; AVX512DQ-FCP-NEXT: vzeroupper
@@ -751,7 +749,7 @@ define void @load_i64_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm8
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
; AVX512BW-NEXT: vpbroadcastq 136(%rdi), %ymm8
-; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm8[1],ymm7[1],ymm8[3],ymm7[3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7]
; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm8 = [5,11]
; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm8
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm7[4,5,6,7]
@@ -778,27 +776,26 @@ define void @load_i64_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [10,0,6,0]
; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm4
-; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,1,2,4]
-; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm6
-; AVX512BW-FCP-NEXT: vpermi2q %ymm6, %ymm4, %ymm5
-; AVX512BW-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm4
-; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,1,7,0]
-; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm7
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7]
-; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,4,0,6]
-; AVX512BW-FCP-NEXT: vpermq 128(%rdi), %zmm7, %zmm7
-; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [4,10]
-; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
-; AVX512BW-FCP-NEXT: vpbroadcastq 136(%rdi), %ymm8
-; AVX512BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3]
+; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %xmm5
+; AVX512BW-FCP-NEXT: vpbroadcastq %xmm5, %ymm6
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7]
+; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [11,1,7,0]
+; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm6
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,1,8,14]
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm7 = [4,10]
+; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm7
+; AVX512BW-FCP-NEXT: vpermt2q 128(%rdi), %zmm6, %zmm7
+; AVX512BW-FCP-NEXT: vpbroadcastq 136(%rdi), %ymm6
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7]
; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [5,11]
; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm6[4,5,6,7]
; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rsi)
; AVX512BW-FCP-NEXT: vmovdqa %ymm1, (%rdx)
-; AVX512BW-FCP-NEXT: vmovdqa %ymm5, (%rcx)
-; AVX512BW-FCP-NEXT: vmovdqa %ymm4, (%r8)
+; AVX512BW-FCP-NEXT: vmovdqa %ymm4, (%rcx)
+; AVX512BW-FCP-NEXT: vmovdqa %ymm5, (%r8)
; AVX512BW-FCP-NEXT: vmovdqa %ymm7, (%r9)
; AVX512BW-FCP-NEXT: vmovdqa %ymm2, (%rax)
; AVX512BW-FCP-NEXT: vzeroupper
@@ -833,7 +830,7 @@ define void @load_i64_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm8
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
; AVX512DQ-BW-NEXT: vpbroadcastq 136(%rdi), %ymm8
-; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm7 = ymm8[1],ymm7[1],ymm8[3],ymm7[3]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7]
; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm8 = [5,11]
; AVX512DQ-BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm8
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm7[4,5,6,7]
@@ -860,27 +857,26 @@ define void @load_i64_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm4 = [10,0,6,0]
; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm4
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,1,2,4]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm6
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm6, %ymm4, %ymm5
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm4
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [11,1,7,0]
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm7
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,4,0,6]
-; AVX512DQ-BW-FCP-NEXT: vpermq 128(%rdi), %zmm7, %zmm7
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [4,10]
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastq 136(%rdi), %ymm8
-; AVX512DQ-BW-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %xmm5
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastq %xmm5, %ymm6
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7]
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [11,1,7,0]
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm6
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,1,8,14]
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm7 = [4,10]
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm7
+; AVX512DQ-BW-FCP-NEXT: vpermt2q 128(%rdi), %zmm6, %zmm7
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastq 136(%rdi), %ymm6
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [5,11]
; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm8
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm6[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rsi)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm1, (%rdx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm5, (%rcx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm4, (%r8)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm4, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm5, (%r8)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm7, (%r9)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, (%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll
index cc3e5f3d1d82e..0d4e1a831cd71 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll
@@ -48,26 +48,25 @@ define void @load_i64_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX: # %bb.0:
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX-NEXT: vmovapd 16(%rdi), %xmm0
+; AVX-NEXT: vmovdqa 16(%rdi), %xmm0
; AVX-NEXT: vmovaps 48(%rdi), %xmm1
; AVX-NEXT: vmovdqa 64(%rdi), %xmm2
-; AVX-NEXT: vmovapd 80(%rdi), %xmm3
+; AVX-NEXT: vmovdqa 80(%rdi), %xmm3
; AVX-NEXT: vblendps {{.*#+}} xmm4 = mem[0,1],xmm1[2,3]
; AVX-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
-; AVX-NEXT: vblendpd {{.*#+}} xmm5 = xmm0[0],mem[1]
-; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm3[0],ymm0[2],ymm3[3]
-; AVX-NEXT: vblendpd {{.*#+}} xmm3 = mem[0],xmm3[1]
+; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1,2,3],mem[4,5,6,7]
+; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm3 = mem[0,1,2,3],xmm3[4,5,6,7]
; AVX-NEXT: vmovdqa 96(%rdi), %xmm6
; AVX-NEXT: vpalignr {{.*#+}} xmm6 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
; AVX-NEXT: vmovaps %xmm4, (%rsi)
; AVX-NEXT: vmovdqa %xmm2, (%rdx)
-; AVX-NEXT: vmovapd %xmm5, (%rcx)
-; AVX-NEXT: vmovapd %xmm0, (%r8)
-; AVX-NEXT: vmovapd %xmm3, (%r9)
+; AVX-NEXT: vmovdqa %xmm5, (%rcx)
+; AVX-NEXT: vmovdqa %xmm0, (%r8)
+; AVX-NEXT: vmovdqa %xmm3, (%r9)
; AVX-NEXT: vmovdqa %xmm6, (%r10)
; AVX-NEXT: vmovaps %xmm1, (%rax)
-; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX2-LABEL: load_i64_stride7_vf2:
@@ -155,216 +154,236 @@ define void @load_i64_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512: # %bb.0:
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm3[2,3]
-; AVX512-NEXT: vmovdqa 64(%rdi), %xmm5
-; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0,1],xmm5[2,3]
+; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512-NEXT: vmovdqa (%rdi), %xmm1
+; AVX512-NEXT: vmovdqa 16(%rdi), %xmm2
+; AVX512-NEXT: vmovdqa 32(%rdi), %xmm3
+; AVX512-NEXT: vmovdqa 64(%rdi), %xmm4
+; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0,1],mem[2,3]
+; AVX512-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1],xmm4[2,3]
; AVX512-NEXT: vmovdqa 80(%rdi), %xmm6
-; AVX512-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm2[0,1],xmm6[2,3]
+; AVX512-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm3[0,1],xmm6[2,3]
; AVX512-NEXT: vmovdqa 96(%rdi), %xmm7
-; AVX512-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm7[2,3]
-; AVX512-NEXT: vmovdqa %xmm4, (%rsi)
-; AVX512-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX512-NEXT: vmovdqa %xmm5, (%rcx)
-; AVX512-NEXT: vmovdqa %xmm1, (%r8)
+; AVX512-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
+; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm7 = [6,13]
+; AVX512-NEXT: vpermi2q 64(%rdi), %zmm0, %zmm7
+; AVX512-NEXT: vmovdqa %xmm5, (%rsi)
+; AVX512-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX512-NEXT: vmovdqa %xmm4, (%rcx)
+; AVX512-NEXT: vmovdqa %xmm2, (%r8)
; AVX512-NEXT: vmovdqa %xmm6, (%r9)
-; AVX512-NEXT: vmovdqa %xmm2, (%r10)
-; AVX512-NEXT: vmovdqa %xmm3, (%rax)
+; AVX512-NEXT: vmovdqa %xmm3, (%r10)
+; AVX512-NEXT: vmovdqa %xmm7, (%rax)
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: load_i64_stride7_vf2:
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512-FCP-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm3[2,3]
-; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm5
-; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0,1],xmm5[2,3]
+; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0
+; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm2 = [0,7]
+; AVX512-FCP-NEXT: vpermq %zmm1, %zmm2, %zmm2
+; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm3
+; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm5
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm6
-; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm2[0,1],xmm6[2,3]
-; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm7
-; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm7[2,3]
-; AVX512-FCP-NEXT: vmovdqa %xmm4, (%rsi)
-; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX512-FCP-NEXT: vmovdqa %xmm5, (%rcx)
-; AVX512-FCP-NEXT: vmovdqa %xmm1, (%r8)
+; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm7
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,12]
+; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm7, %zmm8
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm7 = [6,13]
+; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm7
+; AVX512-FCP-NEXT: vmovdqa %xmm2, (%rsi)
+; AVX512-FCP-NEXT: vmovdqa %xmm4, (%rdx)
+; AVX512-FCP-NEXT: vmovdqa %xmm3, (%rcx)
+; AVX512-FCP-NEXT: vmovdqa %xmm5, (%r8)
; AVX512-FCP-NEXT: vmovdqa %xmm6, (%r9)
-; AVX512-FCP-NEXT: vmovdqa %xmm2, (%r10)
-; AVX512-FCP-NEXT: vmovdqa %xmm3, (%rax)
+; AVX512-FCP-NEXT: vmovdqa %xmm8, (%r10)
+; AVX512-FCP-NEXT: vmovdqa %xmm7, (%rax)
+; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: load_i64_stride7_vf2:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm3[2,3]
-; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm5
-; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0,1],xmm5[2,3]
+; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1
+; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm2
+; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm3
+; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm4
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0,1],mem[2,3]
+; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1],xmm4[2,3]
; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm6
-; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm6 = xmm2[0,1],xmm6[2,3]
+; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm6 = xmm3[0,1],xmm6[2,3]
; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm7
-; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm7[2,3]
-; AVX512DQ-NEXT: vmovdqa %xmm4, (%rsi)
-; AVX512DQ-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX512DQ-NEXT: vmovdqa %xmm5, (%rcx)
-; AVX512DQ-NEXT: vmovdqa %xmm1, (%r8)
+; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm7 = [6,13]
+; AVX512DQ-NEXT: vpermi2q 64(%rdi), %zmm0, %zmm7
+; AVX512DQ-NEXT: vmovdqa %xmm5, (%rsi)
+; AVX512DQ-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX512DQ-NEXT: vmovdqa %xmm4, (%rcx)
+; AVX512DQ-NEXT: vmovdqa %xmm2, (%r8)
; AVX512DQ-NEXT: vmovdqa %xmm6, (%r9)
-; AVX512DQ-NEXT: vmovdqa %xmm2, (%r10)
-; AVX512DQ-NEXT: vmovdqa %xmm3, (%rax)
+; AVX512DQ-NEXT: vmovdqa %xmm3, (%r10)
+; AVX512DQ-NEXT: vmovdqa %xmm7, (%rax)
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: load_i64_stride7_vf2:
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512DQ-FCP-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm3[2,3]
-; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm5
-; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0,1],xmm5[2,3]
+; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm2 = [0,7]
+; AVX512DQ-FCP-NEXT: vpermq %zmm1, %zmm2, %zmm2
+; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm3
+; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm5
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm6
-; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm2[0,1],xmm6[2,3]
-; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm7
-; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm7[2,3]
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, (%rcx)
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, (%r8)
+; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm7
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,12]
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm7, %zmm8
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm7 = [6,13]
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm7
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, (%rsi)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, (%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, (%rcx)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, (%r8)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, (%r9)
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, (%r10)
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, (%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm8, (%r10)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm7, (%rax)
+; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
; AVX512BW-LABEL: load_i64_stride7_vf2:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm3[2,3]
-; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm5
-; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0,1],xmm5[2,3]
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm1
+; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm2
+; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm3
+; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm4
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0,1],mem[2,3]
+; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1],xmm4[2,3]
; AVX512BW-NEXT: vmovdqa 80(%rdi), %xmm6
-; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm6 = xmm2[0,1],xmm6[2,3]
+; AVX512BW-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm6 = xmm3[0,1],xmm6[2,3]
; AVX512BW-NEXT: vmovdqa 96(%rdi), %xmm7
-; AVX512BW-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm7[2,3]
-; AVX512BW-NEXT: vmovdqa %xmm4, (%rsi)
-; AVX512BW-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX512BW-NEXT: vmovdqa %xmm5, (%rcx)
-; AVX512BW-NEXT: vmovdqa %xmm1, (%r8)
+; AVX512BW-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
+; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm7 = [6,13]
+; AVX512BW-NEXT: vpermi2q 64(%rdi), %zmm0, %zmm7
+; AVX512BW-NEXT: vmovdqa %xmm5, (%rsi)
+; AVX512BW-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX512BW-NEXT: vmovdqa %xmm4, (%rcx)
+; AVX512BW-NEXT: vmovdqa %xmm2, (%r8)
; AVX512BW-NEXT: vmovdqa %xmm6, (%r9)
-; AVX512BW-NEXT: vmovdqa %xmm2, (%r10)
-; AVX512BW-NEXT: vmovdqa %xmm3, (%rax)
+; AVX512BW-NEXT: vmovdqa %xmm3, (%r10)
+; AVX512BW-NEXT: vmovdqa %xmm7, (%rax)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BW-FCP-LABEL: load_i64_stride7_vf2:
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512BW-FCP-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm3[2,3]
-; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm5
-; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0,1],xmm5[2,3]
+; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0
+; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm2 = [0,7]
+; AVX512BW-FCP-NEXT: vpermq %zmm1, %zmm2, %zmm2
+; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm3
+; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
+; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm5
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
; AVX512BW-FCP-NEXT: vmovdqa 80(%rdi), %xmm6
-; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm2[0,1],xmm6[2,3]
-; AVX512BW-FCP-NEXT: vmovdqa 96(%rdi), %xmm7
-; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm7[2,3]
-; AVX512BW-FCP-NEXT: vmovdqa %xmm4, (%rsi)
-; AVX512BW-FCP-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX512BW-FCP-NEXT: vmovdqa %xmm5, (%rcx)
-; AVX512BW-FCP-NEXT: vmovdqa %xmm1, (%r8)
+; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
+; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm7
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,12]
+; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm7, %zmm8
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm7 = [6,13]
+; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm7
+; AVX512BW-FCP-NEXT: vmovdqa %xmm2, (%rsi)
+; AVX512BW-FCP-NEXT: vmovdqa %xmm4, (%rdx)
+; AVX512BW-FCP-NEXT: vmovdqa %xmm3, (%rcx)
+; AVX512BW-FCP-NEXT: vmovdqa %xmm5, (%r8)
; AVX512BW-FCP-NEXT: vmovdqa %xmm6, (%r9)
-; AVX512BW-FCP-NEXT: vmovdqa %xmm2, (%r10)
-; AVX512BW-FCP-NEXT: vmovdqa %xmm3, (%rax)
+; AVX512BW-FCP-NEXT: vmovdqa %xmm8, (%r10)
+; AVX512BW-FCP-NEXT: vmovdqa %xmm7, (%rax)
+; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: load_i64_stride7_vf2:
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512DQ-BW-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm3[2,3]
-; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %xmm5
-; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0,1],xmm5[2,3]
+; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm1
+; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm2
+; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %xmm3
+; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %xmm4
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0,1],mem[2,3]
+; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm2[0,1],xmm4[2,3]
; AVX512DQ-BW-NEXT: vmovdqa 80(%rdi), %xmm6
-; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm6 = xmm2[0,1],xmm6[2,3]
+; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm6 = xmm3[0,1],xmm6[2,3]
; AVX512DQ-BW-NEXT: vmovdqa 96(%rdi), %xmm7
-; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm7[2,3]
-; AVX512DQ-BW-NEXT: vmovdqa %xmm4, (%rsi)
-; AVX512DQ-BW-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX512DQ-BW-NEXT: vmovdqa %xmm5, (%rcx)
-; AVX512DQ-BW-NEXT: vmovdqa %xmm1, (%r8)
+; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
+; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm7 = [6,13]
+; AVX512DQ-BW-NEXT: vpermi2q 64(%rdi), %zmm0, %zmm7
+; AVX512DQ-BW-NEXT: vmovdqa %xmm5, (%rsi)
+; AVX512DQ-BW-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX512DQ-BW-NEXT: vmovdqa %xmm4, (%rcx)
+; AVX512DQ-BW-NEXT: vmovdqa %xmm2, (%r8)
; AVX512DQ-BW-NEXT: vmovdqa %xmm6, (%r9)
-; AVX512DQ-BW-NEXT: vmovdqa %xmm2, (%r10)
-; AVX512DQ-BW-NEXT: vmovdqa %xmm3, (%rax)
+; AVX512DQ-BW-NEXT: vmovdqa %xmm3, (%r10)
+; AVX512DQ-BW-NEXT: vmovdqa %xmm7, (%rax)
+; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: load_i64_stride7_vf2:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm3[2,3]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm5
-; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0,1],xmm5[2,3]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm2 = [0,7]
+; AVX512DQ-BW-FCP-NEXT: vpermq %zmm1, %zmm2, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm3
+; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm5
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa 80(%rdi), %xmm6
-; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm2[0,1],xmm6[2,3]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 96(%rdi), %xmm7
-; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm7[2,3]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm4, (%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm5, (%rcx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm1, (%r8)
+; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm7
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [1,12]
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm7, %zmm8
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm7 = [6,13]
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm7
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, (%rsi)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm4, (%rdx)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm5, (%r8)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm6, (%r9)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, (%r10)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, (%rax)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm8, (%r10)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm7, (%rax)
+; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%wide.vec = load <14 x i64>, ptr %in.vec, align 64
%strided.vec0 = shufflevector <14 x i64> %wide.vec, <14 x i64> poison, <2 x i32> <i32 0, i32 7>
@@ -448,7 +467,7 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX-NEXT: vmovapd 16(%rdi), %xmm7
; AVX-NEXT: vmovapd 48(%rdi), %xmm3
; AVX-NEXT: vmovdqa 64(%rdi), %xmm8
-; AVX-NEXT: vmovapd 80(%rdi), %xmm9
+; AVX-NEXT: vmovdqa 80(%rdi), %xmm9
; AVX-NEXT: vblendpd {{.*#+}} xmm10 = mem[0],xmm3[1]
; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm10[0,1],ymm6[2,3]
; AVX-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm4[1],ymm5[3],ymm4[2]
@@ -459,14 +478,14 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm10[0,1,2],ymm4[3]
; AVX-NEXT: vblendpd {{.*#+}} xmm10 = xmm7[0],mem[1]
; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm10[0,1],ymm4[2,3]
-; AVX-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[1],ymm9[0],ymm7[2],ymm9[3]
; AVX-NEXT: vmovdqa 192(%rdi), %xmm10
; AVX-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7]
; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8
-; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3]
+; AVX-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm8
; AVX-NEXT: vblendpd {{.*#+}} ymm8 = ymm2[0,1,2],ymm8[3]
-; AVX-NEXT: vblendpd {{.*#+}} xmm9 = mem[0],xmm9[1]
+; AVX-NEXT: vpblendw {{.*#+}} xmm9 = mem[0,1,2,3],xmm9[4,5,6,7]
; AVX-NEXT: vblendpd {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3]
; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm0[1],ymm2[3],ymm0[2]
; AVX-NEXT: vmovdqa 96(%rdi), %xmm9
@@ -478,7 +497,7 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX-NEXT: vmovapd %ymm6, (%rsi)
; AVX-NEXT: vmovapd %ymm5, (%rdx)
; AVX-NEXT: vmovapd %ymm4, (%rcx)
-; AVX-NEXT: vmovapd %ymm7, (%r8)
+; AVX-NEXT: vmovaps %ymm7, (%r8)
; AVX-NEXT: vmovapd %ymm8, (%r9)
; AVX-NEXT: vmovapd %ymm2, (%r10)
; AVX-NEXT: vmovapd %ymm0, (%rax)
@@ -698,51 +717,51 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm3
-; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm4
-; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm5
+; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2
+; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3
+; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm4
; AVX512-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm0
; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,7,14,0]
-; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm1
+; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm1
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [9,0,7,0]
-; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm1
-; AVX512-FCP-NEXT: vpbroadcastq 176(%rdi), %ymm2
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm2
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,0,0,7]
-; AVX512-FCP-NEXT: vpermq %zmm3, %zmm6, %zmm6
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm6
-; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23]
-; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %xmm7
-; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm6[2,3],ymm8[2,3]
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm8
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7]
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11]
-; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm9
-; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm8 = ymm8[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
+; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm1
+; AVX512-FCP-NEXT: vpbroadcastq 176(%rdi), %ymm5
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7]
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,1,8,15]
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm6 = [2,5]
+; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm7
+; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm8
+; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm9
+; AVX512-FCP-NEXT: vpermi2q %ymm8, %ymm7, %ymm6
+; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm6
+; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %xmm5
+; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm10 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm7 = ymm7[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
+; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[2,3],ymm10[2,3]
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5],ymm5[6,7]
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [4,11]
+; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm8
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm8
+; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm8 = ymm9[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [5,12]
-; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9
+; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm9
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,11]
-; AVX512-FCP-NEXT: vpermi2q 192(%rdi), %zmm3, %zmm9
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [6,13]
-; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm3
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5,6,7]
+; AVX512-FCP-NEXT: vpermi2q 192(%rdi), %zmm2, %zmm9
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm2 = [6,13]
+; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm2
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rsi)
; AVX512-FCP-NEXT: vmovdqa %ymm1, (%rdx)
-; AVX512-FCP-NEXT: vmovdqa %ymm2, (%rcx)
-; AVX512-FCP-NEXT: vmovdqa %ymm6, (%r8)
-; AVX512-FCP-NEXT: vmovdqa %ymm7, (%r9)
+; AVX512-FCP-NEXT: vmovdqa %ymm6, (%rcx)
+; AVX512-FCP-NEXT: vmovdqa %ymm7, (%r8)
+; AVX512-FCP-NEXT: vmovdqa %ymm5, (%r9)
; AVX512-FCP-NEXT: vmovdqa %ymm8, (%r10)
-; AVX512-FCP-NEXT: vmovdqa %ymm3, (%rax)
+; AVX512-FCP-NEXT: vmovdqa %ymm2, (%rax)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
@@ -803,51 +822,51 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm3
-; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm4
-; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm5
+; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2
+; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3
+; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm4
; AVX512DQ-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm0
; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,7,14,0]
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm1
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm1
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [9,0,7,0]
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm1
-; AVX512DQ-FCP-NEXT: vpbroadcastq 176(%rdi), %ymm2
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm2
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,0,0,7]
-; AVX512DQ-FCP-NEXT: vpermq %zmm3, %zmm6, %zmm6
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm6
-; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23]
-; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %xmm7
-; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm6[2,3],ymm8[2,3]
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm8
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7]
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11]
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm9
-; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm8 = ymm8[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm1
+; AVX512DQ-FCP-NEXT: vpbroadcastq 176(%rdi), %ymm5
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7]
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,1,8,15]
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm6 = [2,5]
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm7
+; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm8
+; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm9
+; AVX512DQ-FCP-NEXT: vpermi2q %ymm8, %ymm7, %ymm6
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm6
+; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %xmm5
+; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm10 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm7 = ymm7[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
+; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[2,3],ymm10[2,3]
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5],ymm5[6,7]
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [4,11]
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm8
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm8
+; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm8 = ymm9[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [5,12]
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm9
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,11]
-; AVX512DQ-FCP-NEXT: vpermi2q 192(%rdi), %zmm3, %zmm9
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [6,13]
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm3
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpermi2q 192(%rdi), %zmm2, %zmm9
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm2 = [6,13]
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm2
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rsi)
; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, (%rcx)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, (%r8)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, (%r9)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, (%rcx)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, (%r8)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, (%r9)
; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, (%r10)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, (%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, (%rax)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
@@ -908,51 +927,51 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm3
-; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm4
-; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm5
+; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2
+; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3
+; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm4
; AVX512BW-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm0
; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,7,14,0]
-; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm1
+; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm1
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [9,0,7,0]
-; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm1
-; AVX512BW-FCP-NEXT: vpbroadcastq 176(%rdi), %ymm2
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm2
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,0,0,7]
-; AVX512BW-FCP-NEXT: vpermq %zmm3, %zmm6, %zmm6
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm6
-; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23]
-; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %xmm7
-; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
-; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm6[2,3],ymm8[2,3]
-; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm8
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7]
-; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11]
-; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
-; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm9
-; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm8 = ymm8[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm1
+; AVX512BW-FCP-NEXT: vpbroadcastq 176(%rdi), %ymm5
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7]
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,1,8,15]
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm6 = [2,5]
+; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm7
+; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm8
+; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm9
+; AVX512BW-FCP-NEXT: vpermi2q %ymm8, %ymm7, %ymm6
+; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm6
+; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %xmm5
+; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm10 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
+; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm7 = ymm7[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[2,3],ymm10[2,3]
+; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5],ymm5[6,7]
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [4,11]
+; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm8
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7]
+; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm8
+; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm8 = ymm9[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [5,12]
-; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9
+; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm9
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,11]
-; AVX512BW-FCP-NEXT: vpermi2q 192(%rdi), %zmm3, %zmm9
-; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [6,13]
-; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm3
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5,6,7]
+; AVX512BW-FCP-NEXT: vpermi2q 192(%rdi), %zmm2, %zmm9
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm2 = [6,13]
+; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm2
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rsi)
; AVX512BW-FCP-NEXT: vmovdqa %ymm1, (%rdx)
-; AVX512BW-FCP-NEXT: vmovdqa %ymm2, (%rcx)
-; AVX512BW-FCP-NEXT: vmovdqa %ymm6, (%r8)
-; AVX512BW-FCP-NEXT: vmovdqa %ymm7, (%r9)
+; AVX512BW-FCP-NEXT: vmovdqa %ymm6, (%rcx)
+; AVX512BW-FCP-NEXT: vmovdqa %ymm7, (%r8)
+; AVX512BW-FCP-NEXT: vmovdqa %ymm5, (%r9)
; AVX512BW-FCP-NEXT: vmovdqa %ymm8, (%r10)
-; AVX512BW-FCP-NEXT: vmovdqa %ymm3, (%rax)
+; AVX512BW-FCP-NEXT: vmovdqa %ymm2, (%rax)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
@@ -1013,51 +1032,51 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm3
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm4
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm5
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm4
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm0
; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,7,14,0]
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm1
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [9,0,7,0]
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastq 176(%rdi), %ymm2
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm2
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,0,0,7]
-; AVX512DQ-BW-FCP-NEXT: vpermq %zmm3, %zmm6, %zmm6
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm6
-; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %xmm7
-; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm6[2,3],ymm8[2,3]
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm8
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11]
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm9
-; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm8 = ymm8[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm3, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastq 176(%rdi), %ymm5
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm5[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,1,8,15]
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm6 = [2,5]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm7
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm8
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm9
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm8, %ymm7, %ymm6
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm6
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %xmm5
+; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm10 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm7 = ymm7[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[2,3],ymm10[2,3]
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5],ymm5[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [4,11]
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm8
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm8
+; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm8 = ymm9[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [5,12]
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm9
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,11]
-; AVX512DQ-BW-FCP-NEXT: vpermi2q 192(%rdi), %zmm3, %zmm9
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [6,13]
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vpermi2q 192(%rdi), %zmm2, %zmm9
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm2 = [6,13]
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rsi)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm1, (%rdx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, (%rcx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm6, (%r8)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm7, (%r9)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm6, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm7, (%r8)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm5, (%r9)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm8, (%r10)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm3, (%rax)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, (%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%wide.vec = load <28 x i64>, ptr %in.vec, align 64
@@ -1214,7 +1233,7 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX-NEXT: vmovapd 48(%rdi), %xmm1
; AVX-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovdqa 64(%rdi), %xmm12
-; AVX-NEXT: vmovapd 80(%rdi), %xmm10
+; AVX-NEXT: vmovdqa 80(%rdi), %xmm10
; AVX-NEXT: vblendpd {{.*#+}} xmm3 = mem[0],xmm1[1]
; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3]
; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -1243,28 +1262,28 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX-NEXT: vmovapd 240(%rdi), %xmm14
; AVX-NEXT: vblendpd {{.*#+}} xmm15 = xmm14[0],mem[1]
; AVX-NEXT: vblendpd {{.*#+}} ymm9 = ymm15[0,1],ymm9[2,3]
-; AVX-NEXT: vshufpd {{.*#+}} ymm11 = ymm11[1],ymm10[0],ymm11[2],ymm10[3]
; AVX-NEXT: vmovdqa 192(%rdi), %xmm15
; AVX-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7]
; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13
-; AVX-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0,1],ymm13[2,3]
-; AVX-NEXT: vmovapd 304(%rdi), %xmm13
-; AVX-NEXT: vshufpd {{.*#+}} ymm14 = ymm14[1],ymm13[0],ymm14[2],ymm13[3]
-; AVX-NEXT: vmovdqa 416(%rdi), %xmm0
-; AVX-NEXT: vpalignr {{.*#+}} xmm12 = xmm12[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vpalignr {{.*#+}} xmm11 = xmm11[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm13[4,5,6,7]
+; AVX-NEXT: vmovdqa 416(%rdi), %xmm13
+; AVX-NEXT: vpalignr {{.*#+}} xmm12 = xmm12[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7]
; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12
-; AVX-NEXT: vblendpd {{.*#+}} ymm12 = ymm14[0,1],ymm12[2,3]
+; AVX-NEXT: vmovdqa 304(%rdi), %xmm1
+; AVX-NEXT: vpalignr {{.*#+}} xmm14 = xmm14[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7]
; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm14
; AVX-NEXT: vmovapd 128(%rdi), %ymm15
; AVX-NEXT: vblendpd {{.*#+}} ymm14 = ymm15[0,1,2],ymm14[3]
-; AVX-NEXT: vblendpd {{.*#+}} xmm10 = mem[0],xmm10[1]
+; AVX-NEXT: vpblendw {{.*#+}} xmm10 = mem[0,1,2,3],xmm10[4,5,6,7]
; AVX-NEXT: vblendpd {{.*#+}} ymm10 = ymm10[0,1],ymm14[2,3]
-; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13
; AVX-NEXT: vmovapd 352(%rdi), %ymm14
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3]
-; AVX-NEXT: vmovapd 256(%rdi), %xmm1
-; AVX-NEXT: vblendpd {{.*#+}} xmm13 = xmm1[0],xmm13[1]
-; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm13[0,1],ymm0[2,3]
+; AVX-NEXT: vblendpd {{.*#+}} ymm13 = ymm14[0,1,2],ymm13[3]
+; AVX-NEXT: vmovdqa 256(%rdi), %xmm2
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
+; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm13[2,3]
; AVX-NEXT: vmovapd 192(%rdi), %ymm13
; AVX-NEXT: vshufpd {{.*#+}} ymm15 = ymm15[0],ymm13[1],ymm15[3],ymm13[2]
; AVX-NEXT: vmovdqa 96(%rdi), %xmm5
@@ -1273,8 +1292,8 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX-NEXT: vmovapd 416(%rdi), %ymm15
; AVX-NEXT: vshufpd {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[3],ymm15[2]
; AVX-NEXT: vmovdqa 320(%rdi), %xmm0
-; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
-; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm14[2,3]
+; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm14[2,3]
; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
; AVX-NEXT: # ymm13 = mem[0,1,2],ymm13[3]
; AVX-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
@@ -1291,12 +1310,12 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX-NEXT: vmovapd %ymm6, (%rdx)
; AVX-NEXT: vmovapd %ymm9, 32(%rcx)
; AVX-NEXT: vmovapd %ymm8, (%rcx)
-; AVX-NEXT: vmovapd %ymm12, 32(%r8)
-; AVX-NEXT: vmovapd %ymm11, (%r8)
-; AVX-NEXT: vmovapd %ymm2, 32(%r9)
+; AVX-NEXT: vmovaps %ymm12, 32(%r8)
+; AVX-NEXT: vmovaps %ymm11, (%r8)
+; AVX-NEXT: vmovapd %ymm1, 32(%r9)
; AVX-NEXT: vmovapd %ymm10, (%r9)
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: vmovapd %ymm1, 32(%rax)
+; AVX-NEXT: vmovapd %ymm2, 32(%rax)
; AVX-NEXT: vmovapd %ymm5, (%rax)
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT: vmovapd %ymm0, 32(%rax)
@@ -1730,106 +1749,107 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm0
; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm1
; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm2
-; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4
-; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5
-; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm9
-; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm10
-; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,7,14,0,0,7,14,0]
-; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,0,7,0,9,0,7,0]
-; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm3
-; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm6, %zmm3
-; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [14,0,0,7,14,0,0,7]
-; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm7
-; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm11
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3]
-; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm11, %zmm7, %zmm7
+; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm6
+; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm7
+; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,7,14,0,0,7,14,0]
+; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [9,0,7,0,9,0,7,0]
+; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm3
+; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm4, %zmm3
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [2,5]
+; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm9
; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm11
-; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm11, %xmm11
-; AVX512-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm12
-; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm13
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7]
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm14 = [4,11]
-; AVX512-FCP-NEXT: vpermi2q %zmm9, %zmm10, %zmm14
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm12[4,5,6,7]
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm14 = [5,12]
-; AVX512-FCP-NEXT: vpermi2q %zmm9, %zmm10, %zmm14
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13]
-; AVX512-FCP-NEXT: vpermi2q %zmm9, %zmm10, %zmm12
-; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm8, %zmm10
-; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm8
-; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,5,6,13,4,5,6,13]
-; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm8, %zmm9
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm15, %zmm9, %zmm8
-; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm9
-; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm9 = ymm13[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
+; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm10
+; AVX512-FCP-NEXT: vpermi2q %ymm11, %ymm9, %ymm8
+; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm9 = ymm9[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm9
+; AVX512-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm11
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm10[0,1,2,3,4,5],ymm11[6,7]
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [4,11]
+; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm7, %zmm12
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0,1,2,3],ymm11[4,5,6,7]
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [5,12]
+; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm7, %zmm12
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm11 = [6,13]
+; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm7, %zmm11
+; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm5, %zmm7
+; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm5
+; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [4,5,6,13,4,5,6,13]
+; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm5, %zmm6
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm6, %zmm5
+; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm6
+; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm6 = ymm10[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23]
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rdi
-; AVX512-FCP-NEXT: movb $24, %r10b
-; AVX512-FCP-NEXT: kmovw %r10d, %k2
-; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm5[4,5,4,5],zmm4[4,5,4,5]
-; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [10,3,10,3,10,3,10,3]
-; AVX512-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm13
+; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm10
+; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm13
+; AVX512-FCP-NEXT: movb $24, %dil
+; AVX512-FCP-NEXT: kmovw %edi, %k2
+; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm13[4,5,4,5],zmm10[4,5,4,5]
+; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [10,3,10,3,10,3,10,3]
+; AVX512-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm14
; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,5,6,9,0,5,6,9]
; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm13, %zmm15
-; AVX512-FCP-NEXT: movb $-32, %r10b
-; AVX512-FCP-NEXT: kmovw %r10d, %k1
-; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm10 {%k1}
-; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [13,6,13,6,13,6,13,6]
-; AVX512-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [7,0,9,0,7,0,9,0]
+; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm14, %zmm15
+; AVX512-FCP-NEXT: movb $-32, %dil
+; AVX512-FCP-NEXT: kmovw %edi, %k1
+; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm7 {%k1}
+; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [13,6,13,6,13,6,13,6]
+; AVX512-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [14,0,0,7,14,0,0,7]
; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm15
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7]
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,0,4,11]
-; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm14
-; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm13, %zmm5
-; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 {%k2}
-; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [11,4,11,4,11,4,11,4]
-; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm4
-; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,10,0,5,6,10]
-; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm5
-; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1}
-; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [12,5,12,5,12,5,12,5]
-; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm4
-; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,11,0,5,6,11]
-; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm5
-; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1}
-; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm13
-; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12]
-; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm13, %zmm4
-; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm11, %zmm15, %zmm5
-; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1}
-; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm6
-; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14]
-; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm4
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm4, %zmm4
+; AVX512-FCP-NEXT: vpermi2q %zmm10, %zmm13, %zmm15
+; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [7,0,9,0,7,0,9,0]
+; AVX512-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vpermi2q %zmm13, %zmm10, %zmm16
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3],ymm6[4,5,6,7]
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,0,4,11]
+; AVX512-FCP-NEXT: vpermi2q %zmm10, %zmm13, %zmm12
+; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm14, %zmm13
+; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 {%k2}
+; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,4,11,4,11,4,11,4]
+; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm10
+; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [0,5,6,10,0,5,6,10]
+; AVX512-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm10, %zmm13
+; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 {%k1}
+; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [12,5,12,5,12,5,12,5]
+; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm10
+; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [0,5,6,11,0,5,6,11]
+; AVX512-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm10, %zmm13
+; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm8, %zmm15, %zmm8
+; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm8 {%k1}
+; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm14
+; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,5,6,12,0,5,6,12]
+; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm14, %zmm10
+; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm9, %zmm16, %zmm9
+; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm9 {%k1}
+; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm4
+; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,6,14,4,5,6,14]
+; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm10
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm4
; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [2,9,2,9,2,9,2,9]
; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm6
; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15]
; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm1
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm14[4,5,6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm12[4,5,6,7]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512-FCP-NEXT: vmovdqa64 %zmm10, (%rsi)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm7, (%rsi)
; AVX512-FCP-NEXT: vmovdqa64 %zmm3, (%rdx)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm7, (%rcx)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%r8)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm8, (%r9)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rdi)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm8, (%rcx)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm9, (%r8)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%r9)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%r10)
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
@@ -1948,106 +1968,107 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm0
; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm1
; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm2
-; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4
-; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5
-; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm9
-; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm10
-; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,7,14,0,0,7,14,0]
-; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,0,7,0,9,0,7,0]
-; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm3
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm6, %zmm3
-; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [14,0,0,7,14,0,0,7]
-; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm7
-; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm11
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm11, %zmm7, %zmm7
+; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm6
+; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm7
+; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,7,14,0,0,7,14,0]
+; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [9,0,7,0,9,0,7,0]
+; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm3
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm4, %zmm3
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm8 = [2,5]
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm9
; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm11
-; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm11
-; AVX512DQ-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm12
-; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm13
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7]
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm14 = [4,11]
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm9, %zmm10, %zmm14
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm12[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm14 = [5,12]
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm9, %zmm10, %zmm14
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13]
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm9, %zmm10, %zmm12
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm8, %zmm10
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm8
-; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,5,6,13,4,5,6,13]
-; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm8, %zmm9
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm15, %zmm9, %zmm8
-; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm9
-; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm9 = ymm13[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
+; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm10
+; AVX512DQ-FCP-NEXT: vpermi2q %ymm11, %ymm9, %ymm8
+; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm9 = ymm9[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm9, %xmm9
+; AVX512DQ-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm11
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm10[0,1,2,3,4,5],ymm11[6,7]
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [4,11]
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm7, %zmm12
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [5,12]
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm7, %zmm12
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm11 = [6,13]
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm7, %zmm11
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm5, %zmm7
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm5
+; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [4,5,6,13,4,5,6,13]
+; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm5, %zmm6
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm6, %zmm5
+; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm6
+; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm6 = ymm10[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23]
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rdi
-; AVX512DQ-FCP-NEXT: movb $24, %r10b
-; AVX512DQ-FCP-NEXT: kmovw %r10d, %k2
-; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm5[4,5,4,5],zmm4[4,5,4,5]
-; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [10,3,10,3,10,3,10,3]
-; AVX512DQ-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm13
+; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm10
+; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm13
+; AVX512DQ-FCP-NEXT: movb $24, %dil
+; AVX512DQ-FCP-NEXT: kmovw %edi, %k2
+; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm13[4,5,4,5],zmm10[4,5,4,5]
+; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [10,3,10,3,10,3,10,3]
+; AVX512DQ-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm14
; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,5,6,9,0,5,6,9]
; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm13, %zmm15
-; AVX512DQ-FCP-NEXT: movb $-32, %r10b
-; AVX512DQ-FCP-NEXT: kmovw %r10d, %k1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm10 {%k1}
-; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [13,6,13,6,13,6,13,6]
-; AVX512DQ-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [7,0,9,0,7,0,9,0]
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm14, %zmm15
+; AVX512DQ-FCP-NEXT: movb $-32, %dil
+; AVX512DQ-FCP-NEXT: kmovw %edi, %k1
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm7 {%k1}
+; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [13,6,13,6,13,6,13,6]
+; AVX512DQ-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [14,0,0,7,14,0,0,7]
; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm15
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,0,4,11]
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm14
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm13, %zmm5
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 {%k2}
-; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [11,4,11,4,11,4,11,4]
-; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm4
-; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,10,0,5,6,10]
-; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm5
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1}
-; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [12,5,12,5,12,5,12,5]
-; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm4
-; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,11,0,5,6,11]
-; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm5
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1}
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm13
-; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12]
-; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm13, %zmm4
-; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm11, %zmm15, %zmm5
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1}
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm6
-; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14]
-; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm4
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm4, %zmm4
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm10, %zmm13, %zmm15
+; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [7,0,9,0,7,0,9,0]
+; AVX512DQ-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm13, %zmm10, %zmm16
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm12 = [0,0,4,11]
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm10, %zmm13, %zmm12
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm14, %zmm13
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 {%k2}
+; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,4,11,4,11,4,11,4]
+; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm10
+; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [0,5,6,10,0,5,6,10]
+; AVX512DQ-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm10, %zmm13
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm3 {%k1}
+; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [12,5,12,5,12,5,12,5]
+; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm10
+; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [0,5,6,11,0,5,6,11]
+; AVX512DQ-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm10, %zmm13
+; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm8, %zmm15, %zmm8
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm8 {%k1}
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm14
+; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,5,6,12,0,5,6,12]
+; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm14, %zmm10
+; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm9, %zmm16, %zmm9
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm9 {%k1}
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm4
+; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,6,14,4,5,6,14]
+; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm10
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm4
; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [2,9,2,9,2,9,2,9]
; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm6
; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15]
; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm1
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm14[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm12[4,5,6,7]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, (%rsi)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, (%rsi)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, (%rcx)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%r8)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, (%r9)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%rdi)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, (%rcx)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, (%r8)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%r9)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%r10)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
@@ -2166,105 +2187,106 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm0
; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm1
; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm2
-; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4
-; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5
-; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm9
-; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm10
-; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,7,14,0,0,7,14,0]
-; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,0,7,0,9,0,7,0]
+; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm6
+; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm7
+; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,7,14,0,0,7,14,0]
+; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [9,0,7,0,9,0,7,0]
+; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3
+; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm4, %zmm3
+; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm8
+; AVX512BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm9
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm8[0,1,2,3,4,5],ymm9[6,7]
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm10 = [4,11]
+; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm7, %zmm10
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm10[0,1,2,3],ymm9[4,5,6,7]
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm10 = [5,12]
+; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm7, %zmm10
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [6,13]
+; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm7, %zmm9
+; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm5, %zmm7
+; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm5
+; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [4,5,6,13,4,5,6,13]
; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3
-; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm6, %zmm3
-; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [14,0,0,7,14,0,0,7]
-; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm7
-; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm11
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3]
-; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm11, %zmm7, %zmm7
-; AVX512BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm12
-; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm11
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm11[0,1,2,3,4,5],ymm12[6,7]
-; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm13 = [4,11]
-; AVX512BW-FCP-NEXT: vpermi2q %zmm9, %zmm10, %zmm13
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm13[0,1,2,3],ymm12[4,5,6,7]
-; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm13 = [5,12]
-; AVX512BW-FCP-NEXT: vpermi2q %zmm9, %zmm10, %zmm13
-; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13]
-; AVX512BW-FCP-NEXT: vpermi2q %zmm9, %zmm10, %zmm12
-; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm8, %zmm10
-; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm8
-; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,5,6,13,4,5,6,13]
-; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm8, %zmm9
-; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm9, %zmm8
+; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm5, %zmm6
+; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm6, %zmm5
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm11
+; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm12
; AVX512BW-FCP-NEXT: movb $24, %r11b
; AVX512BW-FCP-NEXT: kmovd %r11d, %k2
-; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm5[4,5,4,5],zmm4[4,5,4,5]
-; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [10,3,10,3,10,3,10,3]
-; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm9
-; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,5,6,9,0,5,6,9]
-; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm9, %zmm14
+; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm12[4,5,4,5],zmm11[4,5,4,5]
+; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [10,3,10,3,10,3,10,3]
+; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm6
+; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [0,5,6,9,0,5,6,9]
+; AVX512BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm13
; AVX512BW-FCP-NEXT: movb $-32, %r11b
; AVX512BW-FCP-NEXT: kmovd %r11d, %k1
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm10 {%k1}
-; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [13,6,13,6,13,6,13,6]
-; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [7,0,9,0,7,0,9,0]
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm7 {%k1}
+; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [13,6,13,6,13,6,13,6]
+; AVX512BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [14,0,0,7,14,0,0,7]
; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm14
-; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm15
-; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7]
-; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm13 = [0,0,4,11]
-; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm13
-; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm5
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 {%k2}
-; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [11,4,11,4,11,4,11,4]
-; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm4
-; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,10,0,5,6,10]
-; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm5
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1}
-; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [12,5,12,5,12,5,12,5]
-; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm4
-; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,11,0,5,6,11]
-; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm5
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1}
-; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm9
-; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12]
-; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm9, %zmm4
-; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm5
-; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5
-; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm14, %zmm5
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1}
-; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm6
-; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14]
-; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm4
-; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm4
-; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [2,9,2,9,2,9,2,9]
-; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm6
+; AVX512BW-FCP-NEXT: vpermi2q %zmm11, %zmm12, %zmm14
+; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [7,0,9,0,7,0,9,0]
+; AVX512BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vpermi2q %zmm12, %zmm11, %zmm15
+; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm6
+; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm6 = ymm8[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm6[4,5,6,7]
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,0,4,11]
+; AVX512BW-FCP-NEXT: vpermi2q %zmm11, %zmm12, %zmm6
+; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm13, %zmm12
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 {%k2}
+; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,4,11,4,11,4,11,4]
+; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm10
+; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,5,6,10,0,5,6,10]
+; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm10, %zmm11
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm3 {%k1}
+; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [12,5,12,5,12,5,12,5]
+; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm10
+; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,5,6,11,0,5,6,11]
+; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm10, %zmm11
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm10 = [2,5]
+; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm12
+; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm16
+; AVX512BW-FCP-NEXT: vpermi2q %ymm16, %ymm12, %ymm10
+; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm10, %zmm14, %zmm10
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 {%k1}
+; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm13
+; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,5,6,12,0,5,6,12]
+; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm13, %zmm11
+; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm12 = ymm12[8,9,10,11,12,13,14,15],ymm16[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm16[16,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm12, %xmm12
+; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm12, %zmm15, %zmm12
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1}
+; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm4
+; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,5,6,14,4,5,6,14]
+; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm11
+; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm4
+; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [2,9,2,9,2,9,2,9]
+; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm8
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15]
; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm1
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm13[4,5,6,7]
+; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm8, %zmm1
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm6[4,5,6,7]
; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, (%rsi)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, (%rsi)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rdx)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, (%rcx)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%r8)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%r9)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, (%rcx)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, (%r8)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%r9)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%r10)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
; AVX512BW-FCP-NEXT: vzeroupper
@@ -2384,105 +2406,106 @@ define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm2
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm9
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm10
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,7,14,0,0,7,14,0]
-; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,0,7,0,9,0,7,0]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm6
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm7
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,7,14,0,0,7,14,0]
+; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [9,0,7,0,9,0,7,0]
+; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm4, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm8
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm9
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm8[0,1,2,3,4,5],ymm9[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm10 = [4,11]
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm7, %zmm10
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm10[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm10 = [5,12]
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm7, %zmm10
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [6,13]
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm7, %zmm9
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm5, %zmm7
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm5
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [4,5,6,13,4,5,6,13]
; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm6, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [14,0,0,7,14,0,0,7]
-; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm7
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm11
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3]
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm11, %zmm7, %zmm7
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm12
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm11
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm11[0,1,2,3,4,5],ymm12[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm13 = [4,11]
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm9, %zmm10, %zmm13
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm13[0,1,2,3],ymm12[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm13 = [5,12]
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm9, %zmm10, %zmm13
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13]
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm9, %zmm10, %zmm12
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm8, %zmm10
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm8
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,5,6,13,4,5,6,13]
-; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm8, %zmm9
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm9, %zmm8
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm5, %zmm6
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm6, %zmm5
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm11
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm12
; AVX512DQ-BW-FCP-NEXT: movb $24, %r11b
; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k2
-; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm5[4,5,4,5],zmm4[4,5,4,5]
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [10,3,10,3,10,3,10,3]
-; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm9
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,5,6,9,0,5,6,9]
-; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm9, %zmm14
+; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm12[4,5,4,5],zmm11[4,5,4,5]
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [10,3,10,3,10,3,10,3]
+; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm6
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [0,5,6,9,0,5,6,9]
+; AVX512DQ-BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm13
; AVX512DQ-BW-FCP-NEXT: movb $-32, %r11b
; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k1
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm10 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [13,6,13,6,13,6,13,6]
-; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [7,0,9,0,7,0,9,0]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm7 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [13,6,13,6,13,6,13,6]
+; AVX512DQ-BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [14,0,0,7,14,0,0,7]
; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm14
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm15
-; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm13 = [0,0,4,11]
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm13
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm5
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [11,4,11,4,11,4,11,4]
-; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm4
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,10,0,5,6,10]
-; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm5
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [12,5,12,5,12,5,12,5]
-; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm4
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,11,0,5,6,11]
-; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm5
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm9
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12]
-; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm9, %zmm4
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm5
-; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm14, %zmm5
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm6
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14]
-; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm4
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm4
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [2,9,2,9,2,9,2,9]
-; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm6
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm11, %zmm12, %zmm14
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [7,0,9,0,7,0,9,0]
+; AVX512DQ-BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm12, %zmm11, %zmm15
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm6
+; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm6 = ymm8[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,0,4,11]
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm11, %zmm12, %zmm6
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm13, %zmm12
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm3 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [11,4,11,4,11,4,11,4]
+; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm10
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,5,6,10,0,5,6,10]
+; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm10, %zmm11
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm3 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [12,5,12,5,12,5,12,5]
+; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm10
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,5,6,11,0,5,6,11]
+; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm10, %zmm11
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm10 = [2,5]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm12
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm16
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm16, %ymm12, %ymm10
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm10, %zmm14, %zmm10
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm13
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,5,6,12,0,5,6,12]
+; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm13, %zmm11
+; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm12 = ymm12[8,9,10,11,12,13,14,15],ymm16[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm16[16,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm12, %xmm12
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm12, %zmm15, %zmm12
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,5,6,14,4,5,6,14]
+; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm11
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm11, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [2,9,2,9,2,9,2,9]
+; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm8
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15]
; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm8, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm6[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, (%rsi)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, (%rsi)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rdx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, (%rcx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, (%r8)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%r9)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, (%r8)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, (%r9)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%r10)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
@@ -2828,179 +2851,179 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovapd 160(%rdi), %ymm2
; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[3],ymm2[2]
-; AVX-NEXT: vmovdqa 64(%rdi), %xmm6
-; AVX-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
-; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3]
+; AVX-NEXT: vmovdqa 64(%rdi), %xmm3
+; AVX-NEXT: vpalignr {{.*#+}} xmm6 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3]
; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 608(%rdi), %ymm8
-; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[3],ymm8[2]
-; AVX-NEXT: vmovdqa 512(%rdi), %xmm9
-; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm7[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vmovapd 608(%rdi), %ymm6
+; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[3],ymm6[2]
+; AVX-NEXT: vmovdqa 512(%rdi), %xmm8
+; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm7[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7]
; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovdqa 352(%rdi), %xmm1
-; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3]
-; AVX-NEXT: vmovapd 240(%rdi), %xmm3
-; AVX-NEXT: vblendpd {{.*#+}} xmm5 = xmm3[0],mem[1]
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3]
-; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovdqa 800(%rdi), %xmm7
-; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3]
-; AVX-NEXT: vmovapd 688(%rdi), %xmm5
-; AVX-NEXT: vblendpd {{.*#+}} xmm4 = xmm5[0],mem[1]
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3]
-; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovdqa 352(%rdi), %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
+; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm1[0,1,2],ymm5[3]
+; AVX-NEXT: vmovapd 240(%rdi), %xmm1
+; AVX-NEXT: vblendpd {{.*#+}} xmm7 = xmm1[0],mem[1]
+; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3]
+; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovdqa 800(%rdi), %xmm5
+; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm7
+; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3]
+; AVX-NEXT: vmovapd 688(%rdi), %xmm7
+; AVX-NEXT: vblendpd {{.*#+}} xmm9 = xmm7[0],mem[1]
+; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm9[0,1],ymm4[2,3]
+; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovdqa 128(%rdi), %xmm4
-; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3]
-; AVX-NEXT: vmovdqa 16(%rdi), %xmm10
-; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0,1,2,3],xmm6[4,5,6,7]
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3]
-; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovdqa 576(%rdi), %xmm2
-; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3]
+; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm9
+; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3]
+; AVX-NEXT: vmovdqa 16(%rdi), %xmm9
+; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0,1,2,3],xmm3[4,5,6,7]
+; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3]
+; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovdqa 576(%rdi), %xmm10
+; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm2
+; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm6[3]
; AVX-NEXT: vmovdqa 464(%rdi), %xmm6
-; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm6[0,1,2,3],xmm9[4,5,6,7]
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3]
-; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 752(%rdi), %xmm12
-; AVX-NEXT: vshufpd {{.*#+}} ymm8 = ymm5[1],ymm12[0],ymm5[2],ymm12[3]
-; AVX-NEXT: vmovdqa 864(%rdi), %xmm5
-; AVX-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
-; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm8[0,1],ymm7[2,3]
-; AVX-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
-; AVX-NEXT: vmovapd 304(%rdi), %xmm7
-; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[1],ymm7[0],ymm3[2],ymm7[3]
-; AVX-NEXT: vmovdqa 416(%rdi), %xmm8
-; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7]
-; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3]
-; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 80(%rdi), %xmm11
-; AVX-NEXT: vshufpd {{.*#+}} ymm9 = ymm10[1],ymm11[0],ymm10[2],ymm11[3]
-; AVX-NEXT: vmovdqa 192(%rdi), %xmm10
-; AVX-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7]
-; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm9[0,1],ymm4[2,3]
-; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 528(%rdi), %xmm4
-; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[1],ymm4[0],ymm6[2],ymm4[3]
-; AVX-NEXT: vmovdqa 640(%rdi), %xmm0
-; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
-; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
-; AVX-NEXT: vblendpd {{.*#+}} ymm13 = ymm6[0,1],ymm2[2,3]
-; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm2
-; AVX-NEXT: vmovapd 352(%rdi), %ymm6
-; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3]
-; AVX-NEXT: vmovapd 256(%rdi), %xmm8
-; AVX-NEXT: vblendpd {{.*#+}} xmm7 = xmm8[0],xmm7[1]
-; AVX-NEXT: vblendpd {{.*#+}} ymm14 = ymm7[0,1],ymm2[2,3]
-; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm2
-; AVX-NEXT: vmovapd 800(%rdi), %ymm3
-; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3]
-; AVX-NEXT: vmovapd 704(%rdi), %xmm5
-; AVX-NEXT: vblendpd {{.*#+}} xmm7 = xmm5[0],xmm12[1]
-; AVX-NEXT: vblendpd {{.*#+}} ymm12 = ymm7[0,1],ymm2[2,3]
+; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3],xmm8[4,5,6,7]
+; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3]
+; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovdqa 864(%rdi), %xmm3
+; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm5[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm5
+; AVX-NEXT: vmovdqa 752(%rdi), %xmm12
+; AVX-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm5[4,5,6,7]
+; AVX-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill
+; AVX-NEXT: vmovdqa 416(%rdi), %xmm5
+; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX-NEXT: vmovapd 576(%rdi), %ymm7
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3]
-; AVX-NEXT: vmovapd 480(%rdi), %xmm1
-; AVX-NEXT: vblendpd {{.*#+}} xmm4 = xmm1[0],xmm4[1]
-; AVX-NEXT: vblendpd {{.*#+}} ymm9 = ymm4[0,1],ymm0[2,3]
-; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm0
-; AVX-NEXT: vmovapd 128(%rdi), %ymm10
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3]
-; AVX-NEXT: vblendpd {{.*#+}} xmm4 = mem[0],xmm11[1]
-; AVX-NEXT: vblendpd {{.*#+}} ymm11 = ymm4[0,1],ymm0[2,3]
-; AVX-NEXT: vmovapd 416(%rdi), %ymm2
-; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm6[0],ymm2[1],ymm6[3],ymm2[2]
+; AVX-NEXT: vmovdqa 304(%rdi), %xmm7
+; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovdqa 192(%rdi), %xmm11
+; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm4[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm4
+; AVX-NEXT: vmovdqa 80(%rdi), %xmm13
+; AVX-NEXT: vpalignr {{.*#+}} xmm8 = xmm9[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm15 = ymm8[0,1,2,3],ymm4[4,5,6,7]
+; AVX-NEXT: vmovdqa 640(%rdi), %xmm4
+; AVX-NEXT: vpalignr {{.*#+}} xmm8 = xmm10[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8
+; AVX-NEXT: vmovdqa 528(%rdi), %xmm0
+; AVX-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm6[0,1,2,3],ymm8[4,5,6,7]
+; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
+; AVX-NEXT: vmovapd 352(%rdi), %ymm6
+; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3]
+; AVX-NEXT: vmovdqa 256(%rdi), %xmm8
+; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3],xmm7[4,5,6,7]
+; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm7[0,1],ymm5[2,3]
+; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
+; AVX-NEXT: vmovapd 800(%rdi), %ymm2
+; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3]
+; AVX-NEXT: vmovdqa 704(%rdi), %xmm5
+; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm5[0,1,2,3],xmm12[4,5,6,7]
+; AVX-NEXT: vblendpd {{.*#+}} ymm10 = ymm7[0,1],ymm3[2,3]
+; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm3
+; AVX-NEXT: vmovapd 576(%rdi), %ymm12
+; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm12[0,1,2],ymm3[3]
+; AVX-NEXT: vmovdqa 480(%rdi), %xmm1
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX-NEXT: vblendpd {{.*#+}} ymm9 = ymm0[0,1],ymm3[2,3]
+; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm0
+; AVX-NEXT: vmovapd 128(%rdi), %ymm11
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3]
+; AVX-NEXT: vpblendw {{.*#+}} xmm3 = mem[0,1,2,3],xmm13[4,5,6,7]
+; AVX-NEXT: vblendpd {{.*#+}} ymm13 = ymm3[0,1],ymm0[2,3]
+; AVX-NEXT: vmovapd 416(%rdi), %ymm4
+; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm6[0],ymm4[1],ymm6[3],ymm4[2]
; AVX-NEXT: vmovdqa 320(%rdi), %xmm6
; AVX-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
-; AVX-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],ymm4[2,3]
-; AVX-NEXT: vmovapd 864(%rdi), %ymm4
-; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[3],ymm4[2]
+; AVX-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],ymm3[2,3]
+; AVX-NEXT: vmovapd 864(%rdi), %ymm3
+; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[3],ymm3[2]
; AVX-NEXT: vmovdqa 768(%rdi), %xmm0
; AVX-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
-; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm3[2,3]
-; AVX-NEXT: vmovdqa 544(%rdi), %xmm3
-; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
-; AVX-NEXT: vmovapd 640(%rdi), %ymm3
-; AVX-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0],ymm3[1],ymm7[3],ymm3[2]
-; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm1[0,1],ymm7[2,3]
-; AVX-NEXT: vmovapd 192(%rdi), %ymm1
-; AVX-NEXT: vshufpd {{.*#+}} ymm10 = ymm10[0],ymm1[1],ymm10[3],ymm1[2]
-; AVX-NEXT: vmovdqa 96(%rdi), %xmm15
-; AVX-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7]
-; AVX-NEXT: vblendpd {{.*#+}} ymm10 = ymm15[0,1],ymm10[2,3]
-; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX-NEXT: # ymm2 = mem[0,1,2],ymm2[3]
-; AVX-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
-; AVX-NEXT: # xmm6 = mem[0,1,2,3],xmm6[4,5,6,7]
-; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3]
+; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm5[0,1],ymm2[2,3]
+; AVX-NEXT: vmovdqa 544(%rdi), %xmm2
+; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vmovapd 640(%rdi), %ymm5
+; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm12[0],ymm5[1],ymm12[3],ymm5[2]
+; AVX-NEXT: vblendpd {{.*#+}} ymm12 = ymm1[0,1],ymm2[2,3]
+; AVX-NEXT: vmovapd 192(%rdi), %ymm2
+; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm11[0],ymm2[1],ymm11[3],ymm2[2]
+; AVX-NEXT: vmovdqa 96(%rdi), %xmm11
+; AVX-NEXT: vpalignr {{.*#+}} xmm11 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,3]
; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
; AVX-NEXT: # ymm4 = mem[0,1,2],ymm4[3]
-; AVX-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = mem[0,1,2,3],xmm0[4,5,6,7]
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3]
-; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX-NEXT: # ymm1 = mem[0,1,2],ymm1[3]
-; AVX-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX-NEXT: vblendpd {{.*#+}} xmm4 = xmm4[0],mem[1]
-; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3]
+; AVX-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
+; AVX-NEXT: # xmm6 = mem[0,1,2,3],xmm6[4,5,6,7]
+; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3]
; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
; AVX-NEXT: # ymm3 = mem[0,1,2],ymm3[3]
-; AVX-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX-NEXT: vblendpd {{.*#+}} xmm4 = xmm4[0],mem[1]
-; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm4, 64(%rsi)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm4, (%rsi)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm4, 96(%rsi)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm4, 32(%rsi)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm4, 64(%rdx)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm4, (%rdx)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm4, 96(%rdx)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm4, 32(%rdx)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm4, 64(%rcx)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm4, (%rcx)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm4, 96(%rcx)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm4, 32(%rcx)
-; AVX-NEXT: vmovapd %ymm13, 64(%r8)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm4, (%r8)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm4, 32(%r8)
-; AVX-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm4, 96(%r8)
-; AVX-NEXT: vmovapd %ymm11, (%r9)
+; AVX-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = mem[0,1,2,3],xmm0[4,5,6,7]
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3]
+; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX-NEXT: # ymm2 = mem[0,1,2],ymm2[3]
+; AVX-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX-NEXT: vblendpd {{.*#+}} xmm3 = xmm3[0],mem[1]
+; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3]
+; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm3 # 32-byte Folded Reload
+; AVX-NEXT: # ymm3 = mem[0,1,2],ymm5[3]
+; AVX-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX-NEXT: vblendpd {{.*#+}} xmm5 = xmm5[0],mem[1]
+; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm5, 64(%rsi)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm5, (%rsi)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm5, 96(%rsi)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm5, 32(%rsi)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm5, 64(%rdx)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm5, (%rdx)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm5, 96(%rdx)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm5, 32(%rdx)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm5, 64(%rcx)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm5, (%rcx)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm5, 96(%rcx)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm5, 32(%rcx)
+; AVX-NEXT: vmovaps %ymm14, 64(%r8)
+; AVX-NEXT: vmovaps %ymm15, (%r8)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm5, 32(%r8)
+; AVX-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm5, 96(%r8)
+; AVX-NEXT: vmovapd %ymm13, (%r9)
; AVX-NEXT: vmovapd %ymm9, 64(%r9)
-; AVX-NEXT: vmovapd %ymm12, 96(%r9)
-; AVX-NEXT: vmovapd %ymm14, 32(%r9)
+; AVX-NEXT: vmovapd %ymm10, 96(%r9)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm5, 32(%r9)
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: vmovapd %ymm10, (%rax)
-; AVX-NEXT: vmovapd %ymm7, 64(%rax)
-; AVX-NEXT: vmovapd %ymm5, 96(%rax)
+; AVX-NEXT: vmovapd %ymm1, (%rax)
+; AVX-NEXT: vmovapd %ymm12, 64(%rax)
+; AVX-NEXT: vmovapd %ymm7, 96(%rax)
; AVX-NEXT: vmovapd %ymm8, 32(%rax)
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT: vmovapd %ymm3, 64(%rax)
-; AVX-NEXT: vmovapd %ymm1, (%rax)
+; AVX-NEXT: vmovapd %ymm2, (%rax)
; AVX-NEXT: vmovapd %ymm0, 96(%rax)
-; AVX-NEXT: vmovapd %ymm2, 32(%rax)
+; AVX-NEXT: vmovapd %ymm4, 32(%rax)
; AVX-NEXT: addq $552, %rsp # imm = 0x228
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
@@ -3892,153 +3915,158 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm30
+; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm31
; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm6
-; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm0
-; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm26
-; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm3
-; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm27
-; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm2
+; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm2
+; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm0
+; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm4
+; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm28
+; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm3
; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm13
; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm15
; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm8
-; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm29
+; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm30
; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm9
-; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm28
+; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm29
; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,7,14,0,0,7,14,0]
; AVX512-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm31
-; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm18, %zmm31
+; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm24
+; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm18, %zmm24
; AVX512-FCP-NEXT: movb $24, %r11b
; AVX512-FCP-NEXT: kmovw %r11d, %k2
-; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm31 {%k2} = zmm9[4,5,4,5],zmm29[4,5,4,5]
-; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [10,3,10,3,10,3,10,3]
-; AVX512-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k2} = zmm9[4,5,4,5],zmm30[4,5,4,5]
+; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [10,3,10,3,10,3,10,3]
+; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm14
-; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm16, %zmm14
-; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,5,6,9,0,5,6,9]
-; AVX512-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm17, %zmm14
+; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm5, %zmm14
+; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,5,6,9,0,5,6,9]
+; AVX512-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm16, %zmm14
; AVX512-FCP-NEXT: movb $-32, %r11b
; AVX512-FCP-NEXT: kmovw %r11d, %k1
-; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm31 {%k1}
-; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm14
-; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm18, %zmm14
-; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm3[4,5,4,5],zmm26[4,5,4,5]
-; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm16
-; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm17, %zmm16
-; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm14 {%k1}
-; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [13,6,13,6,13,6,13,6]
-; AVX512-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm17
-; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm24, %zmm17
-; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [9,0,7,0,9,0,7,0]
-; AVX512-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm16
-; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm20, %zmm16
-; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm16 {%k2}
-; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [11,4,11,4,11,4,11,4]
-; AVX512-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm24 {%k1}
+; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm14
+; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm7
+; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm18, %zmm14
+; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm3
+; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm4[4,5,4,5],zmm0[4,5,4,5]
+; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm2, %zmm5
+; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm16, %zmm5
+; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 {%k1}
+; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [13,6,13,6,13,6,13,6]
+; AVX512-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm5
+; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm22, %zmm5
+; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [9,0,7,0,9,0,7,0]
+; AVX512-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm16
+; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm16
+; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm16 {%k2}
+; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [11,4,11,4,11,4,11,4]
+; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm17
-; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm19, %zmm17
-; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,5,6,10,0,5,6,10]
-; AVX512-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm21, %zmm17
+; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm5, %zmm17
+; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,5,6,10,0,5,6,10]
+; AVX512-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm20, %zmm17
; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm16 {%k1}
-; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm22
-; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm24, %zmm22
-; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm17
-; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm20, %zmm17
-; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm17 {%k2}
-; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm19
-; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm21, %zmm19
-; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm17 {%k1}
-; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5]
-; AVX512-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm21
+; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm21
+; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm17
+; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm19, %zmm17
+; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2}
+; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm2, %zmm5
+; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm20, %zmm5
+; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm17 {%k1}
+; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [12,5,12,5,12,5,12,5]
+; AVX512-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm21
-; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm22, %zmm21
-; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,6,11,0,5,6,11]
-; AVX512-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm21
-; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [14,0,0,7,14,0,0,7]
+; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm21
+; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,5,6,11,0,5,6,11]
; AVX512-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm19
-; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm25, %zmm19
-; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm5
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
-; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm19, %zmm19
-; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm19 {%k1}
-; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm22
-; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm23, %zmm22
-; AVX512-FCP-NEXT: vpermi2q %zmm26, %zmm3, %zmm25
+; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm25, %zmm21
+; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [14,0,0,7,14,0,0,7]
+; AVX512-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm20
+; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm26, %zmm20
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm27 = [2,5]
+; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm4
+; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm11
+; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm5
+; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm12 = ymm4[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
+; AVX512-FCP-NEXT: vpermt2q %ymm11, %ymm27, %ymm4
+; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm20, %zmm20
+; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm20 {%k1}
+; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm2, %zmm23
+; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm25, %zmm23
; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa 464(%rdi), %xmm5
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
-; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm25, %zmm21
-; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm21 {%k1}
-; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm5
-; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm24, %zmm5
-; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,6,12,0,5,6,12]
-; AVX512-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm5
-; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [7,0,9,0,7,0,9,0]
-; AVX512-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm22
-; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm25, %zmm22
-; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm4
-; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4
-; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm22, %zmm22
-; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm22 {%k1}
-; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm24
-; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm23, %zmm24
-; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm26, %zmm25
+; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm26
; AVX512-FCP-NEXT: vmovdqa 512(%rdi), %ymm4
-; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
+; AVX512-FCP-NEXT: vmovdqa 448(%rdi), %ymm11
+; AVX512-FCP-NEXT: vpermi2q %ymm4, %ymm11, %ymm27
+; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm27, %zmm26, %zmm21
+; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm21 {%k1}
+; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm25
+; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm22, %zmm25
+; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,5,6,12,0,5,6,12]
+; AVX512-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm26, %zmm25
+; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [7,0,9,0,7,0,9,0]
+; AVX512-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm23
+; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm27, %zmm23
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm12
+; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm12, %zmm23, %zmm23
+; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm23 {%k1}
+; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm2, %zmm22
+; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm26, %zmm22
+; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm27
+; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm26
+; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm4 = ymm11[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4
-; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm25, %zmm23
-; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm23 {%k1}
+; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm27, %zmm25
+; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm25 {%k1}
; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm4
; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm18, %zmm4
-; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [4,5,6,13,4,5,6,13]
-; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm5, %zmm4
-; AVX512-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm11
-; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm12
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7]
+; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,5,6,13,4,5,6,13]
+; AVX512-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm11, %zmm4
+; AVX512-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm12
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1,2,3,4,5],ymm12[6,7]
; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [4,11]
; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm10
-; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm3, %zmm10
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm4, %zmm24
-; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm18
-; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm5, %zmm18
+; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm3, %zmm10
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm4, %zmm22
+; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm2, %zmm18
+; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm11, %zmm18
; AVX512-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm4
-; AVX512-FCP-NEXT: vmovdqa 576(%rdi), %ymm5
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
-; AVX512-FCP-NEXT: vpermi2q %zmm27, %zmm2, %zmm3
+; AVX512-FCP-NEXT: vmovdqa 576(%rdi), %ymm10
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3,4,5],ymm4[6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm0
+; AVX512-FCP-NEXT: vpermi2q %zmm28, %zmm7, %zmm3
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm3
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm18
; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm4
-; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm20, %zmm4
-; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,6,14,4,5,6,14]
-; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm10, %zmm4
-; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm11
-; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm11 = ymm12[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
+; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm19, %zmm4
+; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,5,6,14,4,5,6,14]
+; AVX512-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm11, %zmm4
+; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm12
+; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23]
; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [5,12]
; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm7
-; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm12, %zmm7
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm4
-; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm20
-; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm10, %zmm20
-; AVX512-FCP-NEXT: vmovdqa 640(%rdi), %ymm7
-; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23]
-; AVX512-FCP-NEXT: vpermi2q %zmm27, %zmm2, %zmm12
+; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm12, %zmm7
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm4, %zmm4
+; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm6, %zmm19
+; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm11, %zmm19
+; AVX512-FCP-NEXT: vmovdqa 640(%rdi), %ymm5
+; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm5 = ymm10[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
+; AVX512-FCP-NEXT: vpermi2q %zmm28, %zmm0, %zmm12
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1,2,3],ymm5[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm20, %zmm5
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm19, %zmm5
; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [2,9,2,9,2,9,2,9]
; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm7, %zmm8
@@ -4046,28 +4074,28 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm10, %zmm8
; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,4,11]
-; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm11, %zmm9
+; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm11, %zmm9
; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13]
-; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm12, %zmm1
+; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm12, %zmm1
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1
-; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm7, %zmm0
-; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm10, %zmm0
+; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm7, %zmm2
+; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm10, %zmm2
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm6
-; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm12, %zmm2
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0
+; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm12, %zmm0
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm6[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm2, %zmm0
; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 64(%rsi)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm31, (%rsi)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm24, (%rsi)
; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 64(%rdx)
; AVX512-FCP-NEXT: vmovdqa64 %zmm16, (%rdx)
; AVX512-FCP-NEXT: vmovdqa64 %zmm21, 64(%rcx)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm19, (%rcx)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm23, 64(%r8)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm22, (%r8)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 64(%r9)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm24, (%r9)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm20, (%rcx)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm25, 64(%r8)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm23, (%r8)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 64(%r9)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm22, (%r9)
; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 64(%r10)
; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%r10)
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax)
@@ -4266,153 +4294,158 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm30
+; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm31
; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm6
-; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm26
-; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm3
-; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm27
-; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm2
+; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm2
+; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm4
+; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm28
+; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm3
; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm13
; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm15
; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm8
-; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm29
+; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm30
; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm9
-; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm28
+; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm29
; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,7,14,0,0,7,14,0]
; AVX512DQ-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm31
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm18, %zmm31
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm24
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm18, %zmm24
; AVX512DQ-FCP-NEXT: movb $24, %r11b
; AVX512DQ-FCP-NEXT: kmovw %r11d, %k2
-; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm31 {%k2} = zmm9[4,5,4,5],zmm29[4,5,4,5]
-; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [10,3,10,3,10,3,10,3]
-; AVX512DQ-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k2} = zmm9[4,5,4,5],zmm30[4,5,4,5]
+; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [10,3,10,3,10,3,10,3]
+; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm14
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm16, %zmm14
-; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,5,6,9,0,5,6,9]
-; AVX512DQ-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm17, %zmm14
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm5, %zmm14
+; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,5,6,9,0,5,6,9]
+; AVX512DQ-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm16, %zmm14
; AVX512DQ-FCP-NEXT: movb $-32, %r11b
; AVX512DQ-FCP-NEXT: kmovw %r11d, %k1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm31 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm14
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm18, %zmm14
-; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm3[4,5,4,5],zmm26[4,5,4,5]
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm16
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm17, %zmm16
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm14 {%k1}
-; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [13,6,13,6,13,6,13,6]
-; AVX512DQ-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm17
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm24, %zmm17
-; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [9,0,7,0,9,0,7,0]
-; AVX512DQ-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm16
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm20, %zmm16
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm16 {%k2}
-; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [11,4,11,4,11,4,11,4]
-; AVX512DQ-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm24 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm14
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm7
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm18, %zmm14
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm3
+; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm4[4,5,4,5],zmm0[4,5,4,5]
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm2, %zmm5
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm16, %zmm5
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 {%k1}
+; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [13,6,13,6,13,6,13,6]
+; AVX512DQ-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm5
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm22, %zmm5
+; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [9,0,7,0,9,0,7,0]
+; AVX512DQ-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm16
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm16
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm16 {%k2}
+; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [11,4,11,4,11,4,11,4]
+; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm17
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm19, %zmm17
-; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,5,6,10,0,5,6,10]
-; AVX512DQ-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm21, %zmm17
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm5, %zmm17
+; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,5,6,10,0,5,6,10]
+; AVX512DQ-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm20, %zmm17
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm16 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm22
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm24, %zmm22
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm17
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm20, %zmm17
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm17 {%k2}
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm19
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm21, %zmm19
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm17 {%k1}
-; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5]
-; AVX512DQ-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm21
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm21
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm17
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm19, %zmm17
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2}
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm2, %zmm5
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm20, %zmm5
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm17 {%k1}
+; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [12,5,12,5,12,5,12,5]
+; AVX512DQ-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm21
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm22, %zmm21
-; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,6,11,0,5,6,11]
-; AVX512DQ-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm21
-; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [14,0,0,7,14,0,0,7]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm21
+; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,5,6,11,0,5,6,11]
; AVX512DQ-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm19
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm25, %zmm19
-; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm5
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm19, %zmm19
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm19 {%k1}
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm22
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm23, %zmm22
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm26, %zmm3, %zmm25
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm25, %zmm21
+; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [14,0,0,7,14,0,0,7]
+; AVX512DQ-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm20
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm26, %zmm20
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm27 = [2,5]
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm4
+; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm11
+; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm5
+; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm12 = ymm4[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
+; AVX512DQ-FCP-NEXT: vpermt2q %ymm11, %ymm27, %ymm4
+; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm20, %zmm20
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm20 {%k1}
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm2, %zmm23
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm25, %zmm23
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa 464(%rdi), %xmm5
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm25, %zmm21
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm21 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm5
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm24, %zmm5
-; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,6,12,0,5,6,12]
-; AVX512DQ-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm5
-; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [7,0,9,0,7,0,9,0]
-; AVX512DQ-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm22
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm25, %zmm22
-; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm4
-; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4
-; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm22, %zmm22
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm22 {%k1}
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm24
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm23, %zmm24
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm26, %zmm25
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm3, %zmm26
; AVX512DQ-FCP-NEXT: vmovdqa 512(%rdi), %ymm4
-; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
+; AVX512DQ-FCP-NEXT: vmovdqa 448(%rdi), %ymm11
+; AVX512DQ-FCP-NEXT: vpermi2q %ymm4, %ymm11, %ymm27
+; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm27, %zmm26, %zmm21
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm21 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm25
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm22, %zmm25
+; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,5,6,12,0,5,6,12]
+; AVX512DQ-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm26, %zmm25
+; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [7,0,9,0,7,0,9,0]
+; AVX512DQ-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm23
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm27, %zmm23
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm12
+; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm12, %zmm23, %zmm23
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm23 {%k1}
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm2, %zmm22
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm26, %zmm22
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm27
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm26
+; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm4 = ymm11[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4
-; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm25, %zmm23
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm23 {%k1}
+; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm27, %zmm25
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm25 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm4
; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm18, %zmm4
-; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [4,5,6,13,4,5,6,13]
-; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm5, %zmm4
-; AVX512DQ-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm11
-; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm12
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7]
+; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,5,6,13,4,5,6,13]
+; AVX512DQ-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm11, %zmm4
+; AVX512DQ-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm12
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1,2,3,4,5],ymm12[6,7]
; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [4,11]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm10
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm3, %zmm10
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm4, %zmm24
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm18
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm5, %zmm18
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm3, %zmm10
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm4, %zmm22
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm2, %zmm18
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm11, %zmm18
; AVX512DQ-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm4
-; AVX512DQ-FCP-NEXT: vmovdqa 576(%rdi), %ymm5
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm27, %zmm2, %zmm3
+; AVX512DQ-FCP-NEXT: vmovdqa 576(%rdi), %ymm10
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm0
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm28, %zmm7, %zmm3
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm3
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm18
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm4
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm20, %zmm4
-; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,6,14,4,5,6,14]
-; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm10, %zmm4
-; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm11
-; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm11 = ymm12[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm19, %zmm4
+; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [4,5,6,14,4,5,6,14]
+; AVX512DQ-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm11, %zmm4
+; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm12
+; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23]
; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [5,12]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm7
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm12, %zmm7
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm4
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm20
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm10, %zmm20
-; AVX512DQ-FCP-NEXT: vmovdqa 640(%rdi), %ymm7
-; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23]
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm27, %zmm2, %zmm12
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm12, %zmm7
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm4, %zmm4
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm6, %zmm19
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm11, %zmm19
+; AVX512DQ-FCP-NEXT: vmovdqa 640(%rdi), %ymm5
+; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm5 = ymm10[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm28, %zmm0, %zmm12
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1,2,3],ymm5[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm20, %zmm5
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm19, %zmm5
; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [2,9,2,9,2,9,2,9]
; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm7, %zmm8
@@ -4420,28 +4453,28 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm10, %zmm8
; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,4,11]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm11, %zmm9
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm11, %zmm9
; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm12, %zmm1
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm12, %zmm1
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm7, %zmm0
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm10, %zmm0
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm7, %zmm2
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm10, %zmm2
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm6
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm12, %zmm2
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm12, %zmm0
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm2, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 64(%rsi)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, (%rsi)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, (%rsi)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 64(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, (%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 64(%rcx)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, (%rcx)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, 64(%r8)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, (%r8)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 64(%r9)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, (%r9)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, (%rcx)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, 64(%r8)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, (%r8)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 64(%r9)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, (%r9)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 64(%r10)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%r10)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax)
@@ -4640,186 +4673,189 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm3
+; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm12
; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm6
-; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm0
-; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm26
+; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm1
+; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm0
; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm4
-; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm28
-; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm2
+; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm7
+; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm11
; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm13
; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm15
; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm8
-; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm30
+; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm24
; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm9
-; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm29
-; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
+; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm31
+; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,7,14,0,0,7,14,0]
; AVX512BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm31
-; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm18, %zmm31
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3
+; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm18, %zmm3
; AVX512BW-FCP-NEXT: movb $24, %r11b
; AVX512BW-FCP-NEXT: kmovd %r11d, %k2
-; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm31 {%k2} = zmm9[4,5,4,5],zmm30[4,5,4,5]
-; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [10,3,10,3,10,3,10,3]
-; AVX512BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm9[4,5,4,5],zmm24[4,5,4,5]
+; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [10,3,10,3,10,3,10,3]
+; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm14
-; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm16, %zmm14
-; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,5,6,9,0,5,6,9]
-; AVX512BW-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm17, %zmm14
+; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm5, %zmm14
+; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,5,6,9,0,5,6,9]
+; AVX512BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm16, %zmm14
; AVX512BW-FCP-NEXT: movb $-32, %r11b
; AVX512BW-FCP-NEXT: kmovd %r11d, %k1
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm31 {%k1}
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm14
-; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm18, %zmm14
-; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm4[4,5,4,5],zmm26[4,5,4,5]
-; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm16
-; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm17, %zmm16
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm14 {%k1}
-; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [13,6,13,6,13,6,13,6]
-; AVX512BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm17
-; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm24, %zmm17
-; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [9,0,7,0,9,0,7,0]
-; AVX512BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm16
-; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm20, %zmm16
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm16 {%k2}
-; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [11,4,11,4,11,4,11,4]
-; AVX512BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 {%k1}
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm14
+; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm18, %zmm14
+; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm4[4,5,4,5],zmm0[4,5,4,5]
+; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm1, %zmm5
+; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm16, %zmm5
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 {%k1}
+; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [13,6,13,6,13,6,13,6]
+; AVX512BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm5
+; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm22, %zmm5
+; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [9,0,7,0,9,0,7,0]
+; AVX512BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm16
+; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm19, %zmm16
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm16 {%k2}
+; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [11,4,11,4,11,4,11,4]
+; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm17
-; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm19, %zmm17
-; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,5,6,10,0,5,6,10]
-; AVX512BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm21, %zmm17
+; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm5, %zmm17
+; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,5,6,10,0,5,6,10]
+; AVX512BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm20, %zmm17
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm16 {%k1}
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm22
-; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm24, %zmm22
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm17
-; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm20, %zmm17
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm17 {%k2}
-; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm19
-; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm19
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm17 {%k1}
-; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5]
-; AVX512BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm21
+; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm21
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm17
+; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm19, %zmm17
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2}
+; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm1, %zmm5
+; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm20, %zmm5
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm17 {%k1}
+; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [12,5,12,5,12,5,12,5]
+; AVX512BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm21
-; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm22, %zmm21
-; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,6,11,0,5,6,11]
-; AVX512BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm21
-; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [14,0,0,7,14,0,0,7]
+; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm21
+; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,5,6,11,0,5,6,11]
; AVX512BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm19
-; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm25, %zmm19
-; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm5
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
-; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm19, %zmm19
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm19 {%k1}
-; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm22
-; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm22
-; AVX512BW-FCP-NEXT: vpermi2q %zmm26, %zmm4, %zmm25
+; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm25, %zmm21
+; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [14,0,0,7,14,0,0,7]
+; AVX512BW-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm20
+; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm26, %zmm20
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm27 = [2,5]
+; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %ymm28
+; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm29
+; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm5
+; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm30 = ymm28[8,9,10,11,12,13,14,15],ymm29[0,1,2,3,4,5,6,7],ymm28[24,25,26,27,28,29,30,31],ymm29[16,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT: vpermt2q %ymm29, %ymm27, %ymm28
+; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm28, %zmm20, %zmm20
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm20 {%k1}
+; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm1, %zmm23
+; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm25, %zmm23
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa 464(%rdi), %xmm5
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
-; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm25, %zmm21
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm21 {%k1}
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm5
-; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm24, %zmm5
-; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,6,12,0,5,6,12]
-; AVX512BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm5
-; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [7,0,9,0,7,0,9,0]
-; AVX512BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm22
-; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm25, %zmm22
-; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm27
-; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm27 = mem[8,9,10,11,12,13,14,15],ymm27[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm27[16,17,18,19,20,21,22,23]
-; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm27, %xmm27
-; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm27, %zmm22, %zmm22
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm22 {%k1}
-; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm24
-; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm24
-; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm26, %zmm25
-; AVX512BW-FCP-NEXT: vmovdqa 512(%rdi), %ymm5
-; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5
-; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm25, %zmm23
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm23 {%k1}
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm5
-; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm18, %zmm5
-; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [4,5,6,13,4,5,6,13]
-; AVX512BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm25, %zmm5
+; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm26
+; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %ymm25
+; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %ymm28
+; AVX512BW-FCP-NEXT: vpermi2q %ymm25, %ymm28, %ymm27
+; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm27, %zmm26, %zmm21
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm21 {%k1}
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm26
+; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm22, %zmm26
+; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,5,6,12,0,5,6,12]
+; AVX512BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm27, %zmm26
+; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [7,0,9,0,7,0,9,0]
+; AVX512BW-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm23
+; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm29, %zmm23
+; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm30, %xmm30
+; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm30, %zmm23, %zmm23
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm23 {%k1}
+; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm1, %zmm22
+; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm27, %zmm22
+; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm0, %zmm29
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm30
+; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm25 = ymm28[8,9,10,11,12,13,14,15],ymm25[0,1,2,3,4,5,6,7],ymm28[24,25,26,27,28,29,30,31],ymm25[16,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm25, %xmm25
+; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm25, %zmm29, %zmm25
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm25 {%k1}
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm22
+; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm18, %zmm22
+; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [4,5,6,13,4,5,6,13]
+; AVX512BW-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm26, %zmm22
; AVX512BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm4
-; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm11
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3,4,5],ymm4[6,7]
-; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [4,11]
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10
-; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm12, %zmm10
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,11]
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10
+; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm10
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm24
-; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm18
-; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm25, %zmm18
+; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm22, %zmm22
+; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm1, %zmm18
+; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm26, %zmm18
; AVX512BW-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm4
-; AVX512BW-FCP-NEXT: vmovdqa 576(%rdi), %ymm5
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
-; AVX512BW-FCP-NEXT: vpermi2q %zmm28, %zmm2, %zmm12
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm25
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm10
-; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm20, %zmm10
-; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,5,6,14,4,5,6,14]
-; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm12, %zmm10
-; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm18
-; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm18[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm18[16,17,18,19,20,21,22,23]
-; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm4 = [5,12]
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7
-; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm4, %zmm7
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7]
-; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm10, %zmm7
-; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm20
-; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm20
-; AVX512BW-FCP-NEXT: vmovdqa 640(%rdi), %ymm10
-; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
-; AVX512BW-FCP-NEXT: vpermi2q %zmm28, %zmm2, %zmm4
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
-; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4
+; AVX512BW-FCP-NEXT: vmovdqa 576(%rdi), %ymm10
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3,4,5],ymm4[6,7]
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm28
+; AVX512BW-FCP-NEXT: vpermi2q %zmm7, %zmm11, %zmm0
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm27
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm4
+; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm19, %zmm4
+; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [4,5,6,14,4,5,6,14]
+; AVX512BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm18, %zmm4
+; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm26
+; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[8,9,10,11,12,13,14,15],ymm26[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm26[16,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [5,12]
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7
+; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm7
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7]
+; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm4, %zmm4
+; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm6, %zmm19
+; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm18, %zmm19
+; AVX512BW-FCP-NEXT: vmovdqa 640(%rdi), %ymm5
+; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm5 = ymm10[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT: vpermi2q %zmm28, %zmm11, %zmm0
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
+; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0
; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [2,9,2,9,2,9,2,9]
; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm5, %zmm8
-; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,8,15,4,5,8,15]
-; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm10, %zmm8
-; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,4,11]
-; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm11, %zmm9
-; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13]
-; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm12, %zmm1
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7]
-; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1
-; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm5, %zmm0
-; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm10, %zmm0
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm3
-; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm12, %zmm2
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0
+; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [4,5,8,15,4,5,8,15]
+; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm7, %zmm8
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [0,0,4,11]
+; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm10, %zmm9
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm13 = [6,13]
+; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm13, %zmm2
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
+; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm8, %zmm2
+; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm5, %zmm1
+; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm7, %zmm1
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm10, %zmm5
+; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm13, %zmm11
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm5[4,5,6,7]
+; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm1, %zmm1
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 64(%rsi)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, (%rsi)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rsi)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, 64(%rdx)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, (%rdx)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, 64(%rcx)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, (%rcx)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, 64(%r8)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, (%r8)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 64(%r9)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, (%r9)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%r10)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, (%r10)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, (%rax)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, (%rcx)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 64(%r8)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, (%r8)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, 64(%r9)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, (%r9)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%r10)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%r10)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
@@ -5014,186 +5050,189 @@ define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm3
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm12
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm6
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm26
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm4
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm28
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm2
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm7
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm11
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm13
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm15
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm8
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm30
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm24
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm9
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm29
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm31
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,7,14,0,0,7,14,0]
; AVX512DQ-BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm31
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm18, %zmm31
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm18, %zmm3
; AVX512DQ-BW-FCP-NEXT: movb $24, %r11b
; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k2
-; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm31 {%k2} = zmm9[4,5,4,5],zmm30[4,5,4,5]
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [10,3,10,3,10,3,10,3]
-; AVX512DQ-BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k2} = zmm9[4,5,4,5],zmm24[4,5,4,5]
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [10,3,10,3,10,3,10,3]
+; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm14
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm16, %zmm14
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,5,6,9,0,5,6,9]
-; AVX512DQ-BW-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm17, %zmm14
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm5, %zmm14
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [0,5,6,9,0,5,6,9]
+; AVX512DQ-BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm16, %zmm14
; AVX512DQ-BW-FCP-NEXT: movb $-32, %r11b
; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k1
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm31 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm14
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm18, %zmm14
-; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm4[4,5,4,5],zmm26[4,5,4,5]
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm16
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm17, %zmm16
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm14 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [13,6,13,6,13,6,13,6]
-; AVX512DQ-BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm17
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm24, %zmm17
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [9,0,7,0,9,0,7,0]
-; AVX512DQ-BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm16
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm20, %zmm16
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm16 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [11,4,11,4,11,4,11,4]
-; AVX512DQ-BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm3 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm14
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm18, %zmm14
+; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm4[4,5,4,5],zmm0[4,5,4,5]
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm1, %zmm5
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm16, %zmm5
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [13,6,13,6,13,6,13,6]
+; AVX512DQ-BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm5
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm22, %zmm5
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [9,0,7,0,9,0,7,0]
+; AVX512DQ-BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm16
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm19, %zmm16
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm16 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [11,4,11,4,11,4,11,4]
+; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm17
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm19, %zmm17
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,5,6,10,0,5,6,10]
-; AVX512DQ-BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm21, %zmm17
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm5, %zmm17
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,5,6,10,0,5,6,10]
+; AVX512DQ-BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm20, %zmm17
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm16 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm22
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm24, %zmm22
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm17
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm20, %zmm17
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm17 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm19
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm19
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm17 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5]
-; AVX512DQ-BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm21
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm22, %zmm21
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm17
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm19, %zmm17
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm17 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm1, %zmm5
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm20, %zmm5
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm17 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [12,5,12,5,12,5,12,5]
+; AVX512DQ-BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm21
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm22, %zmm21
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,6,11,0,5,6,11]
-; AVX512DQ-BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm21
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [14,0,0,7,14,0,0,7]
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm21
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,5,6,11,0,5,6,11]
; AVX512DQ-BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm19
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm25, %zmm19
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm5
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm19, %zmm19
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm19 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm22
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm22
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm26, %zmm4, %zmm25
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm25, %zmm21
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [14,0,0,7,14,0,0,7]
+; AVX512DQ-BW-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm20
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm26, %zmm20
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm27 = [2,5]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %ymm28
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm29
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm5
+; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm30 = ymm28[8,9,10,11,12,13,14,15],ymm29[0,1,2,3,4,5,6,7],ymm28[24,25,26,27,28,29,30,31],ymm29[16,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %ymm29, %ymm27, %ymm28
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm28, %zmm20, %zmm20
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm20 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm1, %zmm23
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm25, %zmm23
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 464(%rdi), %xmm5
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm25, %zmm21
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm21 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm5
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm24, %zmm5
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,6,12,0,5,6,12]
-; AVX512DQ-BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm5
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [7,0,9,0,7,0,9,0]
-; AVX512DQ-BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm22
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm25, %zmm22
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm27
-; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm27 = mem[8,9,10,11,12,13,14,15],ymm27[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm27[16,17,18,19,20,21,22,23]
-; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm27, %xmm27
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm27, %zmm22, %zmm22
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm22 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm24
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm24
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm26, %zmm25
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 512(%rdi), %ymm5
-; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm25, %zmm23
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm23 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm5
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm18, %zmm5
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [4,5,6,13,4,5,6,13]
-; AVX512DQ-BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm25, %zmm5
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm26
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %ymm25
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %ymm28
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm25, %ymm28, %ymm27
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm27, %zmm26, %zmm21
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm21 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm26
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm22, %zmm26
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm27 = [0,5,6,12,0,5,6,12]
+; AVX512DQ-BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm27, %zmm26
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm29 = [7,0,9,0,7,0,9,0]
+; AVX512DQ-BW-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm23
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm29, %zmm23
+; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm30, %xmm30
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm30, %zmm23, %zmm23
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm23 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm1, %zmm22
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm27, %zmm22
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm0, %zmm29
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm30
+; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm25 = ymm28[8,9,10,11,12,13,14,15],ymm25[0,1,2,3,4,5,6,7],ymm28[24,25,26,27,28,29,30,31],ymm25[16,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm25, %xmm25
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm25, %zmm29, %zmm25
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm25 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm22
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm18, %zmm22
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [4,5,6,13,4,5,6,13]
+; AVX512DQ-BW-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm26, %zmm22
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm4
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm11
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3,4,5],ymm4[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [4,11]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm12, %zmm10
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,11]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm10
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm24
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm18
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm25, %zmm18
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm22, %zmm22
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm1, %zmm18
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm26, %zmm18
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm4
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 576(%rdi), %ymm5
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm28, %zmm2, %zmm12
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm25
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm10
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm20, %zmm10
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,5,6,14,4,5,6,14]
-; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm12, %zmm10
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm18
-; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm18[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm18[16,17,18,19,20,21,22,23]
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm4 = [5,12]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm4, %zmm7
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm10, %zmm7
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm20
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm20
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 640(%rdi), %ymm10
-; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm28, %zmm2, %zmm4
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 576(%rdi), %ymm10
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm28
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm7, %zmm11, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm27
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm19, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [4,5,6,14,4,5,6,14]
+; AVX512DQ-BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm18, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm26
+; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[8,9,10,11,12,13,14,15],ymm26[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm26[16,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [5,12]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm7
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm4, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm6, %zmm19
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm18, %zmm19
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 640(%rdi), %ymm5
+; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm5 = ymm10[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm28, %zmm11, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0
; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [2,9,2,9,2,9,2,9]
; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm5, %zmm8
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,8,15,4,5,8,15]
-; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm10, %zmm8
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,4,11]
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm11, %zmm9
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13]
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm12, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm5, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm10, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm12, %zmm2
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [4,5,8,15,4,5,8,15]
+; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm7, %zmm8
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm10 = [0,0,4,11]
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm10, %zmm9
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm13 = [6,13]
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm13, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm8, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm5, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm7, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm10, %zmm5
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm13, %zmm11
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm1, %zmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 64(%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, (%rsi)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rsi)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, 64(%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, (%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 64(%rcx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, (%rcx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, 64(%r8)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, (%r8)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 64(%r9)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, (%r9)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%r10)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, (%r10)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, (%rax)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 64(%r8)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, (%r8)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, 64(%r9)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, (%r9)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%r10)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%r10)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%wide.vec = load <112 x i64>, ptr %in.vec, align 64
@@ -5776,411 +5815,412 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: subq $1736, %rsp # imm = 0x6C8
; AVX-NEXT: vmovaps 1216(%rdi), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 768(%rdi), %ymm3
-; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 320(%rdi), %ymm7
+; AVX-NEXT: vmovaps 768(%rdi), %ymm4
+; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovaps 320(%rdi), %ymm10
; AVX-NEXT: vinsertf128 $1, 384(%rdi), %ymm0, %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1,2,3,4,5],ymm0[6,7]
; AVX-NEXT: vmovaps 224(%rdi), %xmm5
-; AVX-NEXT: vmovaps 272(%rdi), %xmm2
-; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT: vmovaps 272(%rdi), %xmm0
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm5[0,1],xmm0[2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vinsertf128 $1, 832(%rdi), %ymm0, %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm0[6,7]
; AVX-NEXT: vmovaps 672(%rdi), %xmm6
-; AVX-NEXT: vmovaps 720(%rdi), %xmm2
-; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm6[0,1],xmm2[2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT: vmovaps 720(%rdi), %xmm0
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1],xmm0[2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vinsertf128 $1, 1280(%rdi), %ymm0, %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX-NEXT: vmovaps 1120(%rdi), %xmm11
-; AVX-NEXT: vmovaps 1168(%rdi), %xmm2
-; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm11[0,1],xmm2[2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT: vmovaps 1168(%rdi), %xmm0
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm11[0,1],xmm0[2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 1664(%rdi), %ymm14
+; AVX-NEXT: vmovaps 1664(%rdi), %ymm3
; AVX-NEXT: vinsertf128 $1, 1728(%rdi), %ymm0, %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm0[6,7]
; AVX-NEXT: vmovaps 1568(%rdi), %xmm8
-; AVX-NEXT: vmovaps 1616(%rdi), %xmm2
-; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm8[0,1],xmm2[2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT: vmovaps 1616(%rdi), %xmm0
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm8[0,1],xmm0[2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm2[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovapd 96(%rdi), %ymm4
; AVX-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm0
; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3]
-; AVX-NEXT: vmovapd 48(%rdi), %xmm1
-; AVX-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendpd {{.*#+}} xmm2 = mem[0],xmm1[1]
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3]
+; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1,2],ymm0[3]
+; AVX-NEXT: vmovapd 48(%rdi), %xmm0
+; AVX-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vblendpd {{.*#+}} xmm7 = mem[0],xmm0[1]
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm2[2,3]
; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovapd 544(%rdi), %ymm9
; AVX-NEXT: vinsertf128 $1, 608(%rdi), %ymm0, %ymm0
; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3]
-; AVX-NEXT: vmovapd 448(%rdi), %xmm10
-; AVX-NEXT: vmovapd 496(%rdi), %xmm1
-; AVX-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendpd {{.*#+}} xmm2 = xmm10[0],xmm1[1]
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3]
+; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm9[0,1,2],ymm0[3]
+; AVX-NEXT: vmovapd 448(%rdi), %xmm13
+; AVX-NEXT: vmovapd 496(%rdi), %xmm0
+; AVX-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vblendpd {{.*#+}} xmm7 = xmm13[0],xmm0[1]
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm2[2,3]
; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovapd 992(%rdi), %ymm15
; AVX-NEXT: vinsertf128 $1, 1056(%rdi), %ymm0, %ymm0
; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3]
+; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm15[0,1,2],ymm0[3]
; AVX-NEXT: vmovapd 896(%rdi), %xmm12
-; AVX-NEXT: vmovapd 944(%rdi), %xmm1
-; AVX-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendpd {{.*#+}} xmm2 = xmm12[0],xmm1[1]
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3]
+; AVX-NEXT: vmovapd 944(%rdi), %xmm0
+; AVX-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vblendpd {{.*#+}} xmm7 = xmm12[0],xmm0[1]
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm2[2,3]
; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 1440(%rdi), %ymm2
+; AVX-NEXT: vmovapd 1440(%rdi), %ymm14
; AVX-NEXT: vinsertf128 $1, 1504(%rdi), %ymm0, %ymm0
; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1,2],ymm0[3]
-; AVX-NEXT: vmovapd 1344(%rdi), %xmm3
+; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm14[0,1,2],ymm0[3]
+; AVX-NEXT: vmovapd 1344(%rdi), %xmm2
; AVX-NEXT: vmovapd 1392(%rdi), %xmm0
; AVX-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendpd {{.*#+}} xmm13 = xmm3[0],xmm0[1]
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm13[0,1],ymm1[2,3]
+; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3]
; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovdqa 288(%rdi), %xmm0
-; AVX-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
-; AVX-NEXT: vmovapd 384(%rdi), %ymm0
-; AVX-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0],ymm0[1],ymm7[3],ymm0[2]
-; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm7[2,3]
-; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovdqa 736(%rdi), %xmm5
-; AVX-NEXT: vpalignr {{.*#+}} xmm5 = xmm6[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm5[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vmovapd 384(%rdi), %ymm5
+; AVX-NEXT: vshufpd {{.*#+}} ymm7 = ymm10[0],ymm5[1],ymm10[3],ymm5[2]
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm7[2,3]
+; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovdqa 736(%rdi), %xmm0
+; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm6[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
; AVX-NEXT: vmovapd 832(%rdi), %ymm7
; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[3],ymm7[2]
-; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3]
-; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovdqa 1184(%rdi), %xmm1
-; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm11[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
-; AVX-NEXT: vmovapd 1280(%rdi), %ymm6
-; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[3],ymm6[2]
-; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3]
-; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovdqa 1632(%rdi), %xmm1
-; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm8[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
-; AVX-NEXT: vmovapd 1728(%rdi), %ymm11
-; AVX-NEXT: vshufpd {{.*#+}} ymm5 = ymm14[0],ymm11[1],ymm14[3],ymm11[2]
-; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3]
-; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 160(%rdi), %ymm8
-; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm8[1],ymm4[3],ymm8[2]
-; AVX-NEXT: vmovdqa 64(%rdi), %xmm1
-; AVX-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
-; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3]
-; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 608(%rdi), %ymm14
-; AVX-NEXT: vshufpd {{.*#+}} ymm5 = ymm9[0],ymm14[1],ymm9[3],ymm14[2]
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
+; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovdqa 1184(%rdi), %xmm0
+; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm11[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vmovapd 1280(%rdi), %ymm1
+; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm1[1],ymm6[3],ymm1[2]
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3]
+; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovdqa 1632(%rdi), %xmm0
+; AVX-NEXT: vpalignr {{.*#+}} xmm6 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vmovapd 1728(%rdi), %ymm0
+; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm0[1],ymm3[3],ymm0[2]
+; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3]
+; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovapd 160(%rdi), %ymm10
+; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm4[0],ymm10[1],ymm4[3],ymm10[2]
+; AVX-NEXT: vmovdqa 64(%rdi), %xmm11
+; AVX-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3]
+; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovapd 608(%rdi), %ymm3
+; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm9[0],ymm3[1],ymm9[3],ymm3[2]
; AVX-NEXT: vmovdqa 512(%rdi), %xmm4
-; AVX-NEXT: vpalignr {{.*#+}} xmm9 = xmm10[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
-; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm9[0,1],ymm5[2,3]
-; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 1056(%rdi), %ymm13
-; AVX-NEXT: vshufpd {{.*#+}} ymm5 = ymm15[0],ymm13[1],ymm15[3],ymm13[2]
-; AVX-NEXT: vmovdqa 960(%rdi), %xmm15
-; AVX-NEXT: vpalignr {{.*#+}} xmm9 = xmm12[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7]
-; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm9[0,1],ymm5[2,3]
-; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 1504(%rdi), %ymm5
-; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[3],ymm5[2]
+; AVX-NEXT: vpalignr {{.*#+}} xmm8 = xmm13[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm8[0,1],ymm6[2,3]
+; AVX-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovapd 1056(%rdi), %ymm8
+; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm15[0],ymm8[1],ymm15[3],ymm8[2]
+; AVX-NEXT: vmovdqa 960(%rdi), %xmm13
+; AVX-NEXT: vpalignr {{.*#+}} xmm9 = xmm12[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm9[0,1],ymm6[2,3]
+; AVX-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovapd 1504(%rdi), %ymm12
+; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm14[0],ymm12[1],ymm14[3],ymm12[2]
; AVX-NEXT: vmovdqa 1408(%rdi), %xmm9
-; AVX-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7]
-; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3]
-; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovdqa 352(%rdi), %xmm3
-; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm2
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3]
-; AVX-NEXT: vmovapd 240(%rdi), %xmm2
-; AVX-NEXT: vblendpd {{.*#+}} xmm10 = xmm2[0],mem[1]
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,3]
-; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovdqa 800(%rdi), %xmm0
-; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm10
-; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm10[0,1,2],ymm7[3]
-; AVX-NEXT: vmovapd 688(%rdi), %xmm10
-; AVX-NEXT: vblendpd {{.*#+}} xmm12 = xmm10[0],mem[1]
-; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm12[0,1],ymm7[2,3]
-; AVX-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 1248(%rdi), %xmm7
-; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm12
-; AVX-NEXT: vblendpd {{.*#+}} ymm12 = ymm12[0,1,2],ymm6[3]
-; AVX-NEXT: vmovapd 1136(%rdi), %xmm6
-; AVX-NEXT: vblendpd {{.*#+}} xmm7 = xmm6[0],mem[1]
-; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm12[2,3]
-; AVX-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 1696(%rdi), %xmm7
-; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7
-; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3]
-; AVX-NEXT: vmovapd 1584(%rdi), %xmm11
-; AVX-NEXT: vblendpd {{.*#+}} xmm12 = xmm11[0],mem[1]
-; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm12[0,1],ymm7[2,3]
-; AVX-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovdqa 128(%rdi), %xmm12
-; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm7
-; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3]
-; AVX-NEXT: vmovdqa 16(%rdi), %xmm8
-; AVX-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0,1,2,3],xmm1[4,5,6,7]
-; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm7[2,3]
-; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 576(%rdi), %xmm1
-; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm14[3]
-; AVX-NEXT: vmovdqa 464(%rdi), %xmm8
-; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2,3],xmm4[4,5,6,7]
-; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3]
-; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovdqa 1024(%rdi), %xmm14
-; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm1
-; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm1[0,1,2],ymm13[3]
-; AVX-NEXT: vmovdqa 912(%rdi), %xmm1
-; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm1[0,1,2,3],xmm15[4,5,6,7]
-; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3]
-; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovdqa 1472(%rdi), %xmm4
-; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm7
-; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3]
-; AVX-NEXT: vmovdqa 1360(%rdi), %xmm15
-; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm15[0,1,2,3],xmm9[4,5,6,7]
-; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3]
-; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 304(%rdi), %xmm9
-; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[1],ymm9[0],ymm2[2],ymm9[3]
-; AVX-NEXT: vmovdqa 416(%rdi), %xmm13
-; AVX-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7]
-; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
-; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3]
+; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm6[2,3]
; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 752(%rdi), %xmm5
-; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm10[1],ymm5[0],ymm10[2],ymm5[3]
-; AVX-NEXT: vmovdqa 864(%rdi), %xmm10
-; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7]
-; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3]
-; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 1200(%rdi), %xmm0
-; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm6[1],ymm0[0],ymm6[2],ymm0[3]
-; AVX-NEXT: vmovdqa 1312(%rdi), %xmm6
-; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm2 # 16-byte Folded Reload
-; AVX-NEXT: # xmm2 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vmovaps 352(%rdi), %xmm2
+; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3]
-; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 1648(%rdi), %xmm0
-; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm11[1],ymm0[0],ymm11[2],ymm0[3]
-; AVX-NEXT: vmovdqa 1760(%rdi), %xmm2
-; AVX-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill
-; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
-; AVX-NEXT: # xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm5[3]
+; AVX-NEXT: vmovapd 240(%rdi), %xmm5
+; AVX-NEXT: vblendpd {{.*#+}} xmm6 = xmm5[0],mem[1]
+; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3]
+; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovaps 800(%rdi), %xmm2
+; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3]
+; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1,2],ymm7[3]
+; AVX-NEXT: vmovapd 688(%rdi), %xmm7
+; AVX-NEXT: vblendpd {{.*#+}} xmm9 = xmm7[0],mem[1]
+; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm9[0,1],ymm2[2,3]
+; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovdqa 1248(%rdi), %xmm9
+; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm2
+; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3]
+; AVX-NEXT: vmovapd 1136(%rdi), %xmm14
+; AVX-NEXT: vblendpd {{.*#+}} xmm2 = xmm14[0],mem[1]
+; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3]
+; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovdqa 1696(%rdi), %xmm15
+; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm1
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3]
+; AVX-NEXT: vmovapd 1584(%rdi), %xmm2
+; AVX-NEXT: vblendpd {{.*#+}} xmm1 = xmm2[0],mem[1]
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 1424(%rdi), %xmm0
+; AVX-NEXT: vmovaps 128(%rdi), %xmm0
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm10[3]
+; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm11[4,5,6,7]
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm15[1],ymm0[0],ymm15[2],ymm0[3]
-; AVX-NEXT: vmovdqa 1536(%rdi), %xmm11
-; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm4[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7]
-; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3]
+; AVX-NEXT: vmovdqa 576(%rdi), %xmm10
+; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm0
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3]
+; AVX-NEXT: vmovdqa 464(%rdi), %xmm11
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm11[0,1,2,3],xmm4[4,5,6,7]
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 976(%rdi), %xmm15
-; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[1],ymm15[0],ymm1[2],ymm15[3]
-; AVX-NEXT: vmovdqa 1088(%rdi), %xmm1
-; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm14[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
-; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3]
+; AVX-NEXT: vmovdqa 1024(%rdi), %xmm6
+; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3]
+; AVX-NEXT: vmovdqa 912(%rdi), %xmm3
+; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3],xmm13[4,5,6,7]
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3]
; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 528(%rdi), %xmm0
-; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm8[1],ymm0[0],ymm8[2],ymm0[3]
-; AVX-NEXT: vmovdqa 640(%rdi), %xmm4
-; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm3 # 16-byte Folded Reload
-; AVX-NEXT: # xmm3 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
-; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
-; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3]
-; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 80(%rdi), %xmm3
-; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[1],ymm3[0],ymm2[2],ymm3[3]
-; AVX-NEXT: vmovdqa 192(%rdi), %xmm7
-; AVX-NEXT: vpalignr {{.*#+}} xmm8 = xmm12[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vmovdqa 1472(%rdi), %xmm1
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm4
+; AVX-NEXT: vblendpd {{.*#+}} ymm8 = ymm4[0,1,2],ymm12[3]
+; AVX-NEXT: vmovapd 1360(%rdi), %xmm4
+; AVX-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm12 # 16-byte Folded Reload
+; AVX-NEXT: # xmm12 = xmm4[0],mem[1]
+; AVX-NEXT: vblendpd {{.*#+}} ymm8 = ymm12[0,1],ymm8[2,3]
+; AVX-NEXT: vmovupd %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovdqa 416(%rdi), %xmm13
+; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm8 # 16-byte Folded Reload
+; AVX-NEXT: # xmm8 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7]
; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8
-; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm8[2,3]
-; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovdqa 304(%rdi), %xmm12
+; AVX-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7]
+; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovdqa 864(%rdi), %xmm8
+; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm5 # 16-byte Folded Reload
+; AVX-NEXT: # xmm5 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
+; AVX-NEXT: vmovdqa 752(%rdi), %xmm0
+; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7]
+; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovdqa 1312(%rdi), %xmm5
+; AVX-NEXT: vpalignr {{.*#+}} xmm7 = xmm9[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7
-; AVX-NEXT: vmovapd 128(%rdi), %ymm2
-; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm2[0,1,2],ymm7[3]
-; AVX-NEXT: vblendpd {{.*#+}} xmm3 = mem[0],xmm3[1]
-; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm7[2,3]
+; AVX-NEXT: vmovdqa 1200(%rdi), %xmm0
+; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpalignr {{.*#+}} xmm9 = xmm14[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
+; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovdqa 1760(%rdi), %xmm7
+; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpalignr {{.*#+}} xmm7 = xmm15[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7
+; AVX-NEXT: vmovdqa 1648(%rdi), %xmm9
+; AVX-NEXT: vmovdqa %xmm9, (%rsp) # 16-byte Spill
+; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7]
+; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovdqa 1536(%rdi), %xmm9
+; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX-NEXT: vmovdqa 1424(%rdi), %xmm7
+; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm4[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovdqa 1088(%rdi), %xmm0
+; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm6[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2
+; AVX-NEXT: vmovdqa 976(%rdi), %xmm1
+; AVX-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovdqa 640(%rdi), %xmm2
+; AVX-NEXT: vpalignr {{.*#+}} xmm3 = xmm10[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
+; AVX-NEXT: vmovdqa 528(%rdi), %xmm4
+; AVX-NEXT: vpalignr {{.*#+}} xmm10 = xmm11[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3],ymm3[4,5,6,7]
+; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovdqa 192(%rdi), %xmm3
+; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm10 # 16-byte Folded Reload
+; AVX-NEXT: # xmm10 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10
+; AVX-NEXT: vmovdqa 80(%rdi), %xmm14
+; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm11 # 16-byte Folded Reload
+; AVX-NEXT: # xmm11 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
+; AVX-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
+; AVX-NEXT: vmovapd 128(%rdi), %ymm11
+; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm11[0,1,2],ymm3[3]
+; AVX-NEXT: vpblendw {{.*#+}} xmm10 = mem[0,1,2,3],xmm14[4,5,6,7]
+; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm10[0,1],ymm3[2,3]
; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm3
-; AVX-NEXT: vmovapd 352(%rdi), %ymm7
-; AVX-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1,2],ymm3[3]
-; AVX-NEXT: vmovapd 256(%rdi), %xmm3
-; AVX-NEXT: vblendpd {{.*#+}} xmm8 = xmm3[0],xmm9[1]
-; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3]
-; AVX-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
-; AVX-NEXT: vmovapd 576(%rdi), %ymm7
-; AVX-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3]
-; AVX-NEXT: vmovapd 480(%rdi), %xmm7
-; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm7[0],xmm0[1]
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3]
+; AVX-NEXT: vmovapd 352(%rdi), %ymm10
+; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm10[0,1,2],ymm3[3]
+; AVX-NEXT: vmovdqa 256(%rdi), %xmm6
+; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm6[0,1,2,3],xmm12[4,5,6,7]
+; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm12[0,1],ymm3[2,3]
+; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
+; AVX-NEXT: vmovaps 576(%rdi), %ymm3
+; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm2[6,7]
+; AVX-NEXT: vmovdqa 480(%rdi), %xmm2
+; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0,1,2,3],xmm4[4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm3
+; AVX-NEXT: vmovapd 800(%rdi), %ymm8
+; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm8[0,1,2],ymm3[3]
+; AVX-NEXT: vmovapd 704(%rdi), %xmm15
+; AVX-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm4 # 16-byte Folded Reload
+; AVX-NEXT: # xmm4 = xmm15[0],mem[1]
+; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3]
+; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX-NEXT: vmovaps 1024(%rdi), %ymm3
+; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
+; AVX-NEXT: vmovdqa 928(%rdi), %xmm13
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0,1,2,3],xmm1[4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
+; AVX-NEXT: vmovapd 1248(%rdi), %ymm14
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3]
+; AVX-NEXT: vmovapd 1152(%rdi), %xmm6
+; AVX-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm1 # 16-byte Folded Reload
+; AVX-NEXT: # xmm1 = xmm6[0],mem[1]
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm4
-; AVX-NEXT: vmovapd 800(%rdi), %ymm0
-; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm0[0,1,2],ymm4[3]
-; AVX-NEXT: vmovapd 704(%rdi), %xmm14
-; AVX-NEXT: vblendpd {{.*#+}} xmm5 = xmm14[0],xmm5[1]
-; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3]
-; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX-NEXT: vmovapd 1024(%rdi), %ymm4
-; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3]
-; AVX-NEXT: vmovapd 928(%rdi), %xmm9
-; AVX-NEXT: vblendpd {{.*#+}} xmm4 = xmm9[0],xmm15[1]
-; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3]
+; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0
+; AVX-NEXT: vmovaps 1472(%rdi), %ymm1
+; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX-NEXT: vmovdqa 1376(%rdi), %xmm0
+; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1,2,3],xmm7[4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
+; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload
+; AVX-NEXT: vmovapd 1696(%rdi), %ymm5
+; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3]
+; AVX-NEXT: vmovapd 1600(%rdi), %xmm7
+; AVX-NEXT: vblendpd $2, (%rsp), %xmm7, %xmm3 # 16-byte Folded Reload
+; AVX-NEXT: # xmm3 = xmm7[0],mem[1]
+; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3]
; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm4
-; AVX-NEXT: vmovapd 1248(%rdi), %ymm1
-; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm1[0,1,2],ymm4[3]
-; AVX-NEXT: vmovapd 1152(%rdi), %xmm15
-; AVX-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm5 # 16-byte Folded Reload
-; AVX-NEXT: # xmm5 = xmm15[0],mem[1]
-; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3]
-; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm4
-; AVX-NEXT: vmovaps 1472(%rdi), %ymm5
-; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
-; AVX-NEXT: vmovaps 1376(%rdi), %xmm8
-; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm5 # 16-byte Folded Reload
-; AVX-NEXT: # xmm5 = xmm8[0,1],mem[2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
-; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm4 # 16-byte Folded Reload
-; AVX-NEXT: vmovapd 1696(%rdi), %ymm12
-; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm12[0,1,2],ymm4[3]
-; AVX-NEXT: vmovapd 1600(%rdi), %xmm13
-; AVX-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm5 # 16-byte Folded Reload
-; AVX-NEXT: # xmm5 = xmm13[0],mem[1]
-; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3]
-; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 192(%rdi), %ymm11
-; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm11[1],ymm2[3],ymm11[2]
-; AVX-NEXT: vmovdqa 96(%rdi), %xmm4
-; AVX-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
-; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3]
-; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 416(%rdi), %ymm10
+; AVX-NEXT: vmovapd 192(%rdi), %ymm9
+; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm11[0],ymm9[1],ymm11[3],ymm9[2]
+; AVX-NEXT: vmovdqa 96(%rdi), %xmm3
+; AVX-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3]
+; AVX-NEXT: vmovupd %ymm1, (%rsp) # 32-byte Spill
+; AVX-NEXT: vmovapd 416(%rdi), %ymm11
+; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm10[0],ymm11[1],ymm10[3],ymm11[2]
+; AVX-NEXT: vmovdqa 320(%rdi), %xmm12
+; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm3 # 16-byte Folded Reload
+; AVX-NEXT: # xmm3 = mem[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3]
+; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovdqa 544(%rdi), %xmm1
+; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm2[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vmovapd 640(%rdi), %ymm4
; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm10[1],ymm2[3],ymm10[2]
-; AVX-NEXT: vmovdqa 320(%rdi), %xmm6
-; AVX-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
-; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3]
-; AVX-NEXT: vmovupd %ymm2, (%rsp) # 32-byte Spill
-; AVX-NEXT: vmovdqa 544(%rdi), %xmm2
-; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm7[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
-; AVX-NEXT: vmovapd 640(%rdi), %ymm5
-; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[3],ymm5[2]
-; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3]
-; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 864(%rdi), %ymm7
-; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[3],ymm7[2]
-; AVX-NEXT: vmovdqa 768(%rdi), %xmm4
-; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm14[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3]
-; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovdqa 992(%rdi), %xmm0
-; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm9[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[3],ymm4[2]
+; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3]
+; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovapd 864(%rdi), %ymm10
+; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm8[0],ymm10[1],ymm8[3],ymm10[2]
+; AVX-NEXT: vmovdqa 768(%rdi), %xmm8
+; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm15[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3]
+; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovdqa 992(%rdi), %xmm1
+; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm13[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
; AVX-NEXT: vmovapd 1088(%rdi), %ymm3
; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[3],ymm3[2]
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3]
-; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 1312(%rdi), %ymm9
-; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm9[1],ymm1[3],ymm9[2]
-; AVX-NEXT: vmovdqa 1216(%rdi), %xmm2
-; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm15[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
-; AVX-NEXT: vblendpd {{.*#+}} ymm15 = ymm1[0,1],ymm0[2,3]
-; AVX-NEXT: vmovdqa 1440(%rdi), %xmm0
-; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
-; AVX-NEXT: vmovapd 1536(%rdi), %ymm8
-; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[3],ymm8[2]
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
-; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 1760(%rdi), %ymm1
-; AVX-NEXT: vshufpd {{.*#+}} ymm12 = ymm12[0],ymm1[1],ymm12[3],ymm1[2]
-; AVX-NEXT: vmovdqa 1664(%rdi), %xmm0
-; AVX-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
-; AVX-NEXT: vblendpd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3]
-; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
-; AVX-NEXT: # ymm11 = mem[0,1,2],ymm11[3]
-; AVX-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX-NEXT: vblendpd {{.*#+}} xmm13 = xmm13[0],mem[1]
-; AVX-NEXT: vblendpd {{.*#+}} ymm14 = ymm13[0,1],ymm11[2,3]
-; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
-; AVX-NEXT: # ymm10 = mem[0,1,2],ymm10[3]
-; AVX-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
-; AVX-NEXT: # xmm6 = mem[0,1,2,3],xmm6[4,5,6,7]
-; AVX-NEXT: vblendpd {{.*#+}} ymm11 = ymm6[0,1],ymm10[2,3]
-; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
-; AVX-NEXT: # ymm5 = mem[0,1,2],ymm5[3]
-; AVX-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX-NEXT: vblendpd {{.*#+}} xmm6 = xmm6[0],mem[1]
-; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm5[2,3]
-; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm5 # 32-byte Folded Reload
-; AVX-NEXT: # ymm5 = mem[0,1,2],ymm7[3]
-; AVX-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
-; AVX-NEXT: # xmm4 = mem[0,1,2,3],xmm4[4,5,6,7]
-; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm4[0,1],ymm5[2,3]
+; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3]
+; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovapd 1312(%rdi), %ymm2
+; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm14[0],ymm2[1],ymm14[3],ymm2[2]
+; AVX-NEXT: vmovdqa 1216(%rdi), %xmm14
+; AVX-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vblendpd {{.*#+}} ymm13 = ymm6[0,1],ymm1[2,3]
+; AVX-NEXT: vmovdqa 1440(%rdi), %xmm1
+; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vmovapd 1536(%rdi), %ymm1
+; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm1[1],ymm6[3],ymm1[2]
+; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm0[0,1],ymm6[2,3]
+; AVX-NEXT: vmovapd 1760(%rdi), %ymm0
+; AVX-NEXT: vshufpd {{.*#+}} ymm15 = ymm5[0],ymm0[1],ymm5[3],ymm0[2]
+; AVX-NEXT: vmovdqa 1664(%rdi), %xmm5
+; AVX-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm15[2,3]
+; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
+; AVX-NEXT: # ymm9 = mem[0,1,2],ymm9[3]
+; AVX-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; AVX-NEXT: vblendpd {{.*#+}} xmm15 = xmm15[0],mem[1]
+; AVX-NEXT: vblendpd {{.*#+}} ymm15 = ymm15[0,1],ymm9[2,3]
+; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm9 # 32-byte Folded Reload
+; AVX-NEXT: # ymm9 = mem[0,1,2],ymm11[3]
+; AVX-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm11 # 16-byte Folded Reload
+; AVX-NEXT: # xmm11 = mem[0,1,2,3],xmm12[4,5,6,7]
+; AVX-NEXT: vblendpd {{.*#+}} ymm12 = ymm11[0,1],ymm9[2,3]
+; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
+; AVX-NEXT: # ymm4 = mem[0,1,2],ymm4[3]
+; AVX-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; AVX-NEXT: vblendpd {{.*#+}} xmm9 = xmm9[0],mem[1]
+; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm9[0,1],ymm4[2,3]
+; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm9 # 32-byte Folded Reload
+; AVX-NEXT: # ymm9 = mem[0,1,2],ymm10[3]
+; AVX-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
+; AVX-NEXT: # xmm8 = mem[0,1,2,3],xmm8[4,5,6,7]
+; AVX-NEXT: vblendpd {{.*#+}} ymm11 = ymm8[0,1],ymm9[2,3]
; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
; AVX-NEXT: # ymm3 = mem[0,1,2],ymm3[3]
-; AVX-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX-NEXT: vblendpd {{.*#+}} xmm4 = xmm4[0],mem[1]
-; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3]
-; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload
-; AVX-NEXT: # ymm4 = mem[0,1,2],ymm9[3]
-; AVX-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
-; AVX-NEXT: # xmm2 = mem[0,1,2,3],xmm2[4,5,6,7]
-; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm2[0,1],ymm4[2,3]
-; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm2 # 32-byte Folded Reload
-; AVX-NEXT: # ymm2 = mem[0,1,2],ymm8[3]
-; AVX-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX-NEXT: vblendpd {{.*#+}} xmm4 = xmm4[0],mem[1]
-; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm2[2,3]
+; AVX-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX-NEXT: vblendpd {{.*#+}} xmm8 = xmm8[0],mem[1]
+; AVX-NEXT: vblendpd {{.*#+}} ymm10 = ymm8[0,1],ymm3[2,3]
+; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
+; AVX-NEXT: # ymm2 = mem[0,1,2],ymm2[3]
+; AVX-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm3 # 16-byte Folded Reload
+; AVX-NEXT: # xmm3 = mem[0,1,2,3],xmm14[4,5,6,7]
+; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3]
; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX-NEXT: # ymm1 = mem[0,1,2],ymm1[3]
-; AVX-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX-NEXT: # xmm0 = mem[0,1,2,3],xmm0[4,5,6,7]
-; AVX-NEXT: vblendpd {{.*#+}} ymm10 = ymm0[0,1],ymm1[2,3]
+; AVX-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX-NEXT: vblendpd {{.*#+}} xmm3 = xmm3[0],mem[1]
+; AVX-NEXT: vblendpd {{.*#+}} ymm8 = ymm3[0,1],ymm1[2,3]
+; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX-NEXT: # ymm0 = mem[0,1,2],ymm0[3]
+; AVX-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload
+; AVX-NEXT: # xmm1 = mem[0,1,2,3],xmm5[4,5,6,7]
+; AVX-NEXT: vblendpd {{.*#+}} ymm9 = ymm1[0,1],ymm0[2,3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm0, 192(%rsi)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -6262,29 +6302,28 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm0, (%r9)
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: vmovapd %ymm12, 224(%rax)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm0, 192(%rax)
-; AVX-NEXT: vmovapd %ymm15, 160(%rax)
+; AVX-NEXT: vmovapd %ymm7, 224(%rax)
+; AVX-NEXT: vmovapd %ymm6, 192(%rax)
+; AVX-NEXT: vmovapd %ymm13, 160(%rax)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm0, 128(%rax)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm0, 96(%rax)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm0, 64(%rax)
-; AVX-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm0, 32(%rax)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm0, 32(%rax)
+; AVX-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm0, (%rax)
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: vmovapd %ymm10, 224(%rax)
-; AVX-NEXT: vmovapd %ymm4, 192(%rax)
-; AVX-NEXT: vmovapd %ymm7, 160(%rax)
-; AVX-NEXT: vmovapd %ymm3, 128(%rax)
-; AVX-NEXT: vmovapd %ymm5, 96(%rax)
-; AVX-NEXT: vmovapd %ymm6, 64(%rax)
-; AVX-NEXT: vmovapd %ymm11, 32(%rax)
-; AVX-NEXT: vmovapd %ymm14, (%rax)
+; AVX-NEXT: vmovapd %ymm9, 224(%rax)
+; AVX-NEXT: vmovapd %ymm8, 192(%rax)
+; AVX-NEXT: vmovapd %ymm2, 160(%rax)
+; AVX-NEXT: vmovapd %ymm10, 128(%rax)
+; AVX-NEXT: vmovapd %ymm11, 96(%rax)
+; AVX-NEXT: vmovapd %ymm4, 64(%rax)
+; AVX-NEXT: vmovapd %ymm12, 32(%rax)
+; AVX-NEXT: vmovapd %ymm15, (%rax)
; AVX-NEXT: addq $1736, %rsp # imm = 0x6C8
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
@@ -8254,465 +8293,449 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
;
; AVX512-FCP-LABEL: load_i64_stride7_vf32:
; AVX512-FCP: # %bb.0:
-; AVX512-FCP-NEXT: subq $2728, %rsp # imm = 0xAA8
-; AVX512-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm25
-; AVX512-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm1
-; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm31
-; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm12
-; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm2
-; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm20
-; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm14
-; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm7
-; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm26
+; AVX512-FCP-NEXT: subq $3048, %rsp # imm = 0xBE8
+; AVX512-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm18
+; AVX512-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm17
+; AVX512-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm21
+; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm11
+; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm9
+; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm8
+; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm0
+; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm10
+; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm22
+; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm12
+; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm28
; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm19
-; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm23
-; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm6
-; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,7,14,0,0,7,14,0]
-; AVX512-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm3
-; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm13
-; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm30, %zmm3
-; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3]
-; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm25
+; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm15
+; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,7,14,0,0,7,14,0]
+; AVX512-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [2,5]
+; AVX512-FCP-NEXT: vmovdqa 960(%rdi), %ymm1
+; AVX512-FCP-NEXT: vmovdqa 896(%rdi), %ymm2
+; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm4 = ymm2[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpermt2q %ymm1, %ymm3, %ymm2
+; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vmovdqa 512(%rdi), %ymm1
+; AVX512-FCP-NEXT: vmovdqa 448(%rdi), %ymm2
+; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm4 = ymm2[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpermt2q %ymm1, %ymm3, %ymm2
+; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm2
+; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm13
+; AVX512-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm4 = ymm1[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
+; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpermt2q %ymm2, %ymm3, %ymm1
+; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vmovdqa 1408(%rdi), %ymm1
+; AVX512-FCP-NEXT: vmovdqa 1344(%rdi), %ymm2
+; AVX512-FCP-NEXT: vpermi2q %ymm1, %ymm2, %ymm3
+; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm1
+; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm26, %zmm1
+; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13]
+; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm4, %zmm1
+; AVX512-FCP-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm3
+; AVX512-FCP-NEXT: vmovdqa 1024(%rdi), %ymm2
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2,3,4,5],ymm3[6,7]
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [4,11]
+; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm6
+; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm3, %zmm6
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm1, %zmm1
+; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm5
+; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm26, %zmm5
+; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm4, %zmm5
+; AVX512-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm6
+; AVX512-FCP-NEXT: vmovdqa 576(%rdi), %ymm1
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5],ymm6[6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm7
+; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm3, %zmm7
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm5
+; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm5
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5],ymm5[6,7]
+; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm6
+; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm3, %zmm6
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm6
-; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm8, %zmm6
+; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm26, %zmm6
+; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm23
+; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm4, %zmm6
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5
+; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm26, %zmm15
+; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm5
+; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm26, %zmm5
+; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm5
+; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm6
+; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm26, %zmm6
; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm3
-; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm30, %zmm3
-; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm3
-; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm8, %zmm3
-; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6]
-; AVX512-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [9,0,7,0,9,0,7,0]
-; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [11,4,11,4,11,4,11,4]
-; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm3
-; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm15, %zmm3
-; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm3
-; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm15, %zmm3
-; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5]
-; AVX512-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm3
-; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm22, %zmm3
-; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm3
-; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm22, %zmm3
-; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm3
-; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm3
-; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm3
-; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm3
-; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm3
-; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm30, %zmm3
-; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm3
-; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm30, %zmm3
+; AVX512-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm7
+; AVX512-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm9
+; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm6
+; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm26, %zmm6
+; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm8
+; AVX512-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm29
+; AVX512-FCP-NEXT: vpermi2q %zmm8, %zmm29, %zmm26
+; AVX512-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm16
+; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm4, %zmm26
+; AVX512-FCP-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm4
+; AVX512-FCP-NEXT: vmovdqa 1472(%rdi), %ymm6
+; AVX512-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7]
+; AVX512-FCP-NEXT: vpermi2q %zmm7, %zmm9, %zmm3
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm26, %zmm3
; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3
-; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm9, %zmm3
+; AVX512-FCP-NEXT: vmovdqa 1088(%rdi), %ymm3
+; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23]
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm4 = [5,12]
+; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm3
+; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm6
+; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm4, %zmm3
+; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm5
+; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm7
+; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [9,0,7,0,9,0,7,0]
+; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm14
+; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm4
+; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm15, %zmm4
+; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14]
+; AVX512-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,2,9,2,9,2,9]
-; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm20
-; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm21
-; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm9, %zmm21
-; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm19
-; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm3
-; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm2
-; AVX512-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm30, %zmm2
-; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm12
-; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm8, %zmm12
-; AVX512-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm28
-; AVX512-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm2
-; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm5
-; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm7
+; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm3, %zmm4
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm30, %zmm5
-; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm2
-; AVX512-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm10
-; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm10, %zmm8
-; AVX512-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm18
-; AVX512-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm6
-; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm5
-; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm11, %zmm6
-; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm9, %zmm31
-; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4
-; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm6
-; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm15, %zmm4
-; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm17
-; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm15
-; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm4
-; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm11, %zmm4
-; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm26
-; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm26 # 64-byte Folded Reload
-; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm16
-; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4
-; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm11, %zmm4
-; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm9, %zmm23
-; AVX512-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm11
-; AVX512-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm4
-; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm5, %zmm4
-; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm27
-; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm9, %zmm27
-; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm10, %zmm6
-; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm29
-; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm3
-; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm22, %zmm29
-; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm10, %zmm22
-; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm14
-; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm22
-; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm5, %zmm22
-; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm10, %zmm5
-; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm13
-; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm25
-; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm30, %zmm25
-; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm10, %zmm30
-; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm24
-; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm24
-; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4
-; AVX512-FCP-NEXT: vpermi2q %zmm10, %zmm2, %zmm9
-; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm7
-; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm10
-; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm4
-; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm0
-; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,9,0,5,6,9]
-; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
-; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm3
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
-; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm5
-; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm12
+; AVX512-FCP-NEXT: vmovdqa 640(%rdi), %ymm2
+; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
+; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm2
; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm9
-; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm8
-; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,10,0,5,6,10]
-; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm17
-; AVX512-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
-; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
-; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm6
-; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,11,0,5,6,11]
-; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm29
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
-; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
+; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm5, %zmm2
+; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm26
+; AVX512-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill
+; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [10,3,10,3,10,3,10,3]
+; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm20
+; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm20
+; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [11,4,11,4,11,4,11,4]
+; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm17
+; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm17
+; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [12,5,12,5,12,5,12,5]
+; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm1
+; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm1
; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm14
-; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,12,0,5,6,12]
-; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm22
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [13,6,13,6,13,6,13,6]
+; AVX512-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm1
+; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm1
; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
+; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1
+; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm15, %zmm1
; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm13
+; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9]
+; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm10
+; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm13
+; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm4
+; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm5, %zmm4
+; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm0
+; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm2, %zmm0
+; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm11
+; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm9, %zmm11
+; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm18
+; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm31, %zmm18
+; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm0
+; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm15, %zmm0
+; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm1, %zmm13
; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,6,13,4,5,6,13]
-; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm25
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm8
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm14
-; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm30
-; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,6,14,4,5,6,14]
-; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm24
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
-; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm21
-; AVX512-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm7
-; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,8,15,4,5,8,15]
-; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm20
-; AVX512-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm19
-; AVX512-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm10
-; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm4
+; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm28
+; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm0
+; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm5, %zmm28
+; AVX512-FCP-NEXT: vpermi2q %zmm8, %zmm29, %zmm5
+; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm24
+; AVX512-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm13
+; AVX512-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm5
+; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm31, %zmm5
+; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm5
+; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm15, %zmm5
+; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm5
+; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm2, %zmm5
+; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm19
+; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm27
+; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm10
+; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm5
+; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm31, %zmm5
+; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm15, %zmm26
+; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5
+; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm12
+; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm6
+; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm31, %zmm6
+; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm25 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm6
+; AVX512-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm7
+; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm22
+; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm31, %zmm22
+; AVX512-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm30 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: vpermi2q %zmm8, %zmm29, %zmm2
+; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm14
+; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm14
+; AVX512-FCP-NEXT: vpermi2q %zmm8, %zmm29, %zmm9
+; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm22
+; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm3
+; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm3
+; AVX512-FCP-NEXT: vpermi2q %zmm8, %zmm29, %zmm31
+; AVX512-FCP-NEXT: vpermi2q %zmm29, %zmm8, %zmm15
+; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm29
+; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm21
+; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,5,6,9,0,5,6,9]
+; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm9, %zmm4
; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: movb $24, %al
-; AVX512-FCP-NEXT: kmovw %eax, %k1
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm20
+; AVX512-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[4,5,4,5],zmm15[4,5,4,5]
-; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
-; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2
-; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm2
-; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [7,0,9,0,7,0,9,0]
-; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm20
-; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm20
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm21 = [0,0,4,11]
-; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm21, %zmm1
-; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm28
+; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm9, %zmm24
+; AVX512-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,5,6,10,0,5,6,10]
+; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm19
+; AVX512-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm17
+; AVX512-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm9, %zmm8
+; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm9, %zmm2
+; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,5,6,11,0,5,6,11]
+; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm14
+; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm2
+; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm9, %zmm11
+; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm9, %zmm22
+; AVX512-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,5,6,12,0,5,6,12]
+; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm3
+; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm3[4,5,4,5],zmm16[4,5,4,5]
-; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm15
-; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm15
-; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm19
-; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm19
-; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm21, %zmm3
+; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm3
; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm9, %zmm18
+; AVX512-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm9, %zmm31
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm4
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm3, %zmm11
+; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm3, %zmm15
+; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,5,8,15,4,5,8,15]
+; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm17
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm9, %zmm18
+; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm9, %zmm29
+; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm21
+; AVX512-FCP-NEXT: movb $24, %al
+; AVX512-FCP-NEXT: kmovw %eax, %k1
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm10[4,5,4,5],zmm27[4,5,4,5]
+; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [14,0,0,7,14,0,0,7]
+; AVX512-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm23
+; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm16, %zmm23
+; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [7,0,9,0,7,0,9,0]
+; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm0
+; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm9, %zmm0
+; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,0,4,11]
+; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm10
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm12[4,5,4,5],zmm5[4,5,4,5]
+; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm22
+; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm16, %zmm22
+; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm24
+; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm9, %zmm24
+; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm12
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm3[4,5,4,5],zmm11[4,5,4,5]
+; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm7[4,5,4,5],zmm6[4,5,4,5]
; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm16
-; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm16
-; AVX512-FCP-NEXT: vpermi2q %zmm11, %zmm3, %zmm0
-; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm17
-; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm17
-; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm11, %zmm4
-; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm21, %zmm3
-; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k1} = zmm1[4,5,4,5],zmm18[4,5,4,5]
-; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm21, %zmm1
-; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa 912(%rdi), %xmm11
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3]
-; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm11, %zmm16, %zmm1
-; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa 464(%rdi), %xmm11
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3]
-; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm11, %zmm2, %zmm1
+; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm27
+; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm16, %zmm27
+; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm7, %zmm16
+; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm20
+; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm20
+; AVX512-FCP-NEXT: vpermi2q %zmm7, %zmm6, %zmm9
+; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm7
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k1} = zmm1[4,5,4,5],zmm13[4,5,4,5]
+; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm1
; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm2
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm15, %zmm21
-; AVX512-FCP-NEXT: vmovdqa 1360(%rdi), %xmm2
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm0, %zmm16
-; AVX512-FCP-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0
-; AVX512-FCP-NEXT: vmovdqa 1024(%rdi), %ymm7
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,11]
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm12
+; AVX512-FCP-NEXT: vinserti64x4 $0, (%rsp), %zmm4, %zmm0 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm0
+; AVX512-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm5 = [6,13]
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm1
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm12
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm1
-; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm11
-; AVX512-FCP-NEXT: vmovdqa 576(%rdi), %ymm1
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm11[6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm25
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm4 = [5,12]
+; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm4, %zmm1
+; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm5, %zmm6
+; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm13
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm15
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm15
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm15[0,1,2,3],ymm11[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm8, %zmm1
-; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm11
-; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm1
-; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm11[6,7]
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm1
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm14, %zmm1
-; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm1
-; AVX512-FCP-NEXT: vmovdqa 1472(%rdi), %ymm11
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermi2q %zmm28, %zmm8, %zmm0
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0
+; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm6, %zmm4
+; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm6
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm2 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm8[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa 1088(%rdi), %ymm0
-; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm14 = ymm7[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [5,12]
-; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm1
-; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm7
-; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm7
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm30 = [6,13]
-; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm30, %zmm1
-; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm6
-; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm6
-; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm30, %zmm3
-; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa 960(%rdi), %ymm15
-; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15
-; AVX512-FCP-NEXT: vmovdqa 512(%rdi), %ymm12
-; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm12
-; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm2
-; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm2
-; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm3
-; AVX512-FCP-NEXT: vpermi2q %zmm28, %zmm8, %zmm0
-; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm30, %zmm3
-; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm30, %zmm5
-; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa 1536(%rdi), %ymm0
+; AVX512-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm15, %zmm0
+; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm10[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm14
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm12[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm15
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm7[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm5
+; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: # ymm1 = ymm2[0,1,2,3],mem[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm1
; AVX512-FCP-NEXT: movb $-32, %al
; AVX512-FCP-NEXT: kmovw %eax, %k2
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2}
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm30 {%k2}
-; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm13
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 {%k2}
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm18 {%k2}
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1}
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm31 {%k2}
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1}
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm26 {%k2}
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm23 {%k1}
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2}
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1}
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2}
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm5 {%k2}
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2}
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2}
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm21 {%k2}
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm16 {%k2}
-; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm15, %zmm17, %zmm17
-; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm17 {%k2}
-; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm12, %zmm20, %zmm12
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 {%k2}
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm14[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm14
-; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14
-; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm14, %zmm19, %zmm9
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2}
+; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm0
+; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 {%k2}
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 {%k2}
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k2}
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm26 {%k1}
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm26 {%k2}
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm25 {%k1}
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm9 {%k2}
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm1
-; AVX512-FCP-NEXT: vmovdqa 640(%rdi), %ymm14
-; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm2
-; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm14 = ymm2[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23]
-; AVX512-FCP-NEXT: vmovdqa 1408(%rdi), %ymm15
-; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15
-; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm15, %zmm4, %zmm4
+; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm25 {%k2}
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1}
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm30 {%k2}
+; AVX512-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm7 # 16-byte Folded Reload
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 {%k2}
+; AVX512-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm8 # 16-byte Folded Reload
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 {%k2}
+; AVX512-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm4 # 16-byte Folded Reload
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 {%k2}
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm14[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm10 # 16-byte Folded Reload
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm3
-; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm14
-; AVX512-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm14[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm2
-; AVX512-FCP-NEXT: vmovdqa 1536(%rdi), %ymm14
-; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0
-; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm11 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm11 = ymm6[0,1,2,3],mem[4,5,6,7]
-; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512-FCP-NEXT: vinsertf64x4 $0, %ymm11, %zmm6, %zmm11
-; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm8 = ymm6[0,1,2,3],mem[4,5,6,7]
-; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512-FCP-NEXT: vinsertf64x4 $0, %ymm8, %zmm6, %zmm8
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm7 = ymm6[0,1,2,3],mem[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm10, %zmm6
-; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7]
-; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512-FCP-NEXT: vinsertf64x4 $0, %ymm7, %zmm10, %zmm7
-; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 192(%rsi)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 128(%rsi)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm30, 64(%rsi)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm28, (%rsi)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm27, 192(%rdx)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm23, (%rdx)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 {%k2}
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13
+; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm13, %zmm20, %zmm11
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 {%k2}
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm13, %zmm2, %zmm13
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm13 {%k2}
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX512-FCP-NEXT: vextracti32x4 $1, %ymm12, %xmm16
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX512-FCP-NEXT: vextracti32x4 $1, %ymm12, %xmm17
+; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm16, %zmm24, %zmm16
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm16 {%k2}
+; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm17, %zmm9, %zmm9
+; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm9 {%k2}
+; AVX512-FCP-NEXT: vmovdqa64 %zmm28, 192(%rsi)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 128(%rsi)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm29, 64(%rsi)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm19, (%rsi)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm30, 192(%rdx)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm25, (%rdx)
; AVX512-FCP-NEXT: vmovdqa64 %zmm26, 64(%rdx)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm31, 128(%rdx)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 192(%rcx)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm21, (%rcx)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm29, 64(%rcx)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 128(%rcx)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 192(%r8)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm9, (%r8)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 64(%r8)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 128(%r8)
-; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovaps %zmm4, 192(%r9)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 128(%rdx)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 192(%rcx)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rcx)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 64(%rcx)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 128(%rcx)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 192(%r8)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm16, (%r8)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 64(%r8)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 128(%r8)
+; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovaps %zmm0, 192(%r9)
; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
; AVX512-FCP-NEXT: vmovaps %zmm4, (%r9)
; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
@@ -8720,16 +8743,20 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
; AVX512-FCP-NEXT: vmovaps %zmm4, 128(%r9)
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 192(%rax)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rax)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 64(%rax)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 128(%rax)
+; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovaps %zmm0, 192(%rax)
+; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovaps %zmm0, (%rax)
+; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovaps %zmm0, 64(%rax)
+; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovaps %zmm0, 128(%rax)
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT: vmovaps %zmm7, 128(%rax)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 192(%rax)
-; AVX512-FCP-NEXT: vmovaps %zmm8, (%rax)
-; AVX512-FCP-NEXT: vmovaps %zmm11, 64(%rax)
-; AVX512-FCP-NEXT: addq $2728, %rsp # imm = 0xAA8
+; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 128(%rax)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 192(%rax)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm15, (%rax)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 64(%rax)
+; AVX512-FCP-NEXT: addq $3048, %rsp # imm = 0xBE8
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
@@ -9216,465 +9243,449 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
;
; AVX512DQ-FCP-LABEL: load_i64_stride7_vf32:
; AVX512DQ-FCP: # %bb.0:
-; AVX512DQ-FCP-NEXT: subq $2728, %rsp # imm = 0xAA8
-; AVX512DQ-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm25
-; AVX512DQ-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm31
-; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm12
-; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm2
-; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm20
-; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm14
-; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm7
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm26
+; AVX512DQ-FCP-NEXT: subq $3048, %rsp # imm = 0xBE8
+; AVX512DQ-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm18
+; AVX512DQ-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm17
+; AVX512DQ-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm21
+; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm11
+; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm9
+; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm8
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm10
+; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm22
+; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm12
+; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm28
; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm19
-; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm23
-; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm6
-; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,7,14,0,0,7,14,0]
-; AVX512DQ-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm3
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm13
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm30, %zmm3
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3]
-; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm6
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm8, %zmm6
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm3
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm30, %zmm3
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm3
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm8, %zmm3
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6]
-; AVX512DQ-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [9,0,7,0,9,0,7,0]
-; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [11,4,11,4,11,4,11,4]
-; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm3
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm15, %zmm3
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm3
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm15, %zmm3
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5]
-; AVX512DQ-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm3
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm22, %zmm3
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm3
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm22, %zmm3
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm3
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm3
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm3
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm3
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm3
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm30, %zmm3
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm3
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm30, %zmm3
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm9, %zmm3
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,2,9,2,9,2,9]
-; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm20
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm21
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm9, %zmm21
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm19
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm3
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm2
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm30, %zmm2
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm12
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm8, %zmm12
-; AVX512DQ-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm28
-; AVX512DQ-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm2
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm5
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm7
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm30, %zmm5
+; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm25
+; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm15
+; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm26 = [0,7,14,0,0,7,14,0]
+; AVX512DQ-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [2,5]
+; AVX512DQ-FCP-NEXT: vmovdqa 960(%rdi), %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqa 896(%rdi), %ymm2
+; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm4 = ymm2[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpermt2q %ymm1, %ymm3, %ymm2
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa 512(%rdi), %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqa 448(%rdi), %ymm2
+; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm4 = ymm2[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpermt2q %ymm1, %ymm3, %ymm2
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm2
+; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm13
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm4 = ymm1[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpermt2q %ymm2, %ymm3, %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa 1408(%rdi), %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqa 1344(%rdi), %ymm2
+; AVX512DQ-FCP-NEXT: vpermi2q %ymm1, %ymm2, %ymm3
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm1
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm26, %zmm1
+; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13]
+; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm4, %zmm1
+; AVX512DQ-FCP-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm3
+; AVX512DQ-FCP-NEXT: vmovdqa 1024(%rdi), %ymm2
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2,3,4,5],ymm3[6,7]
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [4,11]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm6
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm3, %zmm6
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm1, %zmm1
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm5
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm26, %zmm5
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm4, %zmm5
+; AVX512DQ-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm6
+; AVX512DQ-FCP-NEXT: vmovdqa 576(%rdi), %ymm1
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5],ymm6[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm7
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm3, %zmm7
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm5, %zmm5
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm5
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5],ymm5[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm6
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm3, %zmm6
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm6
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm26, %zmm6
+; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm23
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm4, %zmm6
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm6, %zmm5
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm26, %zmm15
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm5
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm26, %zmm5
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm2
-; AVX512DQ-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm10
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm10, %zmm8
-; AVX512DQ-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm18
-; AVX512DQ-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm6
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm5
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm6
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm26, %zmm6
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm5
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm11, %zmm6
+; AVX512DQ-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm7
+; AVX512DQ-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm9
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm6
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm26, %zmm6
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm9, %zmm31
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm6
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm15, %zmm4
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm17
-; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm15
-; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm4
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm11, %zmm4
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm26
-; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm26 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm16
-; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm11, %zmm4
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm9, %zmm23
-; AVX512DQ-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm11
-; AVX512DQ-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm4
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm5, %zmm4
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm27
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm9, %zmm27
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm10, %zmm6
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm29
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm3
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm22, %zmm29
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm10, %zmm22
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm14
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm22
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm5, %zmm22
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm10, %zmm5
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm13
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm25
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm30, %zmm25
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm10, %zmm30
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm24
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm24
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm10, %zmm2, %zmm9
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm7
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm10
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm4
-; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm0
-; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,9,0,5,6,9]
-; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm3
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm5
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm12
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm8
+; AVX512DQ-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm29
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm8, %zmm29, %zmm26
+; AVX512DQ-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm16
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm4, %zmm26
+; AVX512DQ-FCP-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm4
+; AVX512DQ-FCP-NEXT: vmovdqa 1472(%rdi), %ymm6
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm7, %zmm9, %zmm3
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm26, %zmm3
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa 1088(%rdi), %ymm3
+; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23]
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm4 = [5,12]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm3
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm6
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm4, %zmm3
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm5
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm7
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [9,0,7,0,9,0,7,0]
+; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm14
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm4
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm15, %zmm4
+; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14]
+; AVX512DQ-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm3, %zmm4
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm2
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa 640(%rdi), %ymm2
+; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm2
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm9
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm8
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,10,0,5,6,10]
-; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm17
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm6
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,11,0,5,6,11]
-; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm29
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm5, %zmm2
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm26
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [10,3,10,3,10,3,10,3]
+; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm20
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm20
+; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [11,4,11,4,11,4,11,4]
+; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm17
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm17
+; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [12,5,12,5,12,5,12,5]
+; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm1
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm1
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm14
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,12,0,5,6,12]
-; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm22
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
+; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [13,6,13,6,13,6,13,6]
+; AVX512DQ-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm1
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm1
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm15, %zmm1
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm13
+; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9]
+; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm10
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm13
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm4
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm5, %zmm4
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm0
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm2, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm11
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm9, %zmm11
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm18
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm31, %zmm18
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm0
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm15, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm1, %zmm13
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,6,13,4,5,6,13]
-; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm25
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm8
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm14
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm30
-; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,6,14,4,5,6,14]
-; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm24
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm21
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm7
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,8,15,4,5,8,15]
-; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm20
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm19
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm10
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm4
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm28
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm0
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm5, %zmm28
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm8, %zmm29, %zmm5
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm24
+; AVX512DQ-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm13
+; AVX512DQ-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm5
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm31, %zmm5
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm5
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm15, %zmm5
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm5
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm2, %zmm5
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm19
+; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm27
+; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm10
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm5
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm31, %zmm5
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm15, %zmm26
+; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5
+; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm12
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm6
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm31, %zmm6
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm25 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm6
+; AVX512DQ-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm7
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm22
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm31, %zmm22
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm30 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm8, %zmm29, %zmm2
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm14
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm14
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm8, %zmm29, %zmm9
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm22
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm3
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm31, %zmm3
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm8, %zmm29, %zmm31
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm29, %zmm8, %zmm15
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm29
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm21
+; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,5,6,9,0,5,6,9]
+; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm9, %zmm4
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: movb $24, %al
-; AVX512DQ-FCP-NEXT: kmovw %eax, %k1
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm20
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[4,5,4,5],zmm15[4,5,4,5]
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
-; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm2
-; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [7,0,9,0,7,0,9,0]
-; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm20
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm20
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm21 = [0,0,4,11]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm21, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm28
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm9, %zmm24
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,5,6,10,0,5,6,10]
+; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm19
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm17
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm9, %zmm8
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm9, %zmm2
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,5,6,11,0,5,6,11]
+; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm14
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm2
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm9, %zmm11
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm9, %zmm22
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,5,6,12,0,5,6,12]
+; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm3
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm3[4,5,4,5],zmm16[4,5,4,5]
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm15
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm15
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm19
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm19
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm21, %zmm3
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm3
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm9, %zmm18
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm9, %zmm31
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm4
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm3, %zmm11
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm3, %zmm15
+; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,5,8,15,4,5,8,15]
+; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm17
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm9, %zmm18
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm9, %zmm29
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm21
+; AVX512DQ-FCP-NEXT: movb $24, %al
+; AVX512DQ-FCP-NEXT: kmovw %eax, %k1
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm10[4,5,4,5],zmm27[4,5,4,5]
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [14,0,0,7,14,0,0,7]
+; AVX512DQ-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm23
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm16, %zmm23
+; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [7,0,9,0,7,0,9,0]
+; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm0
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm9, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,0,4,11]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm10
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm12[4,5,4,5],zmm5[4,5,4,5]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm22
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm16, %zmm22
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm24
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm9, %zmm24
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm12
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm3[4,5,4,5],zmm11[4,5,4,5]
+; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm7[4,5,4,5],zmm6[4,5,4,5]
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm16
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm16
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm11, %zmm3, %zmm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm17
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm17
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm11, %zmm4
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm21, %zmm3
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k1} = zmm1[4,5,4,5],zmm18[4,5,4,5]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm21, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa 912(%rdi), %xmm11
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm11, %zmm16, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa 464(%rdi), %xmm11
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm11, %zmm2, %zmm1
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm27
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm16, %zmm27
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm7, %zmm16
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm20
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm20
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm7, %zmm6, %zmm9
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm7
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k1} = zmm1[4,5,4,5],zmm13[4,5,4,5]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm1
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm2
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm15, %zmm21
-; AVX512DQ-FCP-NEXT: vmovdqa 1360(%rdi), %xmm2
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm0, %zmm16
-; AVX512DQ-FCP-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa 1024(%rdi), %ymm7
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,11]
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm12
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, (%rsp), %zmm4, %zmm0 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm0
+; AVX512DQ-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # ymm8 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm5 = [6,13]
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm14 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm1
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm12
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm11
-; AVX512DQ-FCP-NEXT: vmovdqa 576(%rdi), %ymm1
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm11[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm25
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm4 = [5,12]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm4, %zmm1
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm5, %zmm6
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm13
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm15
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm15
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm15[0,1,2,3],ymm11[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm8, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm11
-; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm11[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm1
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm14, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqa 1472(%rdi), %ymm11
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm28, %zmm8, %zmm0
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm6, %zmm4
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm6
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm2 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa 1088(%rdi), %ymm0
-; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm14 = ymm7[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [5,12]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm7
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm7
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm30 = [6,13]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm30, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm6
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm6
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm30, %zmm3
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa 960(%rdi), %ymm15
-; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15
-; AVX512DQ-FCP-NEXT: vmovdqa 512(%rdi), %ymm12
-; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm12
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm2
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm2
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm3
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm28, %zmm8, %zmm0
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm30, %zmm3
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm30, %zmm5
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa 1536(%rdi), %ymm0
+; AVX512DQ-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm15, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm14
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm15
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm5
+; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # ymm1 = ymm2[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm1
; AVX512DQ-FCP-NEXT: movb $-32, %al
; AVX512DQ-FCP-NEXT: kmovw %eax, %k2
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2}
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm30 {%k2}
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm13
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 {%k2}
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm18 {%k2}
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm31 {%k2}
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm26 {%k2}
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm23 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2}
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2}
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm5 {%k2}
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2}
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2}
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm21 {%k2}
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm16 {%k2}
-; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm15, %zmm17, %zmm17
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm17 {%k2}
-; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm12, %zmm20, %zmm12
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 {%k2}
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm14[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm14
-; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14
-; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm14, %zmm19, %zmm9
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2}
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm0 {%k2}
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 {%k2}
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm9 {%k2}
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqa 640(%rdi), %ymm14
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm2
-; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm14 = ymm2[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23]
-; AVX512DQ-FCP-NEXT: vmovdqa 1408(%rdi), %ymm15
-; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15
-; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm15, %zmm4, %zmm4
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k2}
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm26 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm26 {%k2}
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm25 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm25 {%k2}
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm30 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm30 {%k2}
+; AVX512DQ-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm7 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm7 {%k2}
+; AVX512DQ-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm8 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm8 {%k2}
+; AVX512DQ-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm4 # 16-byte Folded Reload
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 {%k2}
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm14[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm10 # 16-byte Folded Reload
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm3
-; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm14
-; AVX512DQ-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm14[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm2
-; AVX512DQ-FCP-NEXT: vmovdqa 1536(%rdi), %ymm14
-; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0
-; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm11 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm11 = ymm6[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vinsertf64x4 $0, %ymm11, %zmm6, %zmm11
-; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm8 = ymm6[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vinsertf64x4 $0, %ymm8, %zmm6, %zmm8
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm7 = ymm6[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm10, %zmm6
-; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vinsertf64x4 $0, %ymm7, %zmm10, %zmm7
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 192(%rsi)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 128(%rsi)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, 64(%rsi)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, 192(%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, (%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm10 {%k2}
+; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13
+; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm13, %zmm20, %zmm11
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 {%k2}
+; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm13, %zmm2, %zmm13
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm13 {%k2}
+; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vextracti32x4 $1, %ymm12, %xmm16
+; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vextracti32x4 $1, %ymm12, %xmm17
+; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm16, %zmm24, %zmm16
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm16 {%k2}
+; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm17, %zmm9, %zmm9
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm9 {%k2}
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, 192(%rsi)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 128(%rsi)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, 64(%rsi)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, (%rsi)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, 192(%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, (%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, 64(%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, 128(%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 192(%rcx)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, (%rcx)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, 64(%rcx)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 128(%rcx)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 192(%r8)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, (%r8)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 64(%r8)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 128(%r8)
-; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovaps %zmm4, 192(%r9)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 128(%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 192(%rcx)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%rcx)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 64(%rcx)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 128(%rcx)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 192(%r8)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, (%r8)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 64(%r8)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 128(%r8)
+; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 192(%r9)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm4, (%r9)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
@@ -9682,16 +9693,20 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm4, 128(%r9)
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 192(%rax)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rax)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 64(%rax)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 128(%rax)
+; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 192(%rax)
+; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovaps %zmm0, (%rax)
+; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 64(%rax)
+; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 128(%rax)
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT: vmovaps %zmm7, 128(%rax)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 192(%rax)
-; AVX512DQ-FCP-NEXT: vmovaps %zmm8, (%rax)
-; AVX512DQ-FCP-NEXT: vmovaps %zmm11, 64(%rax)
-; AVX512DQ-FCP-NEXT: addq $2728, %rsp # imm = 0xAA8
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 128(%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 192(%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, (%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 64(%rax)
+; AVX512DQ-FCP-NEXT: addq $3048, %rsp # imm = 0xBE8
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
@@ -10170,474 +10185,481 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
;
; AVX512BW-FCP-LABEL: load_i64_stride7_vf32:
; AVX512BW-FCP: # %bb.0:
-; AVX512BW-FCP-NEXT: subq $2760, %rsp # imm = 0xAC8
-; AVX512BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm20
-; AVX512BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm2
-; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm29
-; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm4
-; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm1
-; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm30
-; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm28
-; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm12
+; AVX512BW-FCP-NEXT: subq $2632, %rsp # imm = 0xA48
+; AVX512BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm10
+; AVX512BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm22
+; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm12
+; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm2
+; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm0
+; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm1
+; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm30
+; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm5
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm26
-; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm17
-; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm6
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm16
+; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm15
; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm3
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [0,7,14,0,0,7,14,0]
+; AVX512BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm13, %zmm3
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,7,14,0,0,7,14,0]
-; AVX512BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm25, %zmm3
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3]
-; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm6
-; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm8, %zmm6
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm3
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm25, %zmm3
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm3
-; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm3
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6]
-; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [9,0,7,0,9,0,7,0]
-; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4]
-; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm3
-; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm3
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm3
-; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm7, %zmm3
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [12,5,12,5,12,5,12,5]
-; AVX512BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm3
-; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm3
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm3
-; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm23, %zmm3
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm3
-; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm3
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm3
-; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm3
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm3
-; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm3
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm3
-; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm25, %zmm3
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3
-; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm9, %zmm3
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,2,9,2,9,2,9]
-; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm30
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm10
-; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm9, %zmm10
-; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm17
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm25, %zmm1
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm6
-; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm8, %zmm6
-; AVX512BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm24
-; AVX512BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm15
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm3
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm25, %zmm3
+; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [10,3,10,3,10,3,10,3]
+; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm3
+; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm14, %zmm3
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm1
-; AVX512BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm16
-; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm8
-; AVX512BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm13
-; AVX512BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm5
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3
+; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm13, %zmm3
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm11
+; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm14, %zmm11
+; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [13,6,13,6,13,6,13,6]
+; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [9,0,7,0,9,0,7,0]
+; AVX512BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [11,4,11,4,11,4,11,4]
+; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm5
+; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm8, %zmm5
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm11, %zmm5
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm5
+; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm8, %zmm5
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm29
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4
-; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm7, %zmm4
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm14
-; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm18
-; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm4
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm11, %zmm4
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm9, %zmm28
-; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm19
-; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4
+; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [12,5,12,5,12,5,12,5]
+; AVX512BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm5
+; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm5
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm5
+; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm24, %zmm5
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm4
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm11, %zmm4
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm4
+; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm6, %zmm4
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm26 # 64-byte Folded Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm12
-; AVX512BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm4
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm13, %zmm4
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm11, %zmm4
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm4
+; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm13, %zmm4
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm22
-; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm9, %zmm22
-; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm7
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm29
-; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm23, %zmm29
-; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm23
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm31
-; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm11, %zmm31
-; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm11
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm15
-; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm25, %zmm15
-; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm25
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm21
-; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm9, %zmm21
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4
-; AVX512BW-FCP-NEXT: vpermi2q %zmm16, %zmm1, %zmm9
-; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm16
-; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm4
-; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm0
-; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,9,0,5,6,9]
-; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm3
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm2
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm5
-; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm6
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm6
-; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm8
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,10,0,5,6,10]
-; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm14
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm2
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4
+; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm5
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3
+; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9]
+; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm3
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm17
+; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm4, %zmm17
+; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm1, %zmm16
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm13, %zmm2
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm7
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm2
+; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm14, %zmm2
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm21
+; AVX512BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm28
+; AVX512BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm5
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm23
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm9
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm13, %zmm23
+; AVX512BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm2
+; AVX512BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm18
+; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm18, %zmm14
+; AVX512BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm20
+; AVX512BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm7
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,11,0,5,6,11]
-; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm29
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm2
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm23
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,12,0,5,6,12]
-; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm31
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm2
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm11
+; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm6, %zmm7
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm5
+; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm5
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm31
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm3
+; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm8, %zmm31
+; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm27
+; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm0
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm6, %zmm0
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm30 # 64-byte Folded Reload
+; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm8
+; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm0
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm6, %zmm0
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm26
+; AVX512BW-FCP-NEXT: vpermt2q (%rsp), %zmm4, %zmm26 # 64-byte Folded Reload
+; AVX512BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm12
+; AVX512BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm0
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm6, %zmm0
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm25
+; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm4, %zmm25
+; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm18, %zmm3
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm19
+; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm24, %zmm19
+; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm18, %zmm24
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm3
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm7
+; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm6, %zmm3
+; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm18, %zmm6
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm9
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm10
+; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm13, %zmm10
+; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm18, %zmm13
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm15
+; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm4, %zmm15
+; AVX512BW-FCP-NEXT: vpermi2q %zmm18, %zmm2, %zmm4
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5
+; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm18
+; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm22
+; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm1
+; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,9,0,5,6,9]
+; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm4
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm6
+; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm11
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,6,13,4,5,6,13]
-; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm15
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm2
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm7
-; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm25
-; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,6,14,4,5,6,14]
-; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm21
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm8
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm10
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm9
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15]
-; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm30
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm17
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm16
-; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm4
+; AVX512BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm7
+; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm2, %zmm21
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm11
+; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm14
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,10,0,5,6,10]
+; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm2, %zmm31
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm4
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: movb $24, %al
-; AVX512BW-FCP-NEXT: kmovd %eax, %k1
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[4,5,4,5],zmm18[4,5,4,5]
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm4
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm0
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
-; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3
-; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm3
-; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [7,0,9,0,7,0,9,0]
-; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm27
-; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm27
-; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,0,4,11]
-; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm5, %zmm1
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,11,0,5,6,11]
+; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm2, %zmm19
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm4[4,5,4,5],zmm19[4,5,4,5]
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm18
-; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm18
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm23
-; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm23
-; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm5, %zmm4
+; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm4
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm4[4,5,4,5],zmm12[4,5,4,5]
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm19
-; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm19
-; AVX512BW-FCP-NEXT: vpermi2q %zmm12, %zmm4, %zmm0
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm20
-; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm20
-; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm12, %zmm10
-; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm5, %zmm4
+; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm4
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm1[4,5,4,5],zmm13[4,5,4,5]
-; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm5, %zmm1
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa 912(%rdi), %xmm5
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
-; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm19, %zmm1
+; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm24
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,12,0,5,6,12]
+; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm2, %zmm3
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm4
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm3
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm9
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,6,13,4,5,6,13]
+; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm2, %zmm10
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm9
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm3
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm13
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,6,14,4,5,6,14]
+; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm2, %zmm15
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm0
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm17
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm5
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,8,15,4,5,8,15]
+; AVX512BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm0
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm16
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm18
+; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm2, %zmm22
+; AVX512BW-FCP-NEXT: movb $24, %al
+; AVX512BW-FCP-NEXT: kmovd %eax, %k1
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[4,5,4,5],zmm27[4,5,4,5]
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa 464(%rdi), %xmm5
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
-; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm3, %zmm1
+; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [14,0,0,7,14,0,0,7]
+; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm21
+; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm11, %zmm21
+; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [7,0,9,0,7,0,9,0]
+; AVX512BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm29
+; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm29
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,0,4,11]
+; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm7, %zmm0
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm0[4,5,4,5],zmm8[4,5,4,5]
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm17
+; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm11, %zmm17
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm24
+; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm24
+; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm7, %zmm0
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm23 {%k1} = zmm1[4,5,4,5],zmm12[4,5,4,5]
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm27
+; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm11, %zmm27
+; AVX512BW-FCP-NEXT: vpermi2q %zmm12, %zmm1, %zmm11
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm23
+; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm23
+; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm12, %zmm16
+; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm7, %zmm1
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm3
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],mem[2,3]
-; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm3, %zmm18, %zmm19
-; AVX512BW-FCP-NEXT: vmovdqa 1360(%rdi), %xmm3
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],mem[2,3]
-; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm3, %zmm0, %zmm18
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k1} = zmm0[4,5,4,5],zmm20[4,5,4,5]
+; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm7, %zmm0
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0
; AVX512BW-FCP-NEXT: vmovdqa 1024(%rdi), %ymm13
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [4,11]
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm12
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm3, %zmm12
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm15, %zmm0
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm7 = [4,11]
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm8
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm7, %zmm8
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0
; AVX512BW-FCP-NEXT: vmovdqa 576(%rdi), %ymm1
; AVX512BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm11
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm11
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm3
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm7, %zmm3
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm1
+; AVX512BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0
-; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm12
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm11
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm3, %zmm11
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm3
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0
-; AVX512BW-FCP-NEXT: vmovdqa 1472(%rdi), %ymm2
-; AVX512BW-FCP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vpermi2q %zmm24, %zmm2, %zmm3
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0
+; AVX512BW-FCP-NEXT: vmovdqa 1472(%rdi), %ymm8
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm0[6,7]
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vpermi2q %zmm28, %zmm9, %zmm7
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm7 = [5,12]
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm11
-; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm11
-; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm30 = [6,13]
-; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm6
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3
-; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm7, %zmm3
-; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm30, %zmm5
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm0
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm4
-; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm7, %zmm0
-; AVX512BW-FCP-NEXT: vpermi2q %zmm24, %zmm2, %zmm7
-; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm30, %zmm2
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8
-; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm30, %zmm9
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm10 = [5,12]
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm0
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm3
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm7
+; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm3
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm4 = [6,13]
+; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm4, %zmm0
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7
+; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm7
+; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm2
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0
+; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm10, %zmm0
+; AVX512BW-FCP-NEXT: vpermi2q %zmm28, %zmm9, %zmm10
+; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm4, %zmm9
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 1088(%rdi), %ymm24
-; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm5 = ymm13[8,9,10,11,12,13,14,15],ymm24[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm24[16,17,18,19,20,21,22,23]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm5[4,5,6,7]
-; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm0
+; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm4, %zmm5
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa 1088(%rdi), %ymm1
+; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm13[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm15, %zmm0
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT: movb $-32, %al
; AVX512BW-FCP-NEXT: kmovd %eax, %k2
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2}
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2}
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm2
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2}
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm14
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2}
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2}
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm30
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1}
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2}
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1}
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2}
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1}
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2}
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm5 {%k2}
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2}
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2}
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2}
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2}
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm4
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2}
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 {%k2}
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm28 {%k1}
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm28 {%k2}
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm30 {%k1}
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm30 {%k2}
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1}
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm26 {%k2}
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1}
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm25 {%k2}
; AVX512BW-FCP-NEXT: vmovdqa 960(%rdi), %ymm1
-; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1
-; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm20, %zmm1
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm1 {%k2}
-; AVX512BW-FCP-NEXT: vmovdqa 512(%rdi), %ymm6
-; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23]
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6
+; AVX512BW-FCP-NEXT: vmovdqa 896(%rdi), %ymm6
+; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm12 = ymm6[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm13 = [2,5]
+; AVX512BW-FCP-NEXT: vpermt2q %ymm1, %ymm13, %ymm6
; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm6, %zmm27, %zmm6
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2}
-; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm17
-; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm17 = mem[8,9,10,11,12,13,14,15],ymm17[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm17[16,17,18,19,20,21,22,23]
-; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm17, %xmm17
-; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm17, %zmm23, %zmm15
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2}
-; AVX512BW-FCP-NEXT: vmovdqa64 1408(%rdi), %ymm17
-; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm17 = mem[8,9,10,11,12,13,14,15],ymm17[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm17[16,17,18,19,20,21,22,23]
-; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm17, %xmm17
-; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm17, %zmm10, %zmm21
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2}
-; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %ymm17
-; AVX512BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17, %ymm13 # 32-byte Folded Reload
-; AVX512BW-FCP-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm17[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm17[16,17,18,19,20,21,22,23]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm13[4,5,6,7]
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm4
-; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm13
-; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm12 = ymm12[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4,5,6,7]
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3
-; AVX512BW-FCP-NEXT: vmovdqa 1536(%rdi), %ymm12
-; AVX512BW-FCP-NEXT: vpalignr $8, (%rsp), %ymm12, %ymm11 # 32-byte Folded Reload
-; AVX512BW-FCP-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm11[4,5,6,7]
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0
-; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm11 # 32-byte Folded Reload
-; AVX512BW-FCP-NEXT: # ymm11 = ymm7[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vinsertf64x4 $0, %ymm11, %zmm7, %zmm11
-; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload
-; AVX512BW-FCP-NEXT: # ymm9 = ymm7[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vinsertf64x4 $0, %ymm9, %zmm7, %zmm9
-; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
-; AVX512BW-FCP-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm16, %zmm7
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm6 {%k2}
+; AVX512BW-FCP-NEXT: vmovdqa 512(%rdi), %ymm1
+; AVX512BW-FCP-NEXT: vmovdqa 448(%rdi), %ymm9
+; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm19 = ymm9[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT: vpermt2q %ymm1, %ymm13, %ymm9
+; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm9, %zmm21, %zmm1
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 {%k2}
+; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm9
+; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm15
+; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm21 = ymm9[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT: vpermt2q %ymm15, %ymm13, %ymm9
+; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm9, %zmm17, %zmm9
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm9 {%k2}
+; AVX512BW-FCP-NEXT: vmovdqa 1408(%rdi), %ymm14
+; AVX512BW-FCP-NEXT: vmovdqa 1344(%rdi), %ymm15
+; AVX512BW-FCP-NEXT: vpermi2q %ymm14, %ymm15, %ymm13
+; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm13, %zmm11, %zmm17
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm17 {%k2}
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm12, %xmm12
+; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm12, %zmm23, %zmm11
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 {%k2}
+; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm19, %xmm12
+; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm12, %zmm29, %zmm12
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 {%k2}
+; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm21, %xmm13
+; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm13, %zmm24, %zmm13
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm13 {%k2}
+; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm14 = ymm15[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14
+; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm14, %zmm16, %zmm5
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 {%k2}
+; AVX512BW-FCP-NEXT: vmovdqa 640(%rdi), %ymm14
+; AVX512BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
+; AVX512BW-FCP-NEXT: # ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7]
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm3
+; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm14
+; AVX512BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
+; AVX512BW-FCP-NEXT: # ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm14[4,5,6,7]
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm2, %zmm7
+; AVX512BW-FCP-NEXT: vmovdqa 1536(%rdi), %ymm14
+; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm8 = ymm8[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm8, %zmm2
; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512BW-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
; AVX512BW-FCP-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7]
; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
; AVX512BW-FCP-NEXT: vinsertf64x4 $0, %ymm8, %zmm10, %zmm8
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 192(%rsi)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 128(%rsi)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 64(%rsi)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, (%rsi)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 192(%rdx)
+; AVX512BW-FCP-NEXT: vmovups (%rsp), %zmm10 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm14 # 32-byte Folded Reload
+; AVX512BW-FCP-NEXT: # ymm14 = ymm10[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vinsertf64x4 $0, %ymm14, %zmm10, %zmm14
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm15 # 32-byte Folded Reload
+; AVX512BW-FCP-NEXT: # ymm15 = ymm10[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm15, %zmm18, %zmm10
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
+; AVX512BW-FCP-NEXT: # ymm15 = ymm15[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm15, %zmm22, %zmm15
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, 192(%rsi)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 128(%rsi)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%rsi)
+; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovaps %zmm0, (%rsi)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 192(%rdx)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, (%rdx)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, 64(%rdx)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, 128(%rdx)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 192(%rcx)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, (%rcx)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, 64(%rcx)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 128(%rcx)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, 192(%r8)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, (%r8)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 64(%r8)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 128(%r8)
-; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovaps %zmm1, 192(%r9)
-; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovaps %zmm1, (%r9)
-; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovaps %zmm1, 64(%r9)
-; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovaps %zmm1, 128(%r9)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, 64(%rdx)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, 128(%rdx)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, 192(%rcx)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, (%rcx)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 64(%rcx)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 128(%rcx)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 192(%r8)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, (%r8)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, 64(%r8)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 128(%r8)
+; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovaps %zmm0, 192(%r9)
+; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovaps %zmm0, (%r9)
+; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovaps %zmm0, 64(%r9)
+; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovaps %zmm0, 128(%r9)
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 192(%rax)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%rax)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 192(%rax)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, (%rax)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%rax)
; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovaps %zmm0, 128(%rax)
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-FCP-NEXT: vmovaps %zmm8, 128(%rax)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 192(%rax)
-; AVX512BW-FCP-NEXT: vmovaps %zmm9, (%rax)
-; AVX512BW-FCP-NEXT: vmovaps %zmm11, 64(%rax)
-; AVX512BW-FCP-NEXT: addq $2760, %rsp # imm = 0xAC8
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, 128(%rax)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 192(%rax)
+; AVX512BW-FCP-NEXT: vmovaps %zmm14, (%rax)
+; AVX512BW-FCP-NEXT: vmovaps %zmm8, 64(%rax)
+; AVX512BW-FCP-NEXT: addq $2632, %rsp # imm = 0xA48
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
@@ -11116,474 +11138,481 @@ define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
;
; AVX512DQ-BW-FCP-LABEL: load_i64_stride7_vf32:
; AVX512DQ-BW-FCP: # %bb.0:
-; AVX512DQ-BW-FCP-NEXT: subq $2760, %rsp # imm = 0xAC8
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm20
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm2
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm29
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm4
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm30
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm28
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm12
+; AVX512DQ-BW-FCP-NEXT: subq $2632, %rsp # imm = 0xA48
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm10
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm22
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm12
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm2
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm30
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm5
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm26
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm17
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm6
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm16
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm15
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm3
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [0,7,14,0,0,7,14,0]
+; AVX512DQ-BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm13, %zmm3
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,7,14,0,0,7,14,0]
-; AVX512DQ-BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm25, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3]
-; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm6
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm8, %zmm6
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm25, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6]
-; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [9,0,7,0,9,0,7,0]
-; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4]
-; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm7, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [12,5,12,5,12,5,12,5]
-; AVX512DQ-BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm23, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [10,3,10,3,10,3,10,3]
+; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm14, %zmm3
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm25, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm13, %zmm3
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm9, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,2,9,2,9,2,9]
-; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm30
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm10
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm9, %zmm10
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm17
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm25, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm6
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm8, %zmm6
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm24
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm15
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm25, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm16
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm8
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm13
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm5
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm11
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm14, %zmm11
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [13,6,13,6,13,6,13,6]
+; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [9,0,7,0,9,0,7,0]
+; AVX512DQ-BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [11,4,11,4,11,4,11,4]
+; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm5
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm8, %zmm5
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm11, %zmm5
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm5
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm8, %zmm5
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm29
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm7, %zmm4
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm14
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm18
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm4
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm11, %zmm4
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm9, %zmm28
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm19
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [12,5,12,5,12,5,12,5]
+; AVX512DQ-BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm5
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm5
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm5
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm24, %zmm5
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm6, %zmm4
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm11, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm6, %zmm4
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm26 # 64-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm12
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm4
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm13, %zmm4
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm11, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm13, %zmm4
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm22
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm9, %zmm22
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm7
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm29
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm23, %zmm29
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm23
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm31
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm11, %zmm31
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm11
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm15
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm25, %zmm15
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm25
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm21
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm9, %zmm21
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm16, %zmm1, %zmm9
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm16
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm4
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm0
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,9,0,5,6,9]
-; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm3
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm2
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm5
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm6
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm6
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm8
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,10,0,5,6,10]
-; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm14
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm5
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9]
+; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm17
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm4, %zmm17
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm1, %zmm16
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm13, %zmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm7
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm14, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm21
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm28
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm5
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm23
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm9
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm13, %zmm23
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm2
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm18
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm18, %zmm14
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm20
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm7
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,11,0,5,6,11]
-; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm29
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm2
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm23
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,12,0,5,6,12]
-; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm31
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm2
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm11
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm6, %zmm7
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm5
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm5
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm31
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm8, %zmm31
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm27
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm6, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm30 # 64-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm8
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm6, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm26
+; AVX512DQ-BW-FCP-NEXT: vpermt2q (%rsp), %zmm4, %zmm26 # 64-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm12
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm6, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm25
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm4, %zmm25
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm18, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm19
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm24, %zmm19
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm18, %zmm24
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm7
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm6, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm18, %zmm6
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm9
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm10
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm13, %zmm10
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm18, %zmm13
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm15
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm4, %zmm15
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm18, %zmm2, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm18
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm22
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm1
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,9,0,5,6,9]
+; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm6
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm11
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,6,13,4,5,6,13]
-; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm15
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm2
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm7
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm25
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,6,14,4,5,6,14]
-; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm21
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm8
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm10
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm9
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15]
-; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm30
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm17
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm16
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm7
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm2, %zmm21
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm11
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm14
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,10,0,5,6,10]
+; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm2, %zmm31
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm4
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: movb $24, %al
-; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[4,5,4,5],zmm18[4,5,4,5]
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
-; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [7,0,9,0,7,0,9,0]
-; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm27
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm27
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,0,4,11]
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm5, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,11,0,5,6,11]
+; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm2, %zmm19
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm4[4,5,4,5],zmm19[4,5,4,5]
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm18
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm18
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm23
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm23
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm5, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm4
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm4[4,5,4,5],zmm12[4,5,4,5]
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm19
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm19
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm12, %zmm4, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm20
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm20
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm12, %zmm10
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm5, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm4
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm1[4,5,4,5],zmm13[4,5,4,5]
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm5, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 912(%rdi), %xmm5
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm19, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm24
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,12,0,5,6,12]
+; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm2, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm9
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,6,13,4,5,6,13]
+; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm2, %zmm10
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm9
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm13
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,6,14,4,5,6,14]
+; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm2, %zmm15
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm17
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm5
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,8,15,4,5,8,15]
+; AVX512DQ-BW-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm16
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm18
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm2, %zmm22
+; AVX512DQ-BW-FCP-NEXT: movb $24, %al
+; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[4,5,4,5],zmm27[4,5,4,5]
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 464(%rdi), %xmm5
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm3, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [14,0,0,7,14,0,0,7]
+; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm21
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm11, %zmm21
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [7,0,9,0,7,0,9,0]
+; AVX512DQ-BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm29
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm29
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm7 = [0,0,4,11]
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm7, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm0[4,5,4,5],zmm8[4,5,4,5]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm17
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm11, %zmm17
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm24
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm24
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm7, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm23 {%k1} = zmm1[4,5,4,5],zmm12[4,5,4,5]
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm27
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm11, %zmm27
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm12, %zmm1, %zmm11
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm23
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm23
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm12, %zmm16
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm7, %zmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm3
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],mem[2,3]
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm3, %zmm18, %zmm19
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 1360(%rdi), %xmm3
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],mem[2,3]
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm3, %zmm0, %zmm18
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k1} = zmm0[4,5,4,5],zmm20[4,5,4,5]
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm7, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa 1024(%rdi), %ymm13
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [4,11]
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm12
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm3, %zmm12
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm15, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm7 = [4,11]
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm8
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm7, %zmm8
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa 576(%rdi), %ymm1
; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm11
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm11
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm7, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm12
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm11
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm3, %zmm11
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 1472(%rdi), %ymm2
-; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm24, %zmm2, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 1472(%rdi), %ymm8
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm28, %zmm9, %zmm7
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm7 = [5,12]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm11
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm11
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm30 = [6,13]
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm6
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm7, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm30, %zmm5
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm4
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm7, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm24, %zmm2, %zmm7
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm30, %zmm2
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm30, %zmm9
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm10 = [5,12]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm7
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm4 = [6,13]
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm4, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm7
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm10, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm28, %zmm9, %zmm10
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm4, %zmm9
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1088(%rdi), %ymm24
-; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm5 = ymm13[8,9,10,11,12,13,14,15],ymm24[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm24[16,17,18,19,20,21,22,23]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm5[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm4, %zmm5
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 1088(%rdi), %ymm1
+; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm13[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm15, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: movb $-32, %al
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm2
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm14
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm30
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm5 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm28 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm28 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm30 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm30 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm26 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm25 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm25 {%k2}
; AVX512DQ-BW-FCP-NEXT: vmovdqa 960(%rdi), %ymm1
-; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm20, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm1 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 512(%rdi), %ymm6
-; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23]
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 896(%rdi), %ymm6
+; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm12 = ymm6[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm13 = [2,5]
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %ymm1, %ymm13, %ymm6
; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm6, %zmm27, %zmm6
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm17
-; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm17 = mem[8,9,10,11,12,13,14,15],ymm17[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm17[16,17,18,19,20,21,22,23]
-; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm17, %xmm17
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm17, %zmm23, %zmm15
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1408(%rdi), %ymm17
-; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm17 = mem[8,9,10,11,12,13,14,15],ymm17[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm17[16,17,18,19,20,21,22,23]
-; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm17, %xmm17
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm17, %zmm10, %zmm21
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %ymm17
-; AVX512DQ-BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17, %ymm13 # 32-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm17[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm17[16,17,18,19,20,21,22,23]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm13[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm4
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm13
-; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm12 = ymm12[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 1536(%rdi), %ymm12
-; AVX512DQ-BW-FCP-NEXT: vpalignr $8, (%rsp), %ymm12, %ymm11 # 32-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm11[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm11 # 32-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT: # ymm11 = ymm7[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vinsertf64x4 $0, %ymm11, %zmm7, %zmm11
-; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT: # ymm9 = ymm7[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vinsertf64x4 $0, %ymm9, %zmm7, %zmm9
-; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm16, %zmm7
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm6 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 512(%rdi), %ymm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 448(%rdi), %ymm9
+; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm19 = ymm9[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %ymm1, %ymm13, %ymm9
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm9, %zmm21, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm9
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm15
+; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm21 = ymm9[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %ymm15, %ymm13, %ymm9
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm9, %zmm17, %zmm9
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm9 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 1408(%rdi), %ymm14
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 1344(%rdi), %ymm15
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm14, %ymm15, %ymm13
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm13, %zmm11, %zmm17
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm17 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm12, %xmm12
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm12, %zmm23, %zmm11
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm11 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm19, %xmm12
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm12, %zmm29, %zmm12
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm21, %xmm13
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm13, %zmm24, %zmm13
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm13 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm14 = ymm15[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm14, %zmm16, %zmm5
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 640(%rdi), %ymm14
+; AVX512DQ-BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT: # ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm14
+; AVX512DQ-BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT: # ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm14[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm2, %zmm7
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 1536(%rdi), %ymm14
+; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm8 = ymm8[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm8, %zmm2
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
; AVX512DQ-BW-FCP-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vinsertf64x4 $0, %ymm8, %zmm10, %zmm8
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 192(%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 128(%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 64(%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, (%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 192(%rdx)
+; AVX512DQ-BW-FCP-NEXT: vmovups (%rsp), %zmm10 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm14 # 32-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT: # ymm14 = ymm10[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vinsertf64x4 $0, %ymm14, %zmm10, %zmm14
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm15 # 32-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT: # ymm15 = ymm10[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm15, %zmm18, %zmm10
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT: # ymm15 = ymm15[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm15, %zmm22, %zmm15
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, 192(%rsi)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 128(%rsi)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%rsi)
+; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, (%rsi)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 192(%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, (%rdx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, 64(%rdx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, 128(%rdx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 192(%rcx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, (%rcx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, 64(%rcx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 128(%rcx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 192(%r8)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, (%r8)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 64(%r8)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 128(%r8)
-; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, 192(%r9)
-; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, (%r9)
-; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, 64(%r9)
-; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, 128(%r9)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, 64(%rdx)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, 128(%rdx)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, 192(%rcx)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 64(%rcx)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 128(%rcx)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 192(%r8)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, (%r8)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, 64(%r8)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 128(%r8)
+; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 192(%r9)
+; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, (%r9)
+; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 64(%r9)
+; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 128(%r9)
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 192(%rax)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%rax)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 192(%rax)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, (%rax)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 128(%rax)
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 128(%rax)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 192(%rax)
-; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm9, (%rax)
-; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm11, 64(%rax)
-; AVX512DQ-BW-FCP-NEXT: addq $2760, %rsp # imm = 0xAC8
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, 128(%rax)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 192(%rax)
+; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm14, (%rax)
+; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 64(%rax)
+; AVX512DQ-BW-FCP-NEXT: addq $2632, %rsp # imm = 0xA48
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%wide.vec = load <224 x i64>, ptr %in.vec, align 64
@@ -12740,52 +12769,52 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
;
; AVX-LABEL: load_i64_stride7_vf64:
; AVX: # %bb.0:
-; AVX-NEXT: subq $4232, %rsp # imm = 0x1088
-; AVX-NEXT: vmovaps 1216(%rdi), %ymm3
-; AVX-NEXT: vmovaps 768(%rdi), %ymm4
-; AVX-NEXT: vmovaps 320(%rdi), %ymm5
+; AVX-NEXT: subq $4248, %rsp # imm = 0x1098
+; AVX-NEXT: vmovaps 1216(%rdi), %ymm2
+; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovaps 768(%rdi), %ymm6
+; AVX-NEXT: vmovaps 320(%rdi), %ymm7
; AVX-NEXT: vinsertf128 $1, 384(%rdi), %ymm0, %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7]
-; AVX-NEXT: vmovaps 224(%rdi), %xmm10
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7]
+; AVX-NEXT: vmovaps 224(%rdi), %xmm12
; AVX-NEXT: vmovaps 272(%rdi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vinsertf128 $1, 832(%rdi), %ymm0, %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
-; AVX-NEXT: vmovaps 672(%rdi), %xmm11
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7]
+; AVX-NEXT: vmovaps 672(%rdi), %xmm13
; AVX-NEXT: vmovaps 720(%rdi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0,1],xmm1[2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vinsertf128 $1, 1280(%rdi), %ymm0, %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
-; AVX-NEXT: vmovaps 1120(%rdi), %xmm12
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
+; AVX-NEXT: vmovaps 1120(%rdi), %xmm14
; AVX-NEXT: vmovaps 1168(%rdi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3]
+; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 1664(%rdi), %ymm6
+; AVX-NEXT: vmovaps 1664(%rdi), %ymm8
; AVX-NEXT: vinsertf128 $1, 1728(%rdi), %ymm0, %ymm0
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm0[6,7]
+; AVX-NEXT: vmovaps 1568(%rdi), %xmm15
+; AVX-NEXT: vmovaps 1616(%rdi), %xmm1
+; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovapd 2112(%rdi), %ymm9
+; AVX-NEXT: vinsertf128 $1, 2176(%rdi), %ymm0, %ymm0
; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3]
-; AVX-NEXT: vmovapd 1568(%rdi), %xmm2
-; AVX-NEXT: vmovapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovapd 1616(%rdi), %xmm1
-; AVX-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
-; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 2112(%rdi), %ymm7
-; AVX-NEXT: vinsertf128 $1, 2176(%rdi), %ymm0, %ymm0
-; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3]
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3]
; AVX-NEXT: vmovapd 2016(%rdi), %xmm2
; AVX-NEXT: vmovapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovapd 2064(%rdi), %xmm1
@@ -12793,10 +12822,10 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vblendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 2560(%rdi), %ymm8
+; AVX-NEXT: vmovapd 2560(%rdi), %ymm10
; AVX-NEXT: vinsertf128 $1, 2624(%rdi), %ymm0, %ymm0
; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3]
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3]
; AVX-NEXT: vmovapd 2464(%rdi), %xmm2
; AVX-NEXT: vmovapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovapd 2512(%rdi), %xmm1
@@ -12804,221 +12833,219 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vblendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 3008(%rdi), %ymm1
-; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovaps 3008(%rdi), %ymm11
; AVX-NEXT: vinsertf128 $1, 3072(%rdi), %ymm0, %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX-NEXT: vmovaps 2912(%rdi), %xmm0
-; AVX-NEXT: vmovaps 2960(%rdi), %xmm2
-; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 3456(%rdi), %ymm2
-; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vinsertf128 $1, 3520(%rdi), %ymm0, %ymm1
-; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX-NEXT: vmovaps 3360(%rdi), %xmm15
-; AVX-NEXT: vmovaps 3408(%rdi), %xmm2
-; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm15[0,1],xmm2[2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 96(%rdi), %ymm2
-; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm1
-; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX-NEXT: vmovaps 48(%rdi), %xmm2
-; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 544(%rdi), %ymm2
-; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vinsertf128 $1, 608(%rdi), %ymm0, %ymm1
-; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX-NEXT: vmovaps 448(%rdi), %xmm9
-; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovaps 496(%rdi), %xmm2
-; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 992(%rdi), %ymm2
-; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vinsertf128 $1, 1056(%rdi), %ymm0, %ymm1
-; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX-NEXT: vmovaps 896(%rdi), %xmm9
-; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovaps 944(%rdi), %xmm2
-; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 1440(%rdi), %ymm2
-; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vinsertf128 $1, 1504(%rdi), %ymm0, %ymm1
-; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX-NEXT: vmovaps 1344(%rdi), %xmm9
-; AVX-NEXT: vmovaps 1392(%rdi), %xmm2
-; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 1888(%rdi), %ymm2
-; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vinsertf128 $1, 1952(%rdi), %ymm0, %ymm1
-; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX-NEXT: vmovaps 1792(%rdi), %xmm13
-; AVX-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovaps 1840(%rdi), %xmm2
-; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 2336(%rdi), %ymm2
-; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vinsertf128 $1, 2400(%rdi), %ymm0, %ymm1
-; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX-NEXT: vmovaps 2240(%rdi), %xmm13
-; AVX-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovaps 2288(%rdi), %xmm2
-; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 2784(%rdi), %ymm2
-; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vinsertf128 $1, 2848(%rdi), %ymm0, %ymm1
-; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX-NEXT: vmovaps 2688(%rdi), %xmm13
-; AVX-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovaps 2736(%rdi), %xmm2
-; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 3232(%rdi), %ymm2
-; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vinsertf128 $1, 3296(%rdi), %ymm0, %ymm1
-; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX-NEXT: vmovaps 3136(%rdi), %xmm13
-; AVX-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovaps 3184(%rdi), %xmm2
-; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5],ymm0[6,7]
+; AVX-NEXT: vmovaps 2912(%rdi), %xmm2
+; AVX-NEXT: vmovaps 2960(%rdi), %xmm1
+; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovaps 3456(%rdi), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovdqa 288(%rdi), %xmm1
-; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm10[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
-; AVX-NEXT: vmovapd 384(%rdi), %ymm2
+; AVX-NEXT: vinsertf128 $1, 3520(%rdi), %ymm0, %ymm0
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX-NEXT: vmovaps 3360(%rdi), %xmm1
+; AVX-NEXT: vmovaps 3408(%rdi), %xmm3
+; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovaps 96(%rdi), %ymm3
+; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm0
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
+; AVX-NEXT: vmovaps 48(%rdi), %xmm3
+; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovaps 544(%rdi), %ymm3
+; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vinsertf128 $1, 608(%rdi), %ymm0, %ymm0
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
+; AVX-NEXT: vmovaps 448(%rdi), %xmm4
+; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovaps 496(%rdi), %xmm3
+; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovaps 992(%rdi), %ymm3
+; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vinsertf128 $1, 1056(%rdi), %ymm0, %ymm0
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
+; AVX-NEXT: vmovaps 896(%rdi), %xmm4
+; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovaps 944(%rdi), %xmm3
+; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovaps 1440(%rdi), %ymm3
+; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vinsertf128 $1, 1504(%rdi), %ymm0, %ymm0
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm0[6,7]
+; AVX-NEXT: vmovaps 1344(%rdi), %xmm5
+; AVX-NEXT: vmovaps 1392(%rdi), %xmm4
+; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovaps 1888(%rdi), %ymm0
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vinsertf128 $1, 1952(%rdi), %ymm0, %ymm3
+; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm3[6,7]
+; AVX-NEXT: vmovaps 1792(%rdi), %xmm0
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovaps 1840(%rdi), %xmm4
+; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm4[2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovaps 2336(%rdi), %ymm0
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vinsertf128 $1, 2400(%rdi), %ymm0, %ymm3
+; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm3[6,7]
+; AVX-NEXT: vmovaps 2240(%rdi), %xmm0
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovaps 2288(%rdi), %xmm4
+; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm4[2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovaps 2784(%rdi), %ymm0
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vinsertf128 $1, 2848(%rdi), %ymm0, %ymm3
+; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm3[6,7]
+; AVX-NEXT: vmovaps 2688(%rdi), %xmm0
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovaps 2736(%rdi), %xmm4
+; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm4[2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovaps 3232(%rdi), %ymm0
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vinsertf128 $1, 3296(%rdi), %ymm0, %ymm3
+; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm3[6,7]
+; AVX-NEXT: vmovaps 3136(%rdi), %xmm0
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovaps 3184(%rdi), %xmm4
+; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm4[2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovdqa 288(%rdi), %xmm3
+; AVX-NEXT: vpalignr {{.*#+}} xmm3 = xmm12[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vmovapd 384(%rdi), %ymm0
+; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm7[0],ymm0[1],ymm7[3],ymm0[2]
+; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3]
+; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovdqa 736(%rdi), %xmm3
+; AVX-NEXT: vpalignr {{.*#+}} xmm3 = xmm13[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vmovapd 832(%rdi), %ymm0
+; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm6[0],ymm0[1],ymm6[3],ymm0[2]
+; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3]
+; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovdqa 1184(%rdi), %xmm3
+; AVX-NEXT: vpalignr {{.*#+}} xmm3 = xmm14[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vmovapd 1280(%rdi), %ymm4
+; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm0[0],ymm4[1],ymm0[3],ymm4[2]
+; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3]
+; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovdqa 1632(%rdi), %xmm3
+; AVX-NEXT: vpalignr {{.*#+}} xmm3 = xmm15[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vmovapd 1728(%rdi), %ymm15
+; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm8[0],ymm15[1],ymm8[3],ymm15[2]
+; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3]
+; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovdqa 2080(%rdi), %xmm3
+; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
+; AVX-NEXT: # xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vmovapd 2176(%rdi), %ymm14
+; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm9[0],ymm14[1],ymm9[3],ymm14[2]
+; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3]
+; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovdqa 2528(%rdi), %xmm3
+; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
+; AVX-NEXT: # xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vmovapd 2624(%rdi), %ymm13
+; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm10[0],ymm13[1],ymm10[3],ymm13[2]
+; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3]
+; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovdqa 2976(%rdi), %xmm3
+; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vmovapd 3072(%rdi), %ymm12
+; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm11[0],ymm12[1],ymm11[3],ymm12[2]
+; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3]
; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[3],ymm2[2]
+; AVX-NEXT: vmovdqa 3424(%rdi), %xmm2
+; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vmovapd 3520(%rdi), %ymm11
+; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm11[1],ymm2[3],ymm11[2]
; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3]
; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovdqa 736(%rdi), %xmm1
-; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm11[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
-; AVX-NEXT: vmovapd 832(%rdi), %ymm14
-; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm4[0],ymm14[1],ymm4[3],ymm14[2]
-; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3]
+; AVX-NEXT: vmovapd 160(%rdi), %ymm1
; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovdqa 1184(%rdi), %xmm1
-; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm12[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
-; AVX-NEXT: vmovapd 1280(%rdi), %ymm13
-; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm3[0],ymm13[1],ymm3[3],ymm13[2]
-; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3]
+; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[3],ymm1[2]
+; AVX-NEXT: vmovdqa 64(%rdi), %xmm2
+; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3]
; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovdqa 1632(%rdi), %xmm1
-; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; AVX-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
-; AVX-NEXT: vmovapd 1728(%rdi), %ymm12
-; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm6[0],ymm12[1],ymm6[3],ymm12[2]
-; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3]
+; AVX-NEXT: vmovapd 608(%rdi), %ymm1
; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovdqa 2080(%rdi), %xmm1
-; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; AVX-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
-; AVX-NEXT: vmovapd 2176(%rdi), %ymm11
-; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm7[0],ymm11[1],ymm7[3],ymm11[2]
-; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3]
+; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[3],ymm1[2]
+; AVX-NEXT: vmovdqa 512(%rdi), %xmm2
+; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
+; AVX-NEXT: # xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3]
; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovdqa 2528(%rdi), %xmm1
-; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; AVX-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
-; AVX-NEXT: vmovapd 2624(%rdi), %ymm10
-; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm8[0],ymm10[1],ymm8[3],ymm10[2]
-; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3]
+; AVX-NEXT: vmovapd 1056(%rdi), %ymm1
; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovdqa 2976(%rdi), %xmm1
-; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
-; AVX-NEXT: vmovapd 3072(%rdi), %ymm2
-; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[3],ymm2[2]
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
-; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovdqa 3424(%rdi), %xmm0
-; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm15[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
-; AVX-NEXT: vmovapd 3520(%rdi), %ymm15
-; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[3],ymm15[2]
+; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[3],ymm1[2]
+; AVX-NEXT: vmovdqa 960(%rdi), %xmm2
+; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
+; AVX-NEXT: # xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3]
+; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovapd 1504(%rdi), %ymm1
+; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[3],ymm1[2]
+; AVX-NEXT: vmovdqa 1408(%rdi), %xmm2
+; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm5[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 160(%rdi), %ymm0
-; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[3],ymm0[2]
-; AVX-NEXT: vmovdqa 64(%rdi), %xmm1
-; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
-; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 608(%rdi), %ymm0
-; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[3],ymm0[2]
-; AVX-NEXT: vmovdqa 512(%rdi), %xmm1
-; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; AVX-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
-; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 1056(%rdi), %ymm0
-; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[3],ymm0[2]
-; AVX-NEXT: vmovdqa 960(%rdi), %xmm1
-; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; AVX-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
-; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 1504(%rdi), %ymm0
-; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[3],ymm0[2]
-; AVX-NEXT: vmovdqa 1408(%rdi), %xmm1
-; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm9[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
-; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 1952(%rdi), %ymm9
+; AVX-NEXT: vmovapd 1952(%rdi), %ymm10
; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[3],ymm9[2]
-; AVX-NEXT: vmovdqa 1856(%rdi), %xmm1
-; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; AVX-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[3],ymm10[2]
+; AVX-NEXT: vmovdqa 1856(%rdi), %xmm9
+; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm1 # 16-byte Folded Reload
+; AVX-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7]
; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovapd 2400(%rdi), %ymm6
@@ -13048,75 +13075,76 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovaps 352(%rdi), %xmm0
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
-; AVX-NEXT: # ymm1 = ymm0[0,1,2,3,4,5],mem[6,7]
-; AVX-NEXT: vmovaps 240(%rdi), %xmm0
-; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm0[0,1],mem[2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm1[4,5,6,7]
+; AVX-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX-NEXT: # ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX-NEXT: vmovaps 240(%rdi), %xmm1
+; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm1[0,1],mem[2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 800(%rdi), %xmm0
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm14[3]
-; AVX-NEXT: vmovapd 688(%rdi), %xmm7
-; AVX-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendpd {{.*#+}} xmm7 = xmm7[0],mem[1]
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3]
-; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX-NEXT: # ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX-NEXT: vmovaps 688(%rdi), %xmm1
+; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm1[0,1],mem[2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 1248(%rdi), %xmm0
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm13[3]
-; AVX-NEXT: vmovapd 1136(%rdi), %xmm7
-; AVX-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendpd {{.*#+}} xmm7 = xmm7[0],mem[1]
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3]
-; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
+; AVX-NEXT: # ymm1 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX-NEXT: vmovaps 1136(%rdi), %xmm0
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm0[0,1],mem[2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm1[4,5,6,7]
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 1696(%rdi), %xmm0
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm12[3]
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3]
; AVX-NEXT: vmovapd 1584(%rdi), %xmm7
-; AVX-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vblendpd {{.*#+}} xmm7 = xmm7[0],mem[1]
; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3]
; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 2144(%rdi), %xmm0
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm11[3]
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm14[3]
; AVX-NEXT: vmovapd 2032(%rdi), %xmm7
-; AVX-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vblendpd {{.*#+}} xmm7 = xmm7[0],mem[1]
; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3]
; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 2592(%rdi), %xmm0
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm10[3]
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm13[3]
; AVX-NEXT: vmovapd 2480(%rdi), %xmm7
-; AVX-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vblendpd {{.*#+}} xmm7 = xmm7[0],mem[1]
; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3]
; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 3040(%rdi), %xmm0
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX-NEXT: # ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
-; AVX-NEXT: vmovaps 2928(%rdi), %xmm7
-; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],mem[2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm12[3]
+; AVX-NEXT: vmovapd 2928(%rdi), %xmm7
+; AVX-NEXT: vmovapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vblendpd {{.*#+}} xmm7 = xmm7[0],mem[1]
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3]
+; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 3488(%rdi), %xmm0
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3]
-; AVX-NEXT: vmovapd 3376(%rdi), %xmm1
-; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendpd {{.*#+}} xmm7 = xmm1[0],mem[1]
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm11[3]
+; AVX-NEXT: vmovapd 3376(%rdi), %xmm7
+; AVX-NEXT: vmovapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vblendpd {{.*#+}} xmm7 = xmm7[0],mem[1]
; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3]
; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 3264(%rdi), %xmm0
@@ -13124,7 +13152,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3]
; AVX-NEXT: vmovdqa 3152(%rdi), %xmm2
-; AVX-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm3[4,5,6,7]
; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -13133,347 +13161,346 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3]
; AVX-NEXT: vmovdqa 2704(%rdi), %xmm1
-; AVX-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm5[4,5,6,7]
; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovdqa 2368(%rdi), %xmm14
-; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm0
+; AVX-NEXT: vmovdqa 2368(%rdi), %xmm15
+; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm0
; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3]
-; AVX-NEXT: vmovdqa 2256(%rdi), %xmm10
-; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0,1,2,3],xmm8[4,5,6,7]
+; AVX-NEXT: vmovdqa 2256(%rdi), %xmm1
+; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm8[4,5,6,7]
; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovdqa 1920(%rdi), %xmm8
-; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm1
-; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm9[3]
-; AVX-NEXT: vmovapd 1808(%rdi), %xmm5
-; AVX-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm2 # 16-byte Folded Reload
-; AVX-NEXT: # xmm2 = xmm5[0],mem[1]
-; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3]
-; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovdqa 1472(%rdi), %xmm7
-; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm1
-; AVX-NEXT: vblendpd $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX-NEXT: # ymm1 = ymm1[0,1,2],mem[3]
-; AVX-NEXT: vmovapd 1360(%rdi), %xmm3
-; AVX-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm2 # 16-byte Folded Reload
-; AVX-NEXT: # xmm2 = xmm3[0],mem[1]
-; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3]
-; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovdqa 1920(%rdi), %xmm14
+; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm0
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm10[3]
+; AVX-NEXT: vmovdqa 1808(%rdi), %xmm6
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm9[4,5,6,7]
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovdqa 1472(%rdi), %xmm5
+; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
+; AVX-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX-NEXT: # ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX-NEXT: vmovaps 1360(%rdi), %xmm10
+; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm1 # 16-byte Folded Reload
+; AVX-NEXT: # xmm1 = xmm10[0,1],mem[2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovdqa 1024(%rdi), %xmm4
-; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm1
-; AVX-NEXT: vblendpd $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
-; AVX-NEXT: # ymm2 = ymm1[0,1,2],mem[3]
-; AVX-NEXT: vmovapd 912(%rdi), %xmm1
-; AVX-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm6 # 16-byte Folded Reload
-; AVX-NEXT: # xmm6 = xmm1[0],mem[1]
-; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3]
-; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovdqa 576(%rdi), %xmm9
-; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm2
-; AVX-NEXT: vblendpd $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX-NEXT: # ymm2 = ymm2[0,1,2],mem[3]
-; AVX-NEXT: vmovapd 464(%rdi), %xmm6
-; AVX-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm11 # 16-byte Folded Reload
-; AVX-NEXT: # xmm11 = xmm6[0],mem[1]
-; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3]
-; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovdqa 128(%rdi), %xmm2
-; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm11
-; AVX-NEXT: vblendpd $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
-; AVX-NEXT: # ymm11 = ymm11[0,1,2],mem[3]
-; AVX-NEXT: vmovapd 16(%rdi), %xmm12
-; AVX-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm13 # 16-byte Folded Reload
-; AVX-NEXT: # xmm13 = xmm12[0],mem[1]
-; AVX-NEXT: vblendpd {{.*#+}} ymm11 = ymm13[0,1],ymm11[2,3]
-; AVX-NEXT: vmovupd %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 80(%rdi), %xmm13
-; AVX-NEXT: vshufpd {{.*#+}} ymm11 = ymm12[1],ymm13[0],ymm12[2],ymm13[3]
-; AVX-NEXT: vmovdqa 192(%rdi), %xmm12
-; AVX-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7]
-; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
-; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3]
-; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 304(%rdi), %xmm2
-; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
-; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm0[1],ymm2[0],ymm0[2],ymm2[3]
-; AVX-NEXT: vmovdqa 416(%rdi), %xmm0
-; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload
-; AVX-NEXT: # xmm11 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
-; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11
-; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm11[2,3]
-; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 528(%rdi), %xmm11
-; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[1],ymm11[0],ymm6[2],ymm11[3]
-; AVX-NEXT: vmovdqa 640(%rdi), %xmm2
-; AVX-NEXT: vpalignr {{.*#+}} xmm9 = xmm9[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
-; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9
-; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm9[2,3]
-; AVX-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 752(%rdi), %xmm9
-; AVX-NEXT: vmovupd %ymm9, (%rsp) # 32-byte Spill
-; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[1],ymm9[0],ymm6[2],ymm9[3]
-; AVX-NEXT: vmovdqa 864(%rdi), %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX-NEXT: # ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX-NEXT: vmovaps 912(%rdi), %xmm11
+; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload
+; AVX-NEXT: # xmm1 = xmm11[0,1],mem[2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovdqa 576(%rdi), %xmm2
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX-NEXT: # ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
+; AVX-NEXT: vmovaps 464(%rdi), %xmm3
+; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm1 # 16-byte Folded Reload
+; AVX-NEXT: # xmm1 = xmm3[0,1],mem[2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovdqa 128(%rdi), %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
+; AVX-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX-NEXT: # ymm1 = ymm1[0,1,2,3,4,5],mem[6,7]
+; AVX-NEXT: vmovaps 16(%rdi), %xmm12
+; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm7 # 16-byte Folded Reload
+; AVX-NEXT: # xmm7 = xmm12[0,1],mem[2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7]
+; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovdqa 192(%rdi), %xmm7
+; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX-NEXT: vmovdqa 80(%rdi), %xmm13
+; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm12[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovdqa 416(%rdi), %xmm9
+; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm1 # 16-byte Folded Reload
+; AVX-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; AVX-NEXT: vmovdqa 304(%rdi), %xmm12
+; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm8 # 16-byte Folded Reload
+; AVX-NEXT: # xmm8 = mem[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7]
+; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovdqa 640(%rdi), %xmm1
+; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
+; AVX-NEXT: vmovdqa 528(%rdi), %xmm0
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload
-; AVX-NEXT: # xmm9 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
-; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9
-; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm9[2,3]
-; AVX-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 976(%rdi), %xmm9
-; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm1[1],ymm9[0],ymm1[2],ymm9[3]
-; AVX-NEXT: vmovdqa 1088(%rdi), %xmm1
-; AVX-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovdqa 864(%rdi), %xmm2
+; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload
+; AVX-NEXT: # xmm3 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
+; AVX-NEXT: vmovdqa 752(%rdi), %xmm0
+; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload
+; AVX-NEXT: # xmm8 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6,7]
+; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovdqa 1088(%rdi), %xmm3
+; AVX-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
-; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3]
-; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 1200(%rdi), %xmm6
-; AVX-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[1],ymm6[0],ymm4[2],ymm6[3]
-; AVX-NEXT: vmovdqa 1312(%rdi), %xmm0
+; AVX-NEXT: vmovdqa 976(%rdi), %xmm0
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
-; AVX-NEXT: # xmm6 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
-; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
-; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3]
-; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 1424(%rdi), %xmm0
-; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[1],ymm0[0],ymm3[2],ymm0[3]
-; AVX-NEXT: vmovdqa 1536(%rdi), %xmm0
+; AVX-NEXT: vpalignr {{.*#+}} xmm8 = xmm11[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
+; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovdqa 1312(%rdi), %xmm4
+; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm8 # 16-byte Folded Reload
+; AVX-NEXT: # xmm8 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8
+; AVX-NEXT: vmovdqa 1200(%rdi), %xmm0
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpalignr {{.*#+}} xmm4 = xmm7[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
-; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
-; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3]
-; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 1648(%rdi), %xmm7
-; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[1],ymm7[0],ymm3[2],ymm7[3]
-; AVX-NEXT: vmovdqa 1760(%rdi), %xmm0
+; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload
+; AVX-NEXT: # xmm11 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm8[4,5,6,7]
+; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovdqa 1536(%rdi), %xmm0
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
-; AVX-NEXT: # xmm4 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
-; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
-; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3]
-; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 1872(%rdi), %xmm0
-; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm5[1],ymm0[0],ymm5[2],ymm0[3]
-; AVX-NEXT: vmovdqa 1984(%rdi), %xmm0
+; AVX-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
+; AVX-NEXT: vmovdqa 1424(%rdi), %xmm0
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
-; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3]
-; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 2096(%rdi), %xmm3
-; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm3[0],ymm0[2],ymm3[3]
-; AVX-NEXT: vmovdqa 2208(%rdi), %xmm3
-; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
-; AVX-NEXT: # xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
-; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3]
-; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 2320(%rdi), %xmm0
-; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm10[1],ymm0[0],ymm10[2],ymm0[3]
-; AVX-NEXT: vmovdqa 2432(%rdi), %xmm3
-; AVX-NEXT: vpalignr {{.*#+}} xmm4 = xmm14[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
-; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3]
-; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 2544(%rdi), %xmm14
-; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm14[0],ymm0[2],ymm14[3]
-; AVX-NEXT: vmovdqa 2656(%rdi), %xmm4
-; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
-; AVX-NEXT: # xmm4 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
-; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3]
-; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 2768(%rdi), %xmm4
-; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm0[1],ymm4[0],ymm0[2],ymm4[3]
-; AVX-NEXT: vmovdqa 2880(%rdi), %xmm8
+; AVX-NEXT: vpalignr {{.*#+}} xmm8 = xmm10[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7]
+; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovdqa 1760(%rdi), %xmm8
; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm5 # 16-byte Folded Reload
; AVX-NEXT: # xmm5 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7]
; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
-; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3]
-; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 2992(%rdi), %xmm5
-; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[1],ymm5[0],ymm4[2],ymm5[3]
-; AVX-NEXT: vmovdqa 3104(%rdi), %xmm5
-; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
-; AVX-NEXT: # xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
-; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
-; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3]
-; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 3216(%rdi), %xmm5
-; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[1],ymm5[0],ymm4[2],ymm5[3]
+; AVX-NEXT: vmovdqa 1648(%rdi), %xmm0
+; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload
+; AVX-NEXT: # xmm10 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1,2,3],ymm5[4,5,6,7]
+; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovdqa 1984(%rdi), %xmm5
+; AVX-NEXT: vpalignr {{.*#+}} xmm10 = xmm14[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10
+; AVX-NEXT: vmovdqa 1872(%rdi), %xmm14
+; AVX-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm10[4,5,6,7]
+; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovdqa 2208(%rdi), %xmm0
+; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
+; AVX-NEXT: # xmm6 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
+; AVX-NEXT: vmovdqa 2096(%rdi), %xmm0
+; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload
+; AVX-NEXT: # xmm10 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7]
+; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovdqa 2432(%rdi), %xmm11
+; AVX-NEXT: vpalignr {{.*#+}} xmm6 = xmm15[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
+; AVX-NEXT: vmovdqa 2320(%rdi), %xmm15
+; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm10 # 16-byte Folded Reload
+; AVX-NEXT: # xmm10 = mem[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7]
+; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovdqa 2656(%rdi), %xmm0
+; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
+; AVX-NEXT: # xmm6 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
+; AVX-NEXT: vmovdqa 2544(%rdi), %xmm0
+; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload
+; AVX-NEXT: # xmm10 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7]
+; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovdqa 2880(%rdi), %xmm0
+; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
+; AVX-NEXT: # xmm6 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
+; AVX-NEXT: vmovdqa 2768(%rdi), %xmm0
+; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload
+; AVX-NEXT: # xmm10 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7]
+; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovdqa 3104(%rdi), %xmm0
+; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
+; AVX-NEXT: # xmm6 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
+; AVX-NEXT: vmovdqa 2992(%rdi), %xmm0
+; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload
+; AVX-NEXT: # xmm10 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7]
+; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovdqa 3328(%rdi), %xmm0
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload
-; AVX-NEXT: # xmm5 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
-; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
-; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3]
-; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 3440(%rdi), %xmm5
-; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX-NEXT: vshufpd {{.*#+}} ymm5 = ymm4[1],ymm5[0],ymm4[2],ymm5[3]
-; AVX-NEXT: vmovdqa 3552(%rdi), %xmm4
-; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm6 # 16-byte Folded Reload
-; AVX-NEXT: # xmm6 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
+; AVX-NEXT: # xmm6 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
-; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3]
-; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload
-; AVX-NEXT: vmovapd 128(%rdi), %ymm6
+; AVX-NEXT: vmovdqa 3216(%rdi), %xmm0
+; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload
+; AVX-NEXT: # xmm10 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7]
+; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovdqa 3552(%rdi), %xmm0
+; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
+; AVX-NEXT: # xmm6 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
+; AVX-NEXT: vmovdqa 3440(%rdi), %xmm0
+; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload
+; AVX-NEXT: # xmm10 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7]
+; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm6
+; AVX-NEXT: vmovapd 128(%rdi), %ymm10
+; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm10[0,1,2],ymm6[3]
+; AVX-NEXT: vpblendw {{.*#+}} xmm7 = mem[0,1,2,3],xmm13[4,5,6,7]
+; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3]
; AVX-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3]
-; AVX-NEXT: vblendpd {{.*#+}} xmm6 = mem[0],xmm13[1]
-; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3]
-; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload
-; AVX-NEXT: vmovapd 352(%rdi), %ymm13
-; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm13[0,1,2],ymm5[3]
-; AVX-NEXT: vmovapd 256(%rdi), %xmm6
-; AVX-NEXT: vmovapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
-; AVX-NEXT: # xmm6 = xmm6[0],mem[1]
-; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3]
-; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
+; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0
+; AVX-NEXT: vmovapd 352(%rdi), %ymm7
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3]
+; AVX-NEXT: vmovdqa 256(%rdi), %xmm9
+; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm9[0,1,2,3],xmm12[4,5,6,7]
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3]
+; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT: vmovapd 576(%rdi), %ymm6
-; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3]
-; AVX-NEXT: vmovapd 480(%rdi), %xmm0
-; AVX-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendpd {{.*#+}} xmm5 = xmm0[0],xmm11[1]
-; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3]
-; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 16-byte Folded Reload
-; AVX-NEXT: vmovapd 800(%rdi), %ymm11
-; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm11[0,1,2],ymm2[3]
-; AVX-NEXT: vmovapd 704(%rdi), %xmm5
-; AVX-NEXT: vmovapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendpd $2, (%rsp), %xmm5, %xmm5 # 16-byte Folded Reload
-; AVX-NEXT: # xmm5 = xmm5[0],mem[1]
-; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3]
-; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX-NEXT: vmovapd 1024(%rdi), %ymm5
-; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3]
-; AVX-NEXT: vmovapd 928(%rdi), %xmm0
-; AVX-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendpd {{.*#+}} xmm2 = xmm0[0],xmm9[1]
-; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3]
-; AVX-NEXT: vmovupd %ymm1, (%rsp) # 32-byte Spill
-; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload
-; AVX-NEXT: vmovaps 1248(%rdi), %ymm0
-; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX-NEXT: vmovaps 1152(%rdi), %xmm0
-; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
-; AVX-NEXT: # xmm2 = xmm0[0,1],mem[2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload
-; AVX-NEXT: vmovaps 1472(%rdi), %ymm9
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7]
-; AVX-NEXT: vmovaps 1376(%rdi), %xmm15
-; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm2 # 16-byte Folded Reload
-; AVX-NEXT: # xmm2 = xmm15[0,1],mem[2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload
-; AVX-NEXT: vmovapd 1696(%rdi), %ymm0
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3]
+; AVX-NEXT: vmovapd 480(%rdi), %xmm1
+; AVX-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
+; AVX-NEXT: # xmm1 = xmm1[0],mem[1]
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3]
-; AVX-NEXT: vmovapd 1600(%rdi), %xmm0
-; AVX-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendpd {{.*#+}} xmm2 = xmm0[0],xmm7[1]
-; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3]
-; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload
-; AVX-NEXT: vmovaps 1920(%rdi), %ymm7
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7]
-; AVX-NEXT: vmovaps 1824(%rdi), %xmm12
-; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm2 # 16-byte Folded Reload
-; AVX-NEXT: # xmm2 = xmm12[0,1],mem[2,3]
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX-NEXT: vmovapd 800(%rdi), %ymm12
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm12[0,1,2],ymm0[3]
+; AVX-NEXT: vmovapd 704(%rdi), %xmm13
+; AVX-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm1 # 16-byte Folded Reload
+; AVX-NEXT: # xmm1 = xmm13[0],mem[1]
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX-NEXT: vmovapd 1024(%rdi), %ymm2
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3]
+; AVX-NEXT: vmovapd 928(%rdi), %xmm1
+; AVX-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
+; AVX-NEXT: # xmm1 = xmm1[0],mem[1]
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX-NEXT: vmovaps 1248(%rdi), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload
-; AVX-NEXT: vmovapd 2144(%rdi), %ymm10
-; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm10[0,1,2],ymm1[3]
-; AVX-NEXT: vmovapd 2048(%rdi), %xmm0
-; AVX-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
-; AVX-NEXT: # xmm2 = xmm0[0],mem[1]
-; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3]
-; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1
-; AVX-NEXT: vmovapd 2368(%rdi), %ymm3
-; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3]
-; AVX-NEXT: vmovapd 2272(%rdi), %xmm0
-; AVX-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
-; AVX-NEXT: # xmm2 = xmm0[0],mem[1]
-; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3]
-; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload
-; AVX-NEXT: vmovapd 2592(%rdi), %ymm0
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX-NEXT: vmovaps 1152(%rdi), %xmm1
+; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
+; AVX-NEXT: # xmm1 = xmm1[0,1],mem[2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX-NEXT: vmovapd 1472(%rdi), %ymm3
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3]
+; AVX-NEXT: vmovapd 1376(%rdi), %xmm1
+; AVX-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
+; AVX-NEXT: # xmm1 = xmm1[0],mem[1]
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3]
-; AVX-NEXT: vmovapd 2496(%rdi), %xmm0
-; AVX-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vblendpd {{.*#+}} xmm2 = xmm0[0],xmm14[1]
-; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3]
-; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0
-; AVX-NEXT: vmovapd 2816(%rdi), %ymm2
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3]
-; AVX-NEXT: vmovapd 2720(%rdi), %xmm1
+; AVX-NEXT: vmovapd 1696(%rdi), %ymm8
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3]
+; AVX-NEXT: vmovapd 1600(%rdi), %xmm1
; AVX-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX-NEXT: # xmm1 = xmm1[0],mem[1]
; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
+; AVX-NEXT: vmovapd 1920(%rdi), %ymm4
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3]
+; AVX-NEXT: vmovdqa 1824(%rdi), %xmm1
+; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm14[4,5,6,7]
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; AVX-NEXT: vmovaps 3040(%rdi), %ymm1
+; AVX-NEXT: vmovaps 2144(%rdi), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX-NEXT: vmovaps 2944(%rdi), %xmm1
+; AVX-NEXT: vmovaps 2048(%rdi), %xmm1
+; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
+; AVX-NEXT: # xmm1 = xmm1[0,1],mem[2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm0
+; AVX-NEXT: vmovapd 2368(%rdi), %ymm5
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3]
+; AVX-NEXT: vmovdqa 2272(%rdi), %xmm1
+; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm15[4,5,6,7]
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
+; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX-NEXT: vmovaps 2592(%rdi), %ymm1
+; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX-NEXT: vmovaps 2496(%rdi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX-NEXT: # xmm1 = xmm1[0,1],mem[2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
-; AVX-NEXT: vmovapd 3264(%rdi), %ymm14
+; AVX-NEXT: vmovapd 2816(%rdi), %ymm14
; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3]
-; AVX-NEXT: vmovapd 3168(%rdi), %xmm1
+; AVX-NEXT: vmovapd 2720(%rdi), %xmm1
; AVX-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX-NEXT: # xmm1 = xmm1[0],mem[1]
; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX-NEXT: vmovaps 3040(%rdi), %ymm1
+; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX-NEXT: vmovaps 2944(%rdi), %xmm1
+; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
+; AVX-NEXT: # xmm1 = xmm1[0,1],mem[2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX-NEXT: vmovaps 3264(%rdi), %ymm1
+; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX-NEXT: vmovaps 3168(%rdi), %xmm1
+; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
+; AVX-NEXT: # xmm1 = xmm1[0,1],mem[2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
; AVX-NEXT: vmovaps 3488(%rdi), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
@@ -13483,21 +13510,19 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: # xmm1 = xmm1[0,1],mem[2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 192(%rdi), %ymm1
-; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[3],ymm1[2]
+; AVX-NEXT: vmovapd 192(%rdi), %ymm0
+; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm10[0],ymm0[1],ymm10[3],ymm0[2]
; AVX-NEXT: vmovdqa 96(%rdi), %xmm1
; AVX-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovapd 416(%rdi), %ymm0
; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm13[0],ymm0[1],ymm13[3],ymm0[2]
+; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm7[0],ymm0[1],ymm7[3],ymm0[2]
; AVX-NEXT: vmovdqa 320(%rdi), %xmm1
-; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; AVX-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill
+; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm9[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovdqa 544(%rdi), %xmm0
@@ -13510,11 +13535,10 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovapd 864(%rdi), %ymm0
; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[3],ymm0[2]
+; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm12[0],ymm0[1],ymm12[3],ymm0[2]
; AVX-NEXT: vmovdqa 768(%rdi), %xmm1
; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; AVX-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm13[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovdqa 992(%rdi), %xmm0
@@ -13522,7 +13546,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
; AVX-NEXT: vmovapd 1088(%rdi), %ymm1
; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[3],ymm1[2]
+; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[3],ymm1[2]
; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovapd 1312(%rdi), %ymm1
@@ -13536,28 +13560,30 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovdqa 1440(%rdi), %xmm0
-; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm15[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
; AVX-NEXT: vmovapd 1536(%rdi), %ymm15
-; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm9[0],ymm15[1],ymm9[3],ymm15[2]
+; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm3[0],ymm15[1],ymm3[3],ymm15[2]
; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 1760(%rdi), %ymm1
-; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[3],ymm1[2]
+; AVX-NEXT: vmovapd 1760(%rdi), %ymm0
+; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm8[0],ymm0[1],ymm8[3],ymm0[2]
; AVX-NEXT: vmovdqa 1664(%rdi), %xmm13
; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm1 # 16-byte Folded Reload
; AVX-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7]
; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovdqa 1888(%rdi), %xmm0
-; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm12[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
; AVX-NEXT: vmovapd 1984(%rdi), %ymm11
-; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm7[0],ymm11[1],ymm7[3],ymm11[2]
+; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm4[0],ymm11[1],ymm4[3],ymm11[2]
; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovapd 2208(%rdi), %ymm12
-; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm10[0],ymm12[1],ymm10[3],ymm12[2]
+; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[3],ymm12[2]
; AVX-NEXT: vmovdqa 2112(%rdi), %xmm10
; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm1 # 16-byte Folded Reload
; AVX-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7]
@@ -13567,7 +13593,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
; AVX-NEXT: vmovapd 2432(%rdi), %ymm9
-; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm3[0],ymm9[1],ymm3[3],ymm9[2]
+; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm5[0],ymm9[1],ymm5[3],ymm9[2]
; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovapd 2656(%rdi), %ymm8
@@ -13582,7 +13608,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
; AVX-NEXT: vmovapd 2880(%rdi), %ymm5
-; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm2[0],ymm5[1],ymm2[3],ymm5[2]
+; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm14[0],ymm5[1],ymm14[3],ymm5[2]
; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovapd 3104(%rdi), %ymm6
@@ -13597,7 +13623,8 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
; AVX-NEXT: vmovapd 3328(%rdi), %ymm3
-; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm14[0],ymm3[1],ymm14[3],ymm3[2]
+; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm1[0],ymm3[1],ymm1[3],ymm3[2]
; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3]
; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovapd 3552(%rdi), %ymm2
@@ -13618,7 +13645,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7]
-; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; AVX-NEXT: vmovaps (%rsp), %xmm14 # 16-byte Reload
; AVX-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
; AVX-NEXT: # xmm14 = mem[0,1],xmm14[2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
@@ -13854,7 +13881,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovaps %ymm1, 192(%r9)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm1, 160(%r9)
-; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm1, 128(%r9)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm1, 96(%r9)
@@ -13920,7 +13947,7 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX-NEXT: vmovaps %ymm0, 32(%rax)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm0, (%rax)
-; AVX-NEXT: addq $4232, %rsp # imm = 0x1088
+; AVX-NEXT: addq $4248, %rsp # imm = 0x1098
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
@@ -18242,950 +18269,938 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
;
; AVX512-FCP-LABEL: load_i64_stride7_vf64:
; AVX512-FCP: # %bb.0:
-; AVX512-FCP-NEXT: subq $7624, %rsp # imm = 0x1DC8
-; AVX512-FCP-NEXT: vmovdqa64 3328(%rdi), %zmm16
-; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 3264(%rdi), %zmm8
-; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 3008(%rdi), %zmm19
-; AVX512-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm20
-; AVX512-FCP-NEXT: vmovdqa64 2880(%rdi), %zmm2
+; AVX512-FCP-NEXT: subq $6664, %rsp # imm = 0x1A08
+; AVX512-FCP-NEXT: vmovdqa64 3072(%rdi), %zmm2
; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 2816(%rdi), %zmm1
-; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm18
-; AVX512-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa64 3008(%rdi), %zmm14
+; AVX512-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm15
+; AVX512-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm26
; AVX512-FCP-NEXT: vmovdqa64 2688(%rdi), %zmm7
; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm17
-; AVX512-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm9
-; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm11
-; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm3
-; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm12
-; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm13
-; AVX512-FCP-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm14
-; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm4
-; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm10
-; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm6
+; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm28
+; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm16
+; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm6
; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm15
-; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5
-; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
-; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
-; AVX512-FCP-NEXT: vmovdqa 2704(%rdi), %xmm2
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
-; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm0, %zmm6
-; AVX512-FCP-NEXT: vmovdqa 464(%rdi), %xmm2
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm6, %zmm1
-; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1
-; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm1
-; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm2
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
-; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm1
-; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm1
-; AVX512-FCP-NEXT: vmovdqa 1360(%rdi), %xmm2
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
-; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm1
-; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm1
-; AVX512-FCP-NEXT: vmovdqa 912(%rdi), %xmm2
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
-; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm1
-; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm1
-; AVX512-FCP-NEXT: vmovdqa 2256(%rdi), %xmm2
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
-; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1
-; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm1
-; AVX512-FCP-NEXT: vmovdqa 1808(%rdi), %xmm2
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
+; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,7,14,0,0,7,14,0]
+; AVX512-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [2,5]
+; AVX512-FCP-NEXT: vmovdqa 2752(%rdi), %ymm0
+; AVX512-FCP-NEXT: vmovdqa 2688(%rdi), %ymm1
+; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm4 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpermt2q %ymm0, %ymm3, %ymm1
+; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vmovdqa 512(%rdi), %ymm0
+; AVX512-FCP-NEXT: vmovdqa 448(%rdi), %ymm1
+; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm4 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpermt2q %ymm0, %ymm3, %ymm1
+; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm1
+; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm4 = ymm0[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm0[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpermt2q %ymm1, %ymm3, %ymm0
+; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vmovdqa 1408(%rdi), %ymm0
+; AVX512-FCP-NEXT: vmovdqa 1344(%rdi), %ymm1
+; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm4 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpermt2q %ymm0, %ymm3, %ymm1
+; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vmovdqa 960(%rdi), %ymm0
+; AVX512-FCP-NEXT: vmovdqa 896(%rdi), %ymm1
+; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm4 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpermt2q %ymm0, %ymm3, %ymm1
+; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vmovdqa 2304(%rdi), %ymm0
+; AVX512-FCP-NEXT: vmovdqa 2240(%rdi), %ymm1
+; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm4 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpermt2q %ymm0, %ymm3, %ymm1
+; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vmovdqa 1856(%rdi), %ymm0
+; AVX512-FCP-NEXT: vmovdqa 1792(%rdi), %ymm1
+; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm4 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpermt2q %ymm0, %ymm3, %ymm1
+; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vmovdqa 3200(%rdi), %ymm0
+; AVX512-FCP-NEXT: vmovdqa 3136(%rdi), %ymm1
+; AVX512-FCP-NEXT: vpermi2q %ymm0, %ymm1, %ymm3
+; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm0
+; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm25, %zmm0
+; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [4,5,6,13,4,5,6,13]
+; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm0
+; AVX512-FCP-NEXT: vinserti128 $1, 2880(%rdi), %ymm0, %ymm1
+; AVX512-FCP-NEXT: vmovdqa 2816(%rdi), %ymm2
+; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm13 = [4,11]
+; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm4
+; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm13, %zmm4
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
+; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0
+; AVX512-FCP-NEXT: vmovdqa 576(%rdi), %ymm1
+; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm4
+; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm13, %zmm4
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm17
+; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm4
+; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm25, %zmm4
+; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm1
; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermi2q %zmm16, %zmm8, %zmm0
-; AVX512-FCP-NEXT: vmovdqa 3152(%rdi), %xmm1
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
-; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm4
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vinserti128 $1, 2880(%rdi), %ymm0, %ymm1
-; AVX512-FCP-NEXT: vmovdqa 2816(%rdi), %ymm0
+; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm1
+; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm20
+; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm11
+; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm6
+; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm13, %zmm6
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm24
+; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm30
+; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm6
+; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm25, %zmm6
+; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm0
+; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm6
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm6
+; AVX512-FCP-NEXT: vmovdqa 1472(%rdi), %ymm0
; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11]
-; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm9, %zmm7
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,7,14,0,0,7,14,0]
-; AVX512-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm12
-; AVX512-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm2
-; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm11, %zmm2
-; AVX512-FCP-NEXT: vmovdqa64 3072(%rdi), %zmm3
-; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13]
-; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm2
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
-; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm2
-; AVX512-FCP-NEXT: vmovdqa 576(%rdi), %ymm0
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm6[6,7]
+; AVX512-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm12
+; AVX512-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm7
+; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm13, %zmm7
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm29
+; AVX512-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm22
+; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm7
+; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm25, %zmm7
+; AVX512-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm0
+; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm7
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm6
+; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm6
+; AVX512-FCP-NEXT: vmovdqa 1024(%rdi), %ymm0
; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7]
-; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm16
-; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm3
-; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm9, %zmm3
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm31
-; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm3
-; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm11, %zmm3
-; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm5
-; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm4, %zmm3
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm3
-; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm2
-; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm3[6,7]
-; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm23
-; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm2
-; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm5
-; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm9, %zmm5
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm13
-; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm6
-; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm5
-; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm24
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm6[6,7]
+; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm18
+; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm8
+; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm7
+; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm13, %zmm7
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm27
+; AVX512-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm23
+; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm7
+; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm25, %zmm7
+; AVX512-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm0
+; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm7
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm6
; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm11, %zmm5
-; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm6
+; AVX512-FCP-NEXT: vinserti128 $1, 2432(%rdi), %ymm0, %ymm6
+; AVX512-FCP-NEXT: vmovdqa 2368(%rdi), %ymm0
+; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm6[6,7]
+; AVX512-FCP-NEXT: vmovdqa64 2304(%rdi), %zmm21
+; AVX512-FCP-NEXT: vmovdqa64 2240(%rdi), %zmm7
+; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm9
+; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm13, %zmm9
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa64 2560(%rdi), %zmm0
+; AVX512-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm9
+; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm25, %zmm9
+; AVX512-FCP-NEXT: vmovdqa64 2624(%rdi), %zmm0
+; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm9
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm9, %zmm6
; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm4, %zmm5
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3
-; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm5
-; AVX512-FCP-NEXT: vmovdqa 1472(%rdi), %ymm3
-; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2,3,4,5],ymm5[6,7]
-; AVX512-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm19
-; AVX512-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm7
-; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm6
-; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm9, %zmm6
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm5[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm17
-; AVX512-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm8
-; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm11, %zmm8
-; AVX512-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm3
+; AVX512-FCP-NEXT: vinserti128 $1, 1984(%rdi), %ymm0, %ymm6
+; AVX512-FCP-NEXT: vmovdqa 1920(%rdi), %ymm0
+; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm6[6,7]
+; AVX512-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm2
+; AVX512-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm3
+; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm8
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm5
-; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm6
-; AVX512-FCP-NEXT: vmovdqa 1024(%rdi), %ymm3
-; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2,3,4,5],ymm6[6,7]
-; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm30
-; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm1
-; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm8
-; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm9, %zmm8
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm6[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm29
-; AVX512-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm10
+; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm10
+; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa64 2112(%rdi), %zmm31
+; AVX512-FCP-NEXT: vmovdqa64 2048(%rdi), %zmm10
; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm11, %zmm10
-; AVX512-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm3
-; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm10
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm5
-; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vinserti128 $1, 2432(%rdi), %ymm0, %ymm8
-; AVX512-FCP-NEXT: vmovdqa 2368(%rdi), %ymm3
-; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1,2,3,4,5],ymm8[6,7]
-; AVX512-FCP-NEXT: vmovdqa64 2304(%rdi), %zmm18
-; AVX512-FCP-NEXT: vmovdqa64 2240(%rdi), %zmm0
-; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm10
+; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm25, %zmm10
+; AVX512-FCP-NEXT: vmovdqa64 2176(%rdi), %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm9, %zmm10
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm8[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 2560(%rdi), %zmm20
-; AVX512-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm14
-; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm11, %zmm14
-; AVX512-FCP-NEXT: vmovdqa64 2624(%rdi), %zmm21
-; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm4, %zmm14
-; AVX512-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm14, %zmm5
-; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vinserti128 $1, 1984(%rdi), %ymm0, %ymm10
-; AVX512-FCP-NEXT: vmovdqa 1920(%rdi), %ymm3
-; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2,3,4,5],ymm10[6,7]
-; AVX512-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm22
-; AVX512-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm5
-; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm15
-; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm6
-; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm9, %zmm15
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 2112(%rdi), %zmm26
-; AVX512-FCP-NEXT: vmovdqa64 2048(%rdi), %zmm15
-; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm15
-; AVX512-FCP-NEXT: vmovdqa64 2176(%rdi), %zmm28
-; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm4, %zmm15
-; AVX512-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm5
-; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vinserti128 $1, 3328(%rdi), %ymm0, %ymm10
-; AVX512-FCP-NEXT: vmovdqa 3264(%rdi), %ymm3
-; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2,3,4,5],ymm10[6,7]
-; AVX512-FCP-NEXT: vmovdqa64 3200(%rdi), %zmm25
-; AVX512-FCP-NEXT: vmovdqa64 3136(%rdi), %zmm14
-; AVX512-FCP-NEXT: vpermi2q %zmm25, %zmm14, %zmm9
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm9[0,1,2,3],ymm10[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm5
-; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm11, %zmm5
-; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm10
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm6
+; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vinserti128 $1, 3328(%rdi), %ymm0, %ymm6
+; AVX512-FCP-NEXT: vmovdqa 3264(%rdi), %ymm0
+; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm6[6,7]
+; AVX512-FCP-NEXT: vmovdqa64 3200(%rdi), %zmm10
+; AVX512-FCP-NEXT: vmovdqa64 3136(%rdi), %zmm1
+; AVX512-FCP-NEXT: vpermi2q %zmm10, %zmm1, %zmm13
+; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm4
+; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm0
+; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm25, %zmm0
+; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm11
+; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm25, %zmm11
+; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm19
+; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm25, %zmm8
+; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm5
-; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm27
-; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm11, %zmm5
-; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm11, %zmm1
+; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm0
+; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm25, %zmm0
+; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm9
+; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm25, %zmm3
+; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm0
+; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm25, %zmm0
+; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm0
+; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm25, %zmm0
+; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm25, %zmm1
; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm3
-; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm5
-; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm11, %zmm5
-; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm5
-; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm11, %zmm5
-; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm11, %zmm0
+; AVX512-FCP-NEXT: vmovdqa64 3456(%rdi), %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm9
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm11, %zmm9
-; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm9
-; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm11, %zmm9
-; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 3456(%rdi), %zmm9
-; AVX512-FCP-NEXT: vmovdqa64 3392(%rdi), %zmm0
+; AVX512-FCP-NEXT: vmovdqa64 3392(%rdi), %zmm1
+; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm25
+; AVX512-FCP-NEXT: vmovdqa64 3520(%rdi), %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermi2q %zmm9, %zmm0, %zmm11
-; AVX512-FCP-NEXT: vmovdqa64 3520(%rdi), %zmm6
-; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm4, %zmm11
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm4
-; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa 2880(%rdi), %ymm4
-; AVX512-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm11 = [5,12]
-; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm4
-; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm11, %zmm4
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [9,0,7,0,9,0,7,0]
-; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm16
-; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14]
-; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm16 # 64-byte Folded Reload
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm16, %zmm5
-; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa 640(%rdi), %ymm10
-; AVX512-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
-; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm10
-; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm6
-; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm11, %zmm10
-; AVX512-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm10
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm10
-; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm10 # 64-byte Folded Reload
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm1
+; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm25
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm25, %zmm3
+; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa 2880(%rdi), %ymm3
+; AVX512-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23]
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm6 = [5,12]
+; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm3
+; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm6, %zmm3
+; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm25
+; AVX512-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [9,0,7,0,9,0,7,0]
+; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm12
+; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm2, %zmm12
+; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14]
+; AVX512-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm12 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm12, %zmm11
+; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa 640(%rdi), %ymm11
+; AVX512-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
+; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm11
+; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm6, %zmm11
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm26
+; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm11
+; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm2, %zmm11
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm3, %zmm11
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1
; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm1
; AVX512-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX512-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
-; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm5
-; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm11, %zmm2
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm2
-; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm2
-; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm5
+; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm12
+; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm6, %zmm5
+; AVX512-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm5
+; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm2, %zmm5
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm3, %zmm5
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm1
; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa 1536(%rdi), %ymm1
; AVX512-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX512-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
-; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm8
-; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm2
-; AVX512-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm11, %zmm2
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm2
-; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm4
+; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm11
+; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm8
+; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm6, %zmm4
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm4
+; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm2, %zmm4
+; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa 1088(%rdi), %ymm1
; AVX512-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX512-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm2
-; AVX512-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm11, %zmm2
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm2
-; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
-; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm4
+; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm5
+; AVX512-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm6, %zmm4
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm4
+; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm2, %zmm4
+; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm0
+; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa 2432(%rdi), %ymm1
; AVX512-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX512-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm2
-; AVX512-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm11, %zmm2
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm2
-; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm4, %zmm2
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
-; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm4
+; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm15
+; AVX512-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm6, %zmm4
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm21 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm4
+; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm0
+; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa 1984(%rdi), %ymm1
; AVX512-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX512-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm2
-; AVX512-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm11, %zmm2
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm2
-; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm4, %zmm2
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
-; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm4
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm6, %zmm4
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm4
+; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm0
+; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa 3328(%rdi), %ymm1
; AVX512-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX512-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
-; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermi2q %zmm25, %zmm14, %zmm11
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm7
-; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm2
-; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm2
-; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm2
-; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm2
-; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm2
-; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm2
-; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm2
-; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm0, %zmm2
-; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm18
-; AVX512-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm22
-; AVX512-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm25
-; AVX512-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm25
-; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm9, %zmm0
-; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
-; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [10,3,10,3,10,3,10,3]
-; AVX512-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm2
-; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm12
-; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm23, %zmm12
-; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [11,4,11,4,11,4,11,4]
-; AVX512-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm0
-; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm24, %zmm0
-; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [12,5,12,5,12,5,12,5]
-; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm0
-; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm8, %zmm0
+; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT: vpermi2q %zmm10, %zmm0, %zmm6
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm4
+; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm6
+; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm6
+; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm2, %zmm20
+; AVX512-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm6
+; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm6
+; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm2, %zmm5
+; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm6
+; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm5
+; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm2, %zmm5
+; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm5
+; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm5
+; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm10
+; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT: vpermi2q %zmm15, %zmm0, %zmm2
+; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm2 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [13,6,13,6,13,6,13,6]
-; AVX512-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0
-; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm16, %zmm0
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [6,13]
+; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm13
+; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm9
+; AVX512-FCP-NEXT: movb $24, %al
+; AVX512-FCP-NEXT: kmovw %eax, %k1
+; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm4
+; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm1
+; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm1[4,5,4,5],zmm4[4,5,4,5]
+; AVX512-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [7,0,9,0,7,0,9,0]
+; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm0
+; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9]
-; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm1, %zmm2
-; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm0
-; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm0
-; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm10
-; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm0
-; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm24, %zmm0
+; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm8
+; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,0,4,11]
+; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm1
+; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm14
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [10,3,10,3,10,3,10,3]
+; AVX512-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm0
+; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm2
+; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm19, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm0
-; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm8, %zmm0
+; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [11,4,11,4,11,4,11,4]
+; AVX512-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm0
+; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm20, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm0
-; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm16, %zmm0
+; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [12,5,12,5,12,5,12,5]
+; AVX512-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm0
+; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm26, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm1, %zmm28
-; AVX512-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm0
-; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm23, %zmm0
-; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm14
-; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm0
-; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm24, %zmm0
-; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm18
-; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm0
-; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm8, %zmm0
+; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [13,6,13,6,13,6,13,6]
+; AVX512-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm0
+; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm25, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm0
-; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm16, %zmm0
+; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [2,9,2,9,2,9,2,9]
+; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm17
+; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,5,8,15,4,5,8,15]
+; AVX512-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm12, %zmm17
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm17, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm1, %zmm22
-; AVX512-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm9
-; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm23, %zmm9
; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm0
-; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm24, %zmm0
-; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm19
+; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm19, %zmm0
+; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm0
-; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm8, %zmm0
+; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm20, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm0
-; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm16, %zmm0
+; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm26, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm30
-; AVX512-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm7
-; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm23, %zmm7
-; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm3
-; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm24, %zmm3
-; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm21
-; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm3
-; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm8, %zmm3
-; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm3
-; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm16, %zmm3
-; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm0
+; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm0
+; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm25, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm30
-; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm23, %zmm30
-; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm0
-; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm24, %zmm0
-; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm22
-; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm5
-; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm8, %zmm5
-; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm0
-; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm16, %zmm0
-; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm29
-; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm1, %zmm11
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm28
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm23, %zmm28
-; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm31
-; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm3
-; AVX512-FCP-NEXT: vpermi2q %zmm25, %zmm6, %zmm23
+; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm7, %zmm30
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm2 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm13
+; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5
+; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[4,5,4,5],zmm5[4,5,4,5]
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm27
-; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm24, %zmm27
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm16, %zmm0
+; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm0
+; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm16, %zmm0
+; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm9
+; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm14, %zmm1
+; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm12, %zmm30
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm16, %zmm0
+; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm0
+; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm19, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm16, %zmm0
+; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm0
+; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm20, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm0
+; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm26, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm6 # 64-byte Folded Reload
-; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm0
+; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm25, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermi2q %zmm25, %zmm31, %zmm24
-; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm25
-; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm2
-; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm8, %zmm25
-; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm31, %zmm2
-; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm0
-; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm16, %zmm0
-; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm31, %zmm16
-; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm31
-; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm3
-; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm15
-; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm31
-; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,9,0,5,6,9]
-; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm10
+; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm7, %zmm22
+; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm2
+; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm2 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm3
+; AVX512-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm0
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[4,5,4,5],zmm3[4,5,4,5]
+; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1
+; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm1
+; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm4
; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm12
-; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm9
-; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm14
-; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm30
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm7
-; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm28
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm23
-; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,10,0,5,6,10]
-; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm27
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm12
-; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm12
-; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm18
-; AVX512-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm19
-; AVX512-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm21
-; AVX512-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm22
-; AVX512-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm24
-; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,11,0,5,6,11]
-; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm25
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm12
-; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm12
-; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm12
-; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm12
-; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm12
-; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm5
-; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm2
-; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,12,0,5,6,12]
-; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm0
+; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm11
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm12
-; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm12
-; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm12
-; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm12
-; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm12
-; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm29
-; AVX512-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm16
-; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15]
-; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm18
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm19
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm21
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm22
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm2
-; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm11
-; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm3
+; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm14, %zmm0
+; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm31
-; AVX512-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: movb $24, %al
-; AVX512-FCP-NEXT: kmovw %eax, %k2
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k2} = zmm0[4,5,4,5],zmm13[4,5,4,5]
-; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [7,0,9,0,7,0,9,0]
-; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm20
-; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm20
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,0,4,11]
-; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm3, %zmm0
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm12, %zmm22
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm0[4,5,4,5],zmm17[4,5,4,5]
-; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm29
-; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm29
-; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm3, %zmm0
+; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm3
+; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm19, %zmm3
+; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm0
+; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm20, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm0[4,5,4,5],zmm26[4,5,4,5]
-; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm17
-; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm17
-; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm3, %zmm0
-; AVX512-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm2[4,5,4,5],zmm4[4,5,4,5]
-; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm0
-; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm15, %zmm0
+; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm0
+; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm26, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm3, %zmm2
-; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm0[4,5,4,5],zmm1[4,5,4,5]
-; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2
-; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm2
-; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm0
+; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm0
+; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm25, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm7, %zmm23
+; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm2
+; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm2 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm17
+; AVX512-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm0
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm0[4,5,4,5],zmm1[4,5,4,5]
-; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2
-; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm2
-; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm0
+; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[4,5,4,5],zmm17[4,5,4,5]
+; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm1
+; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm1
+; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm6
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm31 {%k2} = zmm0[4,5,4,5],zmm2[4,5,4,5]
+; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm14, %zmm0
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm26
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm15, %zmm26
-; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm15
-; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm0
+; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm23
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm0
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[4,5,4,5],zmm0[4,5,4,5]
-; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm1
-; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [6,13]
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 64-byte Folded Reload
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload
-; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload
-; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm0
+; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm28
+; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm19, %zmm28
+; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0
+; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm20, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0
+; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0
+; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm26, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpblendd $240, (%rsp), %ymm3, %ymm0 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm0 = ymm3[0,1,2,3],mem[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21
-; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm0 = ymm5[0,1,2,3],mem[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0
-; AVX512-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512-FCP-NEXT: movb $-32, %al
-; AVX512-FCP-NEXT: kmovw %eax, %k1
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1}
-; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1}
-; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1}
+; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0
+; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm25, %zmm0
+; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm7, %zmm2
+; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm2
+; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm19, %zmm2
+; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm1
+; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm20, %zmm1
+; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm1
+; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm26, %zmm1
+; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm1
+; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm25, %zmm1
+; AVX512-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
+; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm7, %zmm23
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm30
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm19, %zmm30
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm16
+; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm15, %zmm19
+; AVX512-FCP-NEXT: vmovdqa64 2880(%rdi), %zmm15
+; AVX512-FCP-NEXT: vmovdqa64 2816(%rdi), %zmm14
+; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm0
+; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm25, %zmm0
+; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm24
+; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm20, %zmm24
+; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm0
+; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm8 # 64-byte Folded Reload
; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1}
+; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm25, %zmm9
; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm6 {%k1}
+; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm25, %zmm11
+; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm25, %zmm6
; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm22
+; AVX512-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm11
+; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm5
+; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm25, %zmm5
+; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm27
+; AVX512-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm9
+; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm5
+; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm25, %zmm5
+; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa64 3328(%rdi), %zmm8
+; AVX512-FCP-NEXT: vmovdqa64 3264(%rdi), %zmm5
+; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm6
+; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm25, %zmm6
+; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm20
+; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm18
+; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm26, %zmm18
+; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm26
+; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm10
+; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm25, %zmm10
+; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm25
+; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm16
+; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm13
+; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,5,6,9,0,5,6,9]
+; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1}
-; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 {%k1}
-; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm31 {%k1}
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2}
-; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 {%k1}
+; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm7, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2}
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm7, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm7, %zmm3
+; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm1
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2}
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm7, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm2
+; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm7, %zmm28
+; AVX512-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm30
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm19
+; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,5,6,10,0,5,6,10]
+; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm24
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm7, %zmm31
+; AVX512-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm7, %zmm31
+; AVX512-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm31
+; AVX512-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm7, %zmm31
+; AVX512-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm7, %zmm31
+; AVX512-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm31
+; AVX512-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm20
+; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,5,6,11,0,5,6,11]
+; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm18
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm7, %zmm31
+; AVX512-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm7, %zmm31
+; AVX512-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm31
+; AVX512-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm7, %zmm31
+; AVX512-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm7, %zmm31
+; AVX512-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm31
+; AVX512-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm26
+; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,5,6,12,0,5,6,12]
+; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm7, %zmm31
+; AVX512-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm7, %zmm21
+; AVX512-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm6
+; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2}
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
-; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm7, %zmm1
+; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm10
+; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2}
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm7, %zmm1
; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2}
+; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
+; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm1
+; AVX512-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
+; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm25
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm12, %zmm4
+; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm23
+; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm16
+; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm12, %zmm13
+; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [14,0,0,7,14,0,0,7]
+; AVX512-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1}
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm0 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2}
+; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm0 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm0 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2}
-; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm0 {%k1}
-; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm18
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm7 {%k1}
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm12, %zmm0
+; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm11[4,5,4,5],zmm22[4,5,4,5]
+; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm7
+; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm12, %zmm7
+; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm1
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1}
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm1
+; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,4,11]
+; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm1, %zmm11
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k1} = zmm9[4,5,4,5],zmm27[4,5,4,5]
+; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm17
+; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm12, %zmm17
+; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm21
+; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm21
+; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm1, %zmm9
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k1} = zmm5[4,5,4,5],zmm8[4,5,4,5]
+; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm22
+; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm12, %zmm22
+; AVX512-FCP-NEXT: vpermi2q %zmm8, %zmm5, %zmm12
+; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm27
+; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm27
+; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm8, %zmm0
+; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm5
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k1} = zmm14[4,5,4,5],zmm15[4,5,4,5]
+; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm14
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1}
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm2 = [6,13]
+; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm0 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm1 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm11[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm0
+; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm23, %zmm0
+; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm5[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm16, %zmm0
+; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm14[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm0
+; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: movb $-32, %al
+; AVX512-FCP-NEXT: kmovw %eax, %k2
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1}
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2}
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1}
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2}
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1}
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2}
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1}
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1}
-; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm0 = ymm11[0,1,2,3],mem[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa 2752(%rdi), %ymm3
-; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm3 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3
-; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm3, %zmm26, %zmm3
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1}
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512-FCP-NEXT: vmovdqa 512(%rdi), %ymm1
-; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1
-; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm20, %zmm6
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1}
-; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm1 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm1 = ymm13[0,1,2,3],mem[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm4
-; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4
-; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm29, %zmm14
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm14 {%k1}
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
-; AVX512-FCP-NEXT: vmovdqa 1408(%rdi), %ymm4
-; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4
-; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm17, %zmm5
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1}
-; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512-FCP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa 960(%rdi), %ymm8
-; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm8, %zmm9, %zmm8
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1}
-; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512-FCP-NEXT: vinsertf64x4 $0, %ymm4, %zmm9, %zmm4
-; AVX512-FCP-NEXT: vmovdqa 2304(%rdi), %ymm10
-; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm10
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm10, %zmm9, %zmm10
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1}
-; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512-FCP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm11 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm11 = mem[0,1,2,3],ymm9[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa 1856(%rdi), %ymm12
-; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm12
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm12, %zmm9, %zmm12
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm12 {%k1}
-; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512-FCP-NEXT: vinsertf64x4 $0, %ymm11, %zmm9, %zmm11
-; AVX512-FCP-NEXT: vmovdqa 3200(%rdi), %ymm13
-; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm13
-; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm13, %zmm15, %zmm9
-; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm9 {%k1}
-; AVX512-FCP-NEXT: vmovdqa64 %zmm31, 448(%rsi)
-; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovaps %zmm13, 384(%rsi)
-; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovaps %zmm13, 320(%rsi)
-; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovaps %zmm13, 256(%rsi)
-; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovaps %zmm13, 192(%rsi)
-; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovaps %zmm13, 128(%rsi)
-; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovaps %zmm13, 64(%rsi)
-; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovaps %zmm13, (%rsi)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 448(%rdx)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 256(%rdx)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm23, 320(%rdx)
-; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovaps %zmm2, 128(%rdx)
-; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovaps %zmm2, 192(%rdx)
-; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovaps %zmm2, (%rdx)
-; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovaps %zmm2, 64(%rdx)
-; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovaps %zmm2, 384(%rdx)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm25, 448(%rcx)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm28, 256(%rcx)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm27, 320(%rcx)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm30, 128(%rcx)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm24, 192(%rcx)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm19, (%rcx)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm22, 64(%rcx)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 384(%rcx)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 448(%r8)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 256(%r8)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 320(%r8)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 128(%r8)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 192(%r8)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm14, (%r8)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 64(%r8)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 384(%r8)
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 {%k2}
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 {%k2}
+; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm29 {%k2}
+; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm8 {%k2}
+; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm3 {%k2}
+; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm3 {%k2}
+; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k2}
+; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k2}
+; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm24 {%k1}
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm24 {%k2}
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm23 {%k1}
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm23 {%k2}
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k2}
+; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm30
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1}
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 {%k2}
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 {%k2}
+; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm2
+; AVX512-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm4 # 16-byte Folded Reload
+; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm4 {%k2}
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm5 # 16-byte Folded Reload
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 {%k2}
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm8 # 16-byte Folded Reload
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 {%k2}
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm9 # 16-byte Folded Reload
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 {%k2}
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm10 # 16-byte Folded Reload
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 {%k2}
+; AVX512-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm13 # 16-byte Folded Reload
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 {%k2}
+; AVX512-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm14 # 16-byte Folded Reload
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm14 {%k2}
+; AVX512-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 16-byte Folded Reload
+; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm12 {%k2}
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm11
+; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm11, %zmm27, %zmm11
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm11 {%k2}
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm15
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm15, %zmm3, %zmm15
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm15 {%k2}
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX512-FCP-NEXT: vextracti32x4 $1, %ymm3, %xmm16
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm16, %zmm3, %zmm16
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm16 {%k2}
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX512-FCP-NEXT: vextracti32x4 $1, %ymm3, %xmm17
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm17, %zmm3, %zmm17
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm17 {%k2}
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX512-FCP-NEXT: vextracti32x4 $1, %ymm3, %xmm19
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm19, %zmm3, %zmm19
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm19 {%k2}
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX512-FCP-NEXT: vextracti32x4 $1, %ymm3, %xmm20
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm20, %zmm3, %zmm7
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 {%k2}
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX512-FCP-NEXT: vextracti32x4 $1, %ymm3, %xmm20
+; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm20, %zmm21, %zmm3
+; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm18 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm3 {%k2}
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Reload
+; AVX512-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm20
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm20, %zmm18, %zmm18
+; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm18 {%k2}
+; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovaps %zmm20, 448(%rsi)
+; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovaps %zmm20, 384(%rsi)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm29, 320(%rsi)
+; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovaps %zmm20, 256(%rsi)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm28, 192(%rsi)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 128(%rsi)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 64(%rsi)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm31, (%rsi)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 448(%rdx)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 256(%rdx)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm30, 320(%rdx)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm23, 128(%rdx)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm24, 192(%rdx)
+; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovaps %zmm0, (%rdx)
+; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovaps %zmm0, 64(%rdx)
+; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovaps %zmm0, 384(%rdx)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 448(%rcx)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 256(%rcx)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 320(%rcx)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 128(%rcx)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 192(%rcx)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm8, (%rcx)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 64(%rcx)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 384(%rcx)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 448(%r8)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 256(%r8)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 320(%r8)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 128(%r8)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 192(%r8)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm16, (%r8)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm15, 64(%r8)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 384(%r8)
; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512-FCP-NEXT: vmovaps %zmm3, 448(%r9)
; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
@@ -19203,35 +19218,40 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512-FCP-NEXT: vmovaps %zmm3, 384(%r9)
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovaps %zmm2, 448(%rax)
-; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovaps %zmm2, 256(%rax)
-; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovaps %zmm2, 320(%rax)
-; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovaps %zmm2, 128(%rax)
-; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovaps %zmm2, 192(%rax)
-; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovaps %zmm2, (%rax)
-; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovaps %zmm2, 64(%rax)
+; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovaps %zmm0, 448(%rax)
+; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovaps %zmm0, 256(%rax)
+; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovaps %zmm0, 320(%rax)
+; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovaps %zmm0, 128(%rax)
+; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovaps %zmm0, 192(%rax)
+; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovaps %zmm0, (%rax)
+; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovaps %zmm0, 64(%rax)
; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512-FCP-NEXT: vmovaps %zmm3, 384(%rax)
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT: vmovaps %zmm11, 384(%rax)
-; AVX512-FCP-NEXT: vmovaps %zmm4, 448(%rax)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 256(%rax)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 320(%rax)
-; AVX512-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovaps %zmm0, 384(%rax)
+; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovaps %zmm0, 448(%rax)
+; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovaps %zmm0, 256(%rax)
+; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovaps %zmm0, 320(%rax)
+; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovaps %zmm0, 128(%rax)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm21, 192(%rax)
+; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovaps %zmm0, 192(%rax)
; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovaps %zmm0, (%rax)
; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovaps %zmm0, 64(%rax)
-; AVX512-FCP-NEXT: addq $7624, %rsp # imm = 0x1DC8
+; AVX512-FCP-NEXT: addq $6664, %rsp # imm = 0x1A08
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
@@ -20232,950 +20252,938 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
;
; AVX512DQ-FCP-LABEL: load_i64_stride7_vf64:
; AVX512DQ-FCP: # %bb.0:
-; AVX512DQ-FCP-NEXT: subq $7624, %rsp # imm = 0x1DC8
-; AVX512DQ-FCP-NEXT: vmovdqa64 3328(%rdi), %zmm16
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 3264(%rdi), %zmm8
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 3008(%rdi), %zmm19
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm20
-; AVX512DQ-FCP-NEXT: vmovdqa64 2880(%rdi), %zmm2
+; AVX512DQ-FCP-NEXT: subq $6664, %rsp # imm = 0x1A08
+; AVX512DQ-FCP-NEXT: vmovdqa64 3072(%rdi), %zmm2
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 2816(%rdi), %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm18
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa64 3008(%rdi), %zmm14
+; AVX512DQ-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm15
+; AVX512DQ-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm26
; AVX512DQ-FCP-NEXT: vmovdqa64 2688(%rdi), %zmm7
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm17
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm9
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm11
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm3
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm12
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm13
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm14
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm4
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm10
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm6
+; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm28
+; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm16
+; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm6
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm15
+; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,7,14,0,0,7,14,0]
+; AVX512DQ-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [2,5]
+; AVX512DQ-FCP-NEXT: vmovdqa 2752(%rdi), %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa 2688(%rdi), %ymm1
+; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm4 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpermt2q %ymm0, %ymm3, %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa 512(%rdi), %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa 448(%rdi), %ymm1
+; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm4 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpermt2q %ymm0, %ymm3, %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm1
+; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm4 = ymm0[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm0[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpermt2q %ymm1, %ymm3, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa 1408(%rdi), %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa 1344(%rdi), %ymm1
+; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm4 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpermt2q %ymm0, %ymm3, %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa 960(%rdi), %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa 896(%rdi), %ymm1
+; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm4 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpermt2q %ymm0, %ymm3, %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa 2304(%rdi), %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa 2240(%rdi), %ymm1
+; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm4 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpermt2q %ymm0, %ymm3, %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa 1856(%rdi), %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa 1792(%rdi), %ymm1
+; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm4 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpermt2q %ymm0, %ymm3, %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa 3200(%rdi), %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa 3136(%rdi), %ymm1
+; AVX512DQ-FCP-NEXT: vpermi2q %ymm0, %ymm1, %ymm3
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
-; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqa 2704(%rdi), %xmm2
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm0, %zmm6
-; AVX512DQ-FCP-NEXT: vmovdqa 464(%rdi), %xmm2
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm6, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm2
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm1
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqa 1360(%rdi), %xmm2
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm1
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqa 912(%rdi), %xmm2
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm1
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqa 2256(%rdi), %xmm2
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqa 1808(%rdi), %xmm2
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm25, %zmm0
+; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [4,5,6,13,4,5,6,13]
+; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm5, %zmm0
+; AVX512DQ-FCP-NEXT: vinserti128 $1, 2880(%rdi), %ymm0, %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqa 2816(%rdi), %ymm2
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm13 = [4,11]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm4
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm13, %zmm4
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa 576(%rdi), %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm4
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm13, %zmm4
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm17
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm4
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm25, %zmm4
+; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm1
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm16, %zmm8, %zmm0
-; AVX512DQ-FCP-NEXT: vmovdqa 3152(%rdi), %xmm1
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm4
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vinserti128 $1, 2880(%rdi), %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqa 2816(%rdi), %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm20
+; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm11
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm6
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm13, %zmm6
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm24
+; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm30
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm6
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm25, %zmm6
+; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm6
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm6, %zmm4
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm6
+; AVX512DQ-FCP-NEXT: vmovdqa 1472(%rdi), %ymm0
; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm9, %zmm7
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,7,14,0,0,7,14,0]
-; AVX512DQ-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm12
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm2
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm11, %zmm2
-; AVX512DQ-FCP-NEXT: vmovdqa64 3072(%rdi), %zmm3
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13]
-; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm2
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqa 576(%rdi), %ymm0
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm6[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm12
+; AVX512DQ-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm7
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm13, %zmm7
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm29
+; AVX512DQ-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm22
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm7
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm25, %zmm7
+; AVX512DQ-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm7
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm6
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm6
+; AVX512DQ-FCP-NEXT: vmovdqa 1024(%rdi), %ymm0
; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm16
-; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm3
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm9, %zmm3
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm31
-; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm3
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm11, %zmm3
-; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm5
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm4, %zmm3
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm3
-; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm3[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm23
-; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm2
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm5
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm9, %zmm5
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm13
-; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm6
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm5
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm24
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm6[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm18
+; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm8
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm7
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm13, %zmm7
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm27
+; AVX512DQ-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm23
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm7
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm25, %zmm7
+; AVX512DQ-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm7
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm7, %zmm6
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm11, %zmm5
-; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm6
+; AVX512DQ-FCP-NEXT: vinserti128 $1, 2432(%rdi), %ymm0, %ymm6
+; AVX512DQ-FCP-NEXT: vmovdqa 2368(%rdi), %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm6[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 2304(%rdi), %zmm21
+; AVX512DQ-FCP-NEXT: vmovdqa64 2240(%rdi), %zmm7
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm9
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm13, %zmm9
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 2560(%rdi), %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm9
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm25, %zmm9
+; AVX512DQ-FCP-NEXT: vmovdqa64 2624(%rdi), %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm9
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm9, %zmm6
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm4, %zmm5
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm5
-; AVX512DQ-FCP-NEXT: vmovdqa 1472(%rdi), %ymm3
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2,3,4,5],ymm5[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm19
-; AVX512DQ-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm7
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm6
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm9, %zmm6
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm5[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm17
-; AVX512DQ-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm8
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm11, %zmm8
-; AVX512DQ-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm3
+; AVX512DQ-FCP-NEXT: vinserti128 $1, 1984(%rdi), %ymm0, %ymm6
+; AVX512DQ-FCP-NEXT: vmovdqa 1920(%rdi), %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm6[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm2
+; AVX512DQ-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm3
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm8
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm5
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm6
-; AVX512DQ-FCP-NEXT: vmovdqa 1024(%rdi), %ymm3
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2,3,4,5],ymm6[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm30
-; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm8
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm9, %zmm8
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm29
-; AVX512DQ-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm10
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm10
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 2112(%rdi), %zmm31
+; AVX512DQ-FCP-NEXT: vmovdqa64 2048(%rdi), %zmm10
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm11, %zmm10
-; AVX512DQ-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm3
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm10
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm5
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vinserti128 $1, 2432(%rdi), %ymm0, %ymm8
-; AVX512DQ-FCP-NEXT: vmovdqa 2368(%rdi), %ymm3
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1,2,3,4,5],ymm8[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 2304(%rdi), %zmm18
-; AVX512DQ-FCP-NEXT: vmovdqa64 2240(%rdi), %zmm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm10
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm25, %zmm10
+; AVX512DQ-FCP-NEXT: vmovdqa64 2176(%rdi), %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm9, %zmm10
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm8[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 2560(%rdi), %zmm20
-; AVX512DQ-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm14
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm11, %zmm14
-; AVX512DQ-FCP-NEXT: vmovdqa64 2624(%rdi), %zmm21
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm4, %zmm14
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm14, %zmm5
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vinserti128 $1, 1984(%rdi), %ymm0, %ymm10
-; AVX512DQ-FCP-NEXT: vmovdqa 1920(%rdi), %ymm3
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2,3,4,5],ymm10[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm22
-; AVX512DQ-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm5
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm15
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm6
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm9, %zmm15
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 2112(%rdi), %zmm26
-; AVX512DQ-FCP-NEXT: vmovdqa64 2048(%rdi), %zmm15
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm15
-; AVX512DQ-FCP-NEXT: vmovdqa64 2176(%rdi), %zmm28
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm4, %zmm15
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm5
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vinserti128 $1, 3328(%rdi), %ymm0, %ymm10
-; AVX512DQ-FCP-NEXT: vmovdqa 3264(%rdi), %ymm3
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2,3,4,5],ymm10[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 3200(%rdi), %zmm25
-; AVX512DQ-FCP-NEXT: vmovdqa64 3136(%rdi), %zmm14
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm25, %zmm14, %zmm9
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm9[0,1,2,3],ymm10[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm5
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm11, %zmm5
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm10
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm6
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vinserti128 $1, 3328(%rdi), %ymm0, %ymm6
+; AVX512DQ-FCP-NEXT: vmovdqa 3264(%rdi), %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm6[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 3200(%rdi), %zmm10
+; AVX512DQ-FCP-NEXT: vmovdqa64 3136(%rdi), %zmm1
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm10, %zmm1, %zmm13
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm4
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm0
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm25, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm11
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm25, %zmm11
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm19
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm25, %zmm8
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm5
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm27
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm11, %zmm5
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm11, %zmm1
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm0
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm25, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm9
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm25, %zmm3
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm0
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm25, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm0
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm25, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm25, %zmm1
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm3
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm5
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm11, %zmm5
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm5
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm11, %zmm5
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm11, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 3456(%rdi), %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm9
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm11, %zmm9
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm9
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm11, %zmm9
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 3456(%rdi), %zmm9
-; AVX512DQ-FCP-NEXT: vmovdqa64 3392(%rdi), %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 3392(%rdi), %zmm1
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm25
+; AVX512DQ-FCP-NEXT: vmovdqa64 3520(%rdi), %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm9, %zmm0, %zmm11
-; AVX512DQ-FCP-NEXT: vmovdqa64 3520(%rdi), %zmm6
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm4, %zmm11
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm4
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa 2880(%rdi), %ymm4
-; AVX512DQ-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm11 = [5,12]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm4
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm11, %zmm4
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [9,0,7,0,9,0,7,0]
-; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm16
-; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14]
-; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm16 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm16, %zmm5
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa 640(%rdi), %ymm10
-; AVX512DQ-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm10
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm6
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm11, %zmm10
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm10
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm10
-; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm10 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm1
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm5, %zmm25
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm25, %zmm3
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa 2880(%rdi), %ymm3
+; AVX512DQ-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23]
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm6 = [5,12]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm3
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm6, %zmm3
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm25
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [9,0,7,0,9,0,7,0]
+; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm12
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm2, %zmm12
+; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [4,5,6,14,4,5,6,14]
+; AVX512DQ-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm12 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm12, %zmm11
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa 640(%rdi), %ymm11
+; AVX512DQ-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm11
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm6, %zmm11
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm26
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm11
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm2, %zmm11
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm3, %zmm11
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm11, %zmm1
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm1
; AVX512DQ-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX512DQ-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm5
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm11, %zmm2
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm2
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm2
-; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm5
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm12
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm6, %zmm5
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm5
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm2, %zmm5
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm3, %zmm5
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm5, %zmm1
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa 1536(%rdi), %ymm1
; AVX512DQ-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX512DQ-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm8
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm2
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm11, %zmm2
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm2
-; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm4
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm11
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm8
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm6, %zmm4
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm4
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm2, %zmm4
+; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa 1088(%rdi), %ymm1
; AVX512DQ-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX512DQ-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm2
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm11, %zmm2
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm2
-; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm4
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm5
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm6, %zmm4
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm4
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm2, %zmm4
+; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa 2432(%rdi), %ymm1
; AVX512DQ-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX512DQ-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm2
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm11, %zmm2
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm2
-; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm4, %zmm2
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm4
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm15
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm6, %zmm4
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm21 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm4
+; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa 1984(%rdi), %ymm1
; AVX512DQ-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX512DQ-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm2
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm11, %zmm2
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm2
-; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm4, %zmm2
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm4
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm6, %zmm4
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm4
+; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm4 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa 3328(%rdi), %ymm1
+; AVX512DQ-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm10, %zmm0, %zmm6
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm4
+; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm4 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm6
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm2, %zmm6
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm2, %zmm20
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm6
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm2, %zmm6
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm2, %zmm5
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm6
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm5
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm2, %zmm5
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm5
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm5
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm10
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm15, %zmm0, %zmm2
+; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm2 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [6,13]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm13
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm9
+; AVX512DQ-FCP-NEXT: movb $24, %al
+; AVX512DQ-FCP-NEXT: kmovw %eax, %k1
+; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm4
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm1
+; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm1[4,5,4,5],zmm4[4,5,4,5]
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [7,0,9,0,7,0,9,0]
+; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm0
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm8
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa 3328(%rdi), %ymm1
-; AVX512DQ-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm25, %zmm14, %zmm11
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm7
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm2
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm2
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm2
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm2
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm2
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm2
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm2
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm0, %zmm2
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm18
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm22
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm25
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm25
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm9, %zmm0
-; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,0,4,11]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm0, %zmm1
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm14
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [10,3,10,3,10,3,10,3]
+; AVX512DQ-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm2
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm19, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [10,3,10,3,10,3,10,3]
-; AVX512DQ-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm2
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm12
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm23, %zmm12
-; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [11,4,11,4,11,4,11,4]
-; AVX512DQ-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm0
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm24, %zmm0
+; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm20 = [11,4,11,4,11,4,11,4]
+; AVX512DQ-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm0
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm20, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [12,5,12,5,12,5,12,5]
-; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm0
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm8, %zmm0
+; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm26 = [12,5,12,5,12,5,12,5]
+; AVX512DQ-FCP-NEXT: # zmm26 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm0
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm26, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [13,6,13,6,13,6,13,6]
-; AVX512DQ-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm16, %zmm0
+; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [13,6,13,6,13,6,13,6]
+; AVX512DQ-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm0
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm25, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9]
-; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm1, %zmm2
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm0
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm10
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm0
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm24, %zmm0
+; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [2,9,2,9,2,9,2,9]
+; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm17
+; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,5,8,15,4,5,8,15]
+; AVX512DQ-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm12, %zmm17
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm17, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm0
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm8, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm0
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm19, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm0
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm16, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm0
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm20, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm0
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm26, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm0
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm25, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm7, %zmm30
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm2 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm13
+; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5
+; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[4,5,4,5],zmm5[4,5,4,5]
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm0
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm9
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm14, %zmm1
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm12, %zmm30
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm1, %zmm28
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm0
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm23, %zmm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm14
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm0
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm24, %zmm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm18
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm19, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm0
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm8, %zmm0
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm20, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm0
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm16, %zmm0
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm26, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm1, %zmm22
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm9
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm23, %zmm9
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm0
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm24, %zmm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm19
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm0
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm8, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm0
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm25, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm0
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm16, %zmm0
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm7, %zmm22
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm2
+; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm2 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm3
+; AVX512DQ-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[4,5,4,5],zmm3[4,5,4,5]
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm1
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm4
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm11
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm30
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm7
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm23, %zmm7
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm24, %zmm3
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm21
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm8, %zmm3
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm14, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm16, %zmm3
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm0
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm12, %zmm22
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm30
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm23, %zmm30
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm0
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm24, %zmm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm22
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm5
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm8, %zmm5
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm0
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm16, %zmm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm29
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm1, %zmm11
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm28
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm23, %zmm28
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm31
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm3
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm25, %zmm6, %zmm23
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm3
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm19, %zmm3
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm0
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm20, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm27
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm24, %zmm27
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm16, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm0
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm26, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm16, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm0
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm25, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm16, %zmm0
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm7, %zmm23
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm2
+; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm2 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm17
+; AVX512DQ-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[4,5,4,5],zmm17[4,5,4,5]
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm1
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm1
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm6
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm14, %zmm0
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm16, %zmm0
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm23
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm23, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm28
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm19, %zmm28
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm20, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm6 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm26, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm25, %zmm31, %zmm24
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm25
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm2
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm8, %zmm25
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm31, %zmm2
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm0
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm16, %zmm0
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm31, %zmm16
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm31
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm3
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm15
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm31
-; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,9,0,5,6,9]
-; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm10
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm12
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm9
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm14
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm30
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm7
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm28
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm23
-; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,10,0,5,6,10]
-; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm27
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm12
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm12
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm18
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm19
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm21
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm22
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm24
-; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,11,0,5,6,11]
-; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm25
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm12
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm12
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm12
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm12
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm12
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm5
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm2
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,12,0,5,6,12]
-; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm25, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm12
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm12
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm12
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm12
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm12
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm29
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm16
-; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15]
-; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm18
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm19
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm21
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm22
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm2
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm7, %zmm2
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm11
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm2
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm19, %zmm2
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm1
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm20, %zmm1
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm1
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm26, %zmm1
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm1
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm25, %zmm1
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm7, %zmm23
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm30
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm19, %zmm30
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm16
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm15, %zmm19
+; AVX512DQ-FCP-NEXT: vmovdqa64 2880(%rdi), %zmm15
+; AVX512DQ-FCP-NEXT: vmovdqa64 2816(%rdi), %zmm14
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm0
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm25, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm24
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm20, %zmm24
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm0
+; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm25, %zmm8 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm25, %zmm9
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm25, %zmm11
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm3
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm31
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: movb $24, %al
-; AVX512DQ-FCP-NEXT: kmovw %eax, %k2
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm25, %zmm6
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm22
+; AVX512DQ-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm11
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm5
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm25, %zmm5
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm27
+; AVX512DQ-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm9
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm5
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm25, %zmm5
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa64 3328(%rdi), %zmm8
+; AVX512DQ-FCP-NEXT: vmovdqa64 3264(%rdi), %zmm5
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm6
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm25, %zmm6
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm20
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm18
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm26, %zmm18
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm26
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm10
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm25, %zmm10
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm25
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm16
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm13
+; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,5,6,9,0,5,6,9]
+; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k2} = zmm0[4,5,4,5],zmm13[4,5,4,5]
-; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [7,0,9,0,7,0,9,0]
-; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm20
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm20
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,0,4,11]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm3, %zmm0
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm7, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm0[4,5,4,5],zmm17[4,5,4,5]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm29
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm29
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm3, %zmm0
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm0[4,5,4,5],zmm26[4,5,4,5]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm17
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm17
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm3, %zmm0
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm2[4,5,4,5],zmm4[4,5,4,5]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm0
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm15, %zmm0
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm7, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm3, %zmm2
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm7, %zmm3
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm1
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm0[4,5,4,5],zmm1[4,5,4,5]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm2
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm0
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm7, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm0[4,5,4,5],zmm1[4,5,4,5]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm2
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm2
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm0
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm7, %zmm28
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm30
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm19
+; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,5,6,10,0,5,6,10]
+; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm24
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm31 {%k2} = zmm0[4,5,4,5],zmm2[4,5,4,5]
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm26
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm7, %zmm31
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm7, %zmm31
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm31
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm7, %zmm31
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm7, %zmm31
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm31
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm20
+; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,5,6,11,0,5,6,11]
+; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm18
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm7, %zmm31
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm7, %zmm31
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm31
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm7, %zmm31
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm7, %zmm31
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm31
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm26
+; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,5,6,12,0,5,6,12]
+; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm7, %zmm31
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm7, %zmm21
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm6
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm15, %zmm26
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm15
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm0
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm0
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[4,5,4,5],zmm0[4,5,4,5]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm1
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm7, %zmm1
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [6,13]
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm0
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpblendd $240, (%rsp), %ymm3, %ymm0 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm0 = ymm3[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21
-; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm0 = ymm5[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: movb $-32, %al
-; AVX512DQ-FCP-NEXT: kmovw %eax, %k1
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1}
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm7, %zmm10
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm7, %zmm1
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm1
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm7, %zmm25
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm12, %zmm4
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm23
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm12, %zmm16
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm12, %zmm13
+; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [14,0,0,7,14,0,0,7]
+; AVX512DQ-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm6 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm0 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm31 {%k1}
+; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm0 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2}
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 {%k1}
+; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm0 # 64-byte Folded Reload
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2}
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm12, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm11[4,5,4,5],zmm22[4,5,4,5]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm7
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm12, %zmm7
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm1
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2}
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm1
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,0,4,11]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm1, %zmm11
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k1} = zmm9[4,5,4,5],zmm27[4,5,4,5]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm17
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm12, %zmm17
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm21
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm21
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm1, %zmm9
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 {%k1} = zmm5[4,5,4,5],zmm8[4,5,4,5]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm22
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm12, %zmm22
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm8, %zmm5, %zmm12
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm27
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm27
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm8, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm5
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k1} = zmm14[4,5,4,5],zmm15[4,5,4,5]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm14
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm2 = [6,13]
+; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm0 # 64-byte Folded Reload
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2}
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm1 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm15 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm10 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm23, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm5[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm16, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm14[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm13, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: movb $-32, %al
+; AVX512DQ-FCP-NEXT: kmovw %eax, %k2
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm31 {%k2}
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2}
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2}
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2}
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2}
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm0 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm18
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm7 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0 {%k2}
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1}
-; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm0 = ymm11[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa 2752(%rdi), %ymm3
-; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm3 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3
-; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm3, %zmm26, %zmm3
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512DQ-FCP-NEXT: vmovdqa 512(%rdi), %ymm1
-; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1
-; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm20, %zmm6
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1}
-; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm1 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm1 = ymm13[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm4
-; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4
-; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm29, %zmm14
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm14 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqa 1408(%rdi), %ymm4
-; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4
-; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm17, %zmm5
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1}
-; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa 960(%rdi), %ymm8
-; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm8, %zmm9, %zmm8
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1}
-; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vinsertf64x4 $0, %ymm4, %zmm9, %zmm4
-; AVX512DQ-FCP-NEXT: vmovdqa 2304(%rdi), %ymm10
-; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm10
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm10, %zmm9, %zmm10
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1}
-; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm11 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm11 = mem[0,1,2,3],ymm9[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa 1856(%rdi), %ymm12
-; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm12
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm12, %zmm9, %zmm12
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm12 {%k1}
-; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vinsertf64x4 $0, %ymm11, %zmm9, %zmm11
-; AVX512DQ-FCP-NEXT: vmovdqa 3200(%rdi), %ymm13
-; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm13
-; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm13, %zmm15, %zmm9
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm9 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, 448(%rsi)
-; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovaps %zmm13, 384(%rsi)
-; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovaps %zmm13, 320(%rsi)
-; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovaps %zmm13, 256(%rsi)
-; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovaps %zmm13, 192(%rsi)
-; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovaps %zmm13, 128(%rsi)
-; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovaps %zmm13, 64(%rsi)
-; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovaps %zmm13, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 448(%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 256(%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, 320(%rdx)
-; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 128(%rdx)
-; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 192(%rdx)
-; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovaps %zmm2, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 64(%rdx)
-; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 384(%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, 448(%rcx)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, 256(%rcx)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, 320(%rcx)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, 128(%rcx)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, 192(%rcx)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, (%rcx)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 64(%rcx)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 384(%rcx)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 448(%r8)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 256(%r8)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 320(%r8)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 128(%r8)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 192(%r8)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, (%r8)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 64(%r8)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 384(%r8)
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm28 {%k2}
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 {%k2}
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm29 {%k2}
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm8 {%k2}
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm3 {%k2}
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm3 {%k2}
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k2}
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k2}
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm24 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm24 {%k2}
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm23 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm23 {%k2}
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k2}
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm30
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm6 {%k2}
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm3 {%k2}
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm2
+; AVX512DQ-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm22, %zmm4 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm4 {%k2}
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm5 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 {%k2}
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm8 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm8 {%k2}
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm9 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm9 {%k2}
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm10 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm10 {%k2}
+; AVX512DQ-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm13 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 {%k2}
+; AVX512DQ-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm14 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm14 {%k2}
+; AVX512DQ-FCP-NEXT: vinserti32x4 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm12 {%k2}
+; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm11
+; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm11, %zmm27, %zmm11
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm11 {%k2}
+; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm15
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm15, %zmm3, %zmm15
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm15 {%k2}
+; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vextracti32x4 $1, %ymm3, %xmm16
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm16, %zmm3, %zmm16
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm16 {%k2}
+; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vextracti32x4 $1, %ymm3, %xmm17
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm17, %zmm3, %zmm17
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm17 {%k2}
+; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vextracti32x4 $1, %ymm3, %xmm19
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm19, %zmm3, %zmm19
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm19 {%k2}
+; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vextracti32x4 $1, %ymm3, %xmm20
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm20, %zmm3, %zmm7
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm7 {%k2}
+; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vextracti32x4 $1, %ymm3, %xmm20
+; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm20, %zmm21, %zmm3
+; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm18 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm3 {%k2}
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm20
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm20, %zmm18, %zmm18
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm18 {%k2}
+; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovaps %zmm20, 448(%rsi)
+; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovaps %zmm20, 384(%rsi)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, 320(%rsi)
+; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovaps %zmm20, 256(%rsi)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, 192(%rsi)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 128(%rsi)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 64(%rsi)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, (%rsi)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 448(%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 256(%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, 320(%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, 128(%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, 192(%rdx)
+; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovaps %zmm0, (%rdx)
+; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 64(%rdx)
+; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 384(%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 448(%rcx)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 256(%rcx)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 320(%rcx)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 128(%rcx)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 192(%rcx)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, (%rcx)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 64(%rcx)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 384(%rcx)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 448(%r8)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 256(%r8)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 320(%r8)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 128(%r8)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 192(%r8)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, (%r8)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, 64(%r8)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 384(%r8)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm3, 448(%r9)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
@@ -21193,35 +21201,40 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm3, 384(%r9)
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 448(%rax)
-; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 256(%rax)
-; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 320(%rax)
-; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 128(%rax)
-; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 192(%rax)
-; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovaps %zmm2, (%rax)
-; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 64(%rax)
+; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 448(%rax)
+; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 256(%rax)
+; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 320(%rax)
+; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 128(%rax)
+; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 192(%rax)
+; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovaps %zmm0, (%rax)
+; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 64(%rax)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm3, 384(%rax)
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT: vmovaps %zmm11, 384(%rax)
-; AVX512DQ-FCP-NEXT: vmovaps %zmm4, 448(%rax)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 256(%rax)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 320(%rax)
-; AVX512DQ-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 384(%rax)
+; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 448(%rax)
+; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 256(%rax)
+; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 320(%rax)
+; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 128(%rax)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 192(%rax)
+; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 192(%rax)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm0, (%rax)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 64(%rax)
-; AVX512DQ-FCP-NEXT: addq $7624, %rsp # imm = 0x1DC8
+; AVX512DQ-FCP-NEXT: addq $6664, %rsp # imm = 0x1A08
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
@@ -22206,932 +22219,949 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
;
; AVX512BW-FCP-LABEL: load_i64_stride7_vf64:
; AVX512BW-FCP: # %bb.0:
-; AVX512BW-FCP-NEXT: subq $7624, %rsp # imm = 0x1DC8
-; AVX512BW-FCP-NEXT: vmovdqa64 3328(%rdi), %zmm4
+; AVX512BW-FCP-NEXT: subq $6792, %rsp # imm = 0x1A88
+; AVX512BW-FCP-NEXT: vmovdqa64 3072(%rdi), %zmm19
+; AVX512BW-FCP-NEXT: vmovdqa64 3008(%rdi), %zmm16
+; AVX512BW-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm29
+; AVX512BW-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm9
+; AVX512BW-FCP-NEXT: vmovdqa64 2688(%rdi), %zmm14
+; AVX512BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm4
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 3264(%rdi), %zmm6
+; AVX512BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm21
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm18
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm17
+; AVX512BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm8
+; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm20
+; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm27
+; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm31
+; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm23
+; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm13
+; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm6
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 3008(%rdi), %zmm25
-; AVX512BW-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm18
-; AVX512BW-FCP-NEXT: vmovdqa64 2880(%rdi), %zmm2
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 2816(%rdi), %zmm1
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm30
-; AVX512BW-FCP-NEXT: vmovdqa64 2688(%rdi), %zmm5
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm19
+; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm22
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm25
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm11
+; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm15
+; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,7,14,0,0,7,14,0]
+; AVX512BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm5
+; AVX512BW-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm1
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm24, %zmm1
+; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,6,13,4,5,6,13]
+; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm8
+; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm10, %zmm1
+; AVX512BW-FCP-NEXT: vinserti128 $1, 2880(%rdi), %ymm0, %ymm2
+; AVX512BW-FCP-NEXT: vmovdqa 2816(%rdi), %ymm0
+; AVX512BW-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7]
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [4,11]
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm3
+; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm12, %zmm3
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm0
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm1
+; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm24, %zmm1
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm10, %zmm1
+; AVX512BW-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm2
+; AVX512BW-FCP-NEXT: vmovdqa 576(%rdi), %ymm0
+; AVX512BW-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7]
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm7
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm3
+; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm12, %zmm3
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm0
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm1
+; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm24, %zmm1
+; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm10, %zmm1
+; AVX512BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm2
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7]
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm3
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm25
+; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm12, %zmm3
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm0
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm2
+; AVX512BW-FCP-NEXT: vmovdqa 1472(%rdi), %ymm0
+; AVX512BW-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm9
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm3
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm10
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm12
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, (%rsp) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm11
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm13
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm16
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm14
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm17
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm15
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
-; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
-; AVX512BW-FCP-NEXT: vmovdqa 2704(%rdi), %xmm2
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm1
-; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm1
-; AVX512BW-FCP-NEXT: vmovdqa 464(%rdi), %xmm2
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm1
-; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm1
-; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm2
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm1
-; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm0, %zmm1
-; AVX512BW-FCP-NEXT: vmovdqa 1360(%rdi), %xmm2
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm1
-; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm1
-; AVX512BW-FCP-NEXT: vmovdqa 912(%rdi), %xmm2
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm3
+; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm12, %zmm3
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm3
+; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm24, %zmm3
+; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm3
+; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm2
+; AVX512BW-FCP-NEXT: vmovdqa 1024(%rdi), %ymm0
+; AVX512BW-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7]
+; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm30
+; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm5
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm12, %zmm3
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm28
+; AVX512BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm4
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm24, %zmm4
+; AVX512BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm0
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm4
+; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm0
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vinserti128 $1, 2432(%rdi), %ymm0, %ymm2
+; AVX512BW-FCP-NEXT: vmovdqa 2368(%rdi), %ymm0
+; AVX512BW-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7]
+; AVX512BW-FCP-NEXT: vmovdqa64 2304(%rdi), %zmm21
+; AVX512BW-FCP-NEXT: vmovdqa64 2240(%rdi), %zmm1
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1
-; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm1
-; AVX512BW-FCP-NEXT: vmovdqa 2256(%rdi), %xmm2
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
+; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm12, %zmm4
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-FCP-NEXT: vmovdqa64 2560(%rdi), %zmm18
+; AVX512BW-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm6
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm24, %zmm6
+; AVX512BW-FCP-NEXT: vmovdqa64 2624(%rdi), %zmm0
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm6
+; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm0
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vinserti128 $1, 1984(%rdi), %ymm0, %ymm2
+; AVX512BW-FCP-NEXT: vmovdqa 1920(%rdi), %ymm0
+; AVX512BW-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7]
+; AVX512BW-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm22
+; AVX512BW-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm0
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm6
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm12, %zmm6
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-FCP-NEXT: vmovdqa64 2112(%rdi), %zmm26
+; AVX512BW-FCP-NEXT: vmovdqa64 2048(%rdi), %zmm13
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm24, %zmm13
+; AVX512BW-FCP-NEXT: vmovdqa64 2176(%rdi), %zmm0
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm13
+; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm13, %zmm0
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vinserti128 $1, 3328(%rdi), %ymm0, %ymm13
+; AVX512BW-FCP-NEXT: vmovdqa 3264(%rdi), %ymm0
+; AVX512BW-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm0[0,1,2,3,4,5],ymm13[6,7]
+; AVX512BW-FCP-NEXT: vmovdqa64 3200(%rdi), %zmm11
+; AVX512BW-FCP-NEXT: vmovdqa64 3136(%rdi), %zmm6
+; AVX512BW-FCP-NEXT: vpermi2q %zmm11, %zmm6, %zmm12
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm3
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm0
+; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm24, %zmm0
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm0
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm2
+; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm24, %zmm2
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm24, %zmm5
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm24, %zmm8
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm2
+; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm24, %zmm2
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm24, %zmm1
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1
-; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm1
-; AVX512BW-FCP-NEXT: vmovdqa 1808(%rdi), %xmm2
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm15
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm7
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm2
+; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm24, %zmm7
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm4
+; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm24, %zmm4
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 3456(%rdi), %zmm1
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm6, %zmm0
-; AVX512BW-FCP-NEXT: vmovdqa 3152(%rdi), %xmm1
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
-; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vinserti128 $1, 2880(%rdi), %ymm0, %ymm0
-; AVX512BW-FCP-NEXT: vmovdqa 2816(%rdi), %ymm1
-; AVX512BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11]
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1
-; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm9, %zmm1
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,7,14,0,0,7,14,0]
+; AVX512BW-FCP-NEXT: vmovdqa64 3392(%rdi), %zmm4
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm4, %zmm24
+; AVX512BW-FCP-NEXT: vmovdqa64 3520(%rdi), %zmm4
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm24
+; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm24, %zmm10
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa 2880(%rdi), %ymm10
+; AVX512BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
+; AVX512BW-FCP-NEXT: # ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [5,12]
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm12
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm9, %zmm12
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm6
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [9,0,7,0,9,0,7,0]
; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm16
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm1
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm10, %zmm1
-; AVX512BW-FCP-NEXT: vmovdqa64 3072(%rdi), %zmm2
+; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm10, %zmm16
+; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,5,6,14,4,5,6,14]
+; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm12, %zmm16
+; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm16, %zmm2
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13]
-; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm1
-; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-FCP-NEXT: vmovdqa 640(%rdi), %ymm13
+; AVX512BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm7 # 32-byte Folded Reload
+; AVX512BW-FCP-NEXT: # ymm7 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm13
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0
-; AVX512BW-FCP-NEXT: vmovdqa 576(%rdi), %ymm1
-; AVX512BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm27
-; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm2
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12
-; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm9, %zmm1
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm28
-; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm1
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm10, %zmm1
-; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm2
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm1
-; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm9, %zmm13
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm13[0,1,2,3],ymm7[4,5,6,7]
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm24
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm13
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm16
+; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm10, %zmm13
+; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm12, %zmm13
+; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm13, %zmm0
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0
-; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm1
-; AVX512BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm31
-; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm8
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1
-; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm9, %zmm1
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm18
-; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm29
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm1
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm10, %zmm1
-; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm2
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm1
-; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm7
+; AVX512BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload
+; AVX512BW-FCP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm7
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm9, %zmm7
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm10, %zmm7
+; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm7 # 64-byte Folded Reload
+; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0
-; AVX512BW-FCP-NEXT: vmovdqa 1472(%rdi), %ymm1
-; AVX512BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm26
-; AVX512BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm7
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1
-; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm9, %zmm1
+; AVX512BW-FCP-NEXT: vmovdqa 1536(%rdi), %ymm0
+; AVX512BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512BW-FCP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm19
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm9, %zmm1
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm13
-; AVX512BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm1
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm10, %zmm1
-; AVX512BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm2
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm1
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm1
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm10, %zmm1
+; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload
; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0
-; AVX512BW-FCP-NEXT: vmovdqa 1024(%rdi), %ymm1
-; AVX512BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm19
-; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm6
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm9, %zmm1
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm22
-; AVX512BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm1
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm10, %zmm1
-; AVX512BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm2
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm1
+; AVX512BW-FCP-NEXT: vmovdqa 1088(%rdi), %ymm0
+; AVX512BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512BW-FCP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm20
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm9, %zmm1
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm1
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm10, %zmm1
+; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload
; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vinserti128 $1, 2432(%rdi), %ymm0, %ymm0
-; AVX512BW-FCP-NEXT: vmovdqa 2368(%rdi), %ymm1
-; AVX512BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-FCP-NEXT: vmovdqa64 2304(%rdi), %zmm20
-; AVX512BW-FCP-NEXT: vmovdqa64 2240(%rdi), %zmm5
+; AVX512BW-FCP-NEXT: vmovdqa 2432(%rdi), %ymm0
+; AVX512BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512BW-FCP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm9, %zmm1
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm9, %zmm1
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-FCP-NEXT: vmovdqa64 2560(%rdi), %zmm15
-; AVX512BW-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm1
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm10, %zmm1
-; AVX512BW-FCP-NEXT: vmovdqa64 2624(%rdi), %zmm3
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm1
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm1
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm10, %zmm1
+; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload
; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vinserti128 $1, 1984(%rdi), %ymm0, %ymm0
-; AVX512BW-FCP-NEXT: vmovdqa 1920(%rdi), %ymm1
-; AVX512BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm21
-; AVX512BW-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm2
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm9, %zmm1
+; AVX512BW-FCP-NEXT: vmovdqa 1984(%rdi), %ymm0
+; AVX512BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512BW-FCP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm9, %zmm1
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-FCP-NEXT: vmovdqa64 2112(%rdi), %zmm14
-; AVX512BW-FCP-NEXT: vmovdqa64 2048(%rdi), %zmm1
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm10, %zmm1
-; AVX512BW-FCP-NEXT: vmovdqa64 2176(%rdi), %zmm23
-; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm4, %zmm1
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm1
+; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm1 # 64-byte Folded Reload
+; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload
; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vinserti128 $1, 3328(%rdi), %ymm0, %ymm0
-; AVX512BW-FCP-NEXT: vmovdqa 3264(%rdi), %ymm1
-; AVX512BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-FCP-NEXT: vmovdqa64 3200(%rdi), %zmm17
-; AVX512BW-FCP-NEXT: vmovdqa64 3136(%rdi), %zmm11
-; AVX512BW-FCP-NEXT: vpermi2q %zmm17, %zmm11, %zmm9
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9
-; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm10, %zmm9
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm1
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm9
-; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm10, %zmm9
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm10, %zmm6
+; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm10, %zmm6
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9
-; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm10, %zmm9
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm10, %zmm2
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm10, %zmm5
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm9
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm6
-; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm10, %zmm9
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm10, %zmm11
+; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm23
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm10, %zmm25
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm0
+; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm0
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm0
+; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm10, %zmm0
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm10, %zmm21
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm10, %zmm22
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 3456(%rdi), %zmm24
-; AVX512BW-FCP-NEXT: vmovdqa64 3392(%rdi), %zmm2
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermi2q %zmm24, %zmm2, %zmm10
-; AVX512BW-FCP-NEXT: vmovdqa64 3520(%rdi), %zmm30
-; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm4, %zmm10
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm0
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm15, %zmm10
+; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm10 # 64-byte Folded Reload
+; AVX512BW-FCP-NEXT: vmovdqa 3328(%rdi), %ymm0
+; AVX512BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512BW-FCP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT: vpermi2q %zmm11, %zmm1, %zmm9
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa 2880(%rdi), %ymm0
-; AVX512BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
-; AVX512BW-FCP-NEXT: # ymm4 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
-; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [5,12]
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm9
-; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm9
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [9,0,7,0,9,0,7,0]
-; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm12
-; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm4, %zmm12
-; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,6,14,4,5,6,14]
-; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload
-; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm5
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa 640(%rdi), %ymm9
-; AVX512BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
-; AVX512BW-FCP-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
-; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm1
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm9[4,5,6,7]
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm12
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm4, %zmm12
-; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload
-; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm5
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm9
-; AVX512BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
-; AVX512BW-FCP-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm12
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm12
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7]
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm12
-; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm4, %zmm12
-; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload
-; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm5
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa 1536(%rdi), %ymm9
-; AVX512BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
-; AVX512BW-FCP-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm5
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm12
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm16
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm12
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7]
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm12
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm4, %zmm12
-; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload
-; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm7
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa 1088(%rdi), %ymm9
-; AVX512BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
-; AVX512BW-FCP-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm12
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm12
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7]
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm12
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm4, %zmm12
-; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload
-; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm9
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa 2432(%rdi), %ymm9
-; AVX512BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm2 # 32-byte Folded Reload
-; AVX512BW-FCP-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm9
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm9
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7]
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm9
-; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm9 # 64-byte Folded Reload
-; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm9 # 64-byte Folded Reload
-; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm9, %zmm2
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa 1984(%rdi), %ymm2
-; AVX512BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX512BW-FCP-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm3
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm3
-; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm3 # 64-byte Folded Reload
-; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm10, %zmm3
-; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa 3328(%rdi), %ymm2
-; AVX512BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload
-; AVX512BW-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermi2q %zmm17, %zmm2, %zmm0
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm6 # 64-byte Folded Reload
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm1
-; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm1
-; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm4, %zmm1
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [10,3,10,3,10,3,10,3]
+; AVX512BW-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm4
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm0
+; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm29, %zmm0
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [11,4,11,4,11,4,11,4]
+; AVX512BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm1
-; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm4, %zmm1
+; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm3, %zmm1
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm4, %zmm19
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm4, %zmm20
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm4, %zmm21
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm17
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vpermi2q %zmm8, %zmm24, %zmm4
-; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm10, %zmm4
-; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [12,5,12,5,12,5,12,5]
+; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm0
+; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm9, %zmm0
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [13,6,13,6,13,6,13,6]
+; AVX512BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm0
+; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm25, %zmm0
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [10,3,10,3,10,3,10,3]
-; AVX512BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm0
-; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm23, %zmm25
-; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [11,4,11,4,11,4,11,4]
-; AVX512BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1
-; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm27, %zmm1
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [12,5,12,5,12,5,12,5]
-; AVX512BW-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1
-; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm31, %zmm1
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [13,6,13,6,13,6,13,6]
-; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1
-; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm12, %zmm1
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9]
; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm1, %zmm0
+; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm4
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm22
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm0
+; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm29, %zmm0
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm0
-; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm23, %zmm0
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm0
+; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm3, %zmm0
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm0
+; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm9, %zmm0
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm0
-; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm27, %zmm0
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm0
+; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm25, %zmm0
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm0
-; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm31, %zmm0
+; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm22
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm20
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0
+; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm29, %zmm0
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm0
-; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm12, %zmm0
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0
+; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm3, %zmm0
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm1, %zmm28
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0
-; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm0
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm28
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0
-; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm27, %zmm0
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0
+; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm9, %zmm0
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0
-; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm31, %zmm0
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0
+; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm25, %zmm0
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0
-; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm12, %zmm0
+; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm1, %zmm20
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm0
+; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm29, %zmm0
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm1, %zmm26
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm9
-; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm23, %zmm9
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0
-; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm27, %zmm0
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm0
+; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm3, %zmm0
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0
-; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm31, %zmm0
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm17
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0
-; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm12, %zmm0
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16
-; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm1, %zmm29
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm0
+; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm9, %zmm0
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm0
+; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm25, %zmm0
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm1, %zmm30
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm13
+; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm29, %zmm13
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm4
+; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm3, %zmm4
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm4
+; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm9, %zmm4
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm4
+; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm25, %zmm4
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm1, %zmm31
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm11
+; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm29, %zmm11
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm12
+; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm3, %zmm12
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm0
+; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm9, %zmm0
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm18
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm19
+; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm25, %zmm19
+; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm1, %zmm31
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm7
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm29, %zmm7
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm3
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm22
+; AVX512BW-FCP-NEXT: vpermi2q %zmm15, %zmm5, %zmm29
+; AVX512BW-FCP-NEXT: vmovdqa64 2880(%rdi), %zmm5
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 2816(%rdi), %zmm0
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm25, %zmm0
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm8
+; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm4, %zmm8
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm0
+; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm31
+; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm5
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm25, %zmm5
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm30
+; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm25, %zmm5
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm26
+; AVX512BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm5
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm25, %zmm5
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm24
+; AVX512BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm5
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm25, %zmm5
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm23
+; AVX512BW-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm5
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm25, %zmm5
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm10
+; AVX512BW-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm5
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm25, %zmm5
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 3328(%rdi), %zmm15
+; AVX512BW-FCP-NEXT: vmovdqa64 3264(%rdi), %zmm14
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm5
+; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm25, %zmm5
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm27
+; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm22, %zmm4
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm6
+; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm6
+; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm22, %zmm9
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm2
+; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm25, %zmm2
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm16
+; AVX512BW-FCP-NEXT: vpermi2q %zmm3, %zmm22, %zmm25
+; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm27
+; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm28
+; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,9,0,5,6,9]
+; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm0
-; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm0
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm19
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm10
-; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm27, %zmm10
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm13
-; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm31, %zmm13
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm0
-; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm12, %zmm0
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm0
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm21
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm2
-; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm23, %zmm2
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm22
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm4
-; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm27, %zmm4
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3
-; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm31, %zmm3
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm20
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7
-; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm12, %zmm7
-; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm5
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm3
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm23, %zmm3
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm29
-; AVX512BW-FCP-NEXT: vpermi2q %zmm24, %zmm8, %zmm23
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm0 # 64-byte Folded Reload
+; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm0
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm2
-; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm27, %zmm2
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm12, %zmm6
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm0
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm1, %zmm0
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm11
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm13
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm7
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm29
+; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,10,0,5,6,10]
+; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm8
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm12, %zmm8
+; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm8
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm8 # 64-byte Folded Reload
+; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm8
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermi2q %zmm24, %zmm29, %zmm27
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0
-; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm31, %zmm0
-; AVX512BW-FCP-NEXT: vpermi2q %zmm24, %zmm29, %zmm31
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm8
-; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm12, %zmm8
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm18
-; AVX512BW-FCP-NEXT: vpermi2q %zmm24, %zmm29, %zmm12
-; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm29
-; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm26
-; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,9,0,5,6,9]
-; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm11
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm25
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm9
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm28
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm22
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm19
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm3
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm1, %zmm23
-; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,10,0,5,6,10]
-; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm2
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm28
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm2
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm9
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm9
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm10
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm4
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm1, %zmm27
+; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm1, %zmm8
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm8
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm8
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm12
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm5
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,11,0,5,6,11]
; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm0
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm28
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm28
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm9
+; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm6
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm8
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm8
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm1, %zmm8
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm8
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm5
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm18
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm9
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm17
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm13
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm20
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm1, %zmm31
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,12,0,5,6,12]
; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm18
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm28
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm28
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm28
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm16
+; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm16
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm9
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm7
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm1, %zmm12
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm8
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm5
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm1, %zmm5
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm5
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm5
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm19
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm25
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15]
; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm19
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm22
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm16
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm17
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm18
+; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm1, %zmm16
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm20
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm21
-; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm5
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm1, %zmm29
-; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm26
+; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm20
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm2
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm2
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm27
+; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm28
; AVX512BW-FCP-NEXT: movb $24, %al
-; AVX512BW-FCP-NEXT: kmovd %eax, %k1
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k1} = zmm1[4,5,4,5],zmm15[4,5,4,5]
-; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [7,0,9,0,7,0,9,0]
-; AVX512BW-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm0
-; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm0
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,4,11]
-; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm2, %zmm1
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-FCP-NEXT: kmovd %eax, %k2
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm3[4,5,4,5],zmm1[4,5,4,5]
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0
-; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm30, %zmm0
+; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm1[4,5,4,5],zmm31[4,5,4,5]
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm3
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm3[4,5,4,5],zmm1[4,5,4,5]
+; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [14,0,0,7,14,0,0,7]
+; AVX512BW-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0
-; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm30, %zmm0
+; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm17, %zmm0
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm3
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm3[4,5,4,5],zmm1[4,5,4,5]
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0
-; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm30, %zmm0
+; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [7,0,9,0,7,0,9,0]
+; AVX512BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm0
+; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm18, %zmm0
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm3
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,0,4,11]
+; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm1
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm31 {%k2} = zmm1[4,5,4,5],zmm30[4,5,4,5]
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2
+; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm17, %zmm2
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm2
+; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm18, %zmm2
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm0, %zmm1
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 {%k2} = zmm1[4,5,4,5],zmm26[4,5,4,5]
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm30
+; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm17, %zmm30
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm2
+; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm18, %zmm2
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm1
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k2} = zmm1[4,5,4,5],zmm24[4,5,4,5]
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2
+; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm17, %zmm2
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm2
+; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm18, %zmm2
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm1
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5]
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3
-; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm3
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k1} = zmm0[4,5,4,5],zmm6[4,5,4,5]
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3
-; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm3
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm0
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k2} = zmm1[4,5,4,5],zmm23[4,5,4,5]
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm24
+; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm17, %zmm24
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm2
+; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm18, %zmm2
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm1
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k1} = zmm15[4,5,4,5],zmm6[4,5,4,5]
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm28
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm30, %zmm28
-; AVX512BW-FCP-NEXT: vpermi2q %zmm15, %zmm6, %zmm30
-; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm15
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm25 {%k1} = zmm13[4,5,4,5],zmm3[4,5,4,5]
-; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm13
+; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm1[4,5,4,5],zmm10[4,5,4,5]
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2
+; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm17, %zmm2
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm2
+; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm18, %zmm2
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm0, %zmm1
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm14[4,5,4,5],zmm15[4,5,4,5]
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm21
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm17, %zmm21
+; AVX512BW-FCP-NEXT: vpermi2q %zmm15, %zmm14, %zmm17
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm23
+; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm18, %zmm23
+; AVX512BW-FCP-NEXT: vpermi2q %zmm14, %zmm15, %zmm18
+; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm14
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm1[4,5,4,5],zmm2[4,5,4,5]
+; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [6,13]
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 64-byte Folded Reload
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
+; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload
+; AVX512BW-FCP-NEXT: # ymm0 = ymm15[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; AVX512BW-FCP-NEXT: # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload
+; AVX512BW-FCP-NEXT: # ymm0 = ymm10[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm16, %zmm0
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload
; AVX512BW-FCP-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm16, %zmm0
+; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload
; AVX512BW-FCP-NEXT: # ymm0 = ymm3[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm17
-; AVX512BW-FCP-NEXT: vpblendd $240, (%rsp), %ymm1, %ymm0 # 32-byte Folded Reload
-; AVX512BW-FCP-NEXT: # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm0
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload
+; AVX512BW-FCP-NEXT: vpblendd $240, (%rsp), %ymm4, %ymm0 # 32-byte Folded Reload
; AVX512BW-FCP-NEXT: # ymm0 = ymm4[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload
-; AVX512BW-FCP-NEXT: # ymm0 = ymm5[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21
-; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload
-; AVX512BW-FCP-NEXT: # ymm0 = ymm6[0,1,2,3],mem[4,5,6,7]
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm15[4,5,6,7]
-; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm29
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm13[4,5,6,7]
-; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm14[4,5,6,7]
+; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm0
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload
+; AVX512BW-FCP-NEXT: # ymm0 = ymm5[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm0
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT: movb $-32, %al
-; AVX512BW-FCP-NEXT: kmovd %eax, %k2
+; AVX512BW-FCP-NEXT: kmovd %eax, %k1
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2}
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm19
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1}
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2}
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1}
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2}
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm24
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1}
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2}
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1}
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2}
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1}
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2}
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1}
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2}
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm9 {%k2}
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm25
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1}
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2}
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm22
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1}
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm7 {%k1}
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2}
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2}
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1}
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2}
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2}
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2}
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2}
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1}
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 {%k2}
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2}
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 {%k2}
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2}
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2}
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1}
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2}
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2}
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2}
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1}
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2}
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2}
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2}
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1}
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2}
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2}
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2}
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm27 {%k2}
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
; AVX512BW-FCP-NEXT: vmovdqa 2752(%rdi), %ymm0
-; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm0, %zmm28, %zmm1
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2}
-; AVX512BW-FCP-NEXT: vmovdqa 512(%rdi), %ymm0
-; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512BW-FCP-NEXT: vmovdqa 2688(%rdi), %ymm9
+; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm3 = ymm9[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm14 = [2,5]
+; AVX512BW-FCP-NEXT: vpermt2q %ymm0, %ymm14, %ymm9
+; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm9, %zmm21, %zmm0
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512BW-FCP-NEXT: vmovdqa 512(%rdi), %ymm9
+; AVX512BW-FCP-NEXT: vmovdqa 448(%rdi), %ymm10
+; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm15 = ymm10[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT: vpermt2q %ymm9, %ymm14, %ymm10
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm10, %zmm1, %zmm10
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1}
+; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm9
+; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm11
+; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm19 = ymm9[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT: vpermt2q %ymm11, %ymm14, %ymm9
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm9, %zmm1, %zmm1
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 {%k1}
+; AVX512BW-FCP-NEXT: vmovdqa 1408(%rdi), %ymm9
+; AVX512BW-FCP-NEXT: vmovdqa 1344(%rdi), %ymm11
+; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm6 = ymm11[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT: vpermt2q %ymm9, %ymm14, %ymm11
+; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm11, %zmm30, %zmm11
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1}
+; AVX512BW-FCP-NEXT: vmovdqa 960(%rdi), %ymm9
+; AVX512BW-FCP-NEXT: vmovdqa 896(%rdi), %ymm12
+; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm16 = ymm12[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT: vpermt2q %ymm9, %ymm14, %ymm12
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm12, %zmm9, %zmm12
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm12 {%k1}
+; AVX512BW-FCP-NEXT: vmovdqa 2304(%rdi), %ymm9
+; AVX512BW-FCP-NEXT: vmovdqa64 2240(%rdi), %ymm21
+; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm27 = ymm21[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm21[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT: vpermt2q %ymm9, %ymm14, %ymm21
+; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm21, %zmm24, %zmm9
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm9 {%k1}
+; AVX512BW-FCP-NEXT: vmovdqa64 1856(%rdi), %ymm21
+; AVX512BW-FCP-NEXT: vmovdqa64 1792(%rdi), %ymm22
+; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm28 = ymm22[8,9,10,11,12,13,14,15],ymm21[0,1,2,3,4,5,6,7],ymm22[24,25,26,27,28,29,30,31],ymm21[16,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT: vpermt2q %ymm21, %ymm14, %ymm22
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm22, %zmm13, %zmm21
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm21 {%k1}
+; AVX512BW-FCP-NEXT: vmovdqa64 3200(%rdi), %ymm22
+; AVX512BW-FCP-NEXT: vmovdqa64 3136(%rdi), %ymm24
+; AVX512BW-FCP-NEXT: vpermi2q %ymm22, %ymm24, %ymm14
+; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm14, %zmm17, %zmm14
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm13
+; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm13, %zmm23, %zmm13
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm0, %zmm3, %zmm0
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1}
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15
; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 {%k2}
-; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm3
-; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm3 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23]
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm3, %zmm4, %zmm3
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 {%k2}
-; AVX512BW-FCP-NEXT: vmovdqa 1408(%rdi), %ymm4
-; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm5, %zmm4
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm4 {%k2}
-; AVX512BW-FCP-NEXT: vmovdqa 960(%rdi), %ymm5
-; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm6, %zmm5
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm5 {%k2}
-; AVX512BW-FCP-NEXT: vmovdqa 2304(%rdi), %ymm6
-; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23]
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm6, %zmm7, %zmm6
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 {%k2}
-; AVX512BW-FCP-NEXT: vmovdqa 1856(%rdi), %ymm7
-; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23]
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm7, %zmm8, %zmm7
-; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k2}
-; AVX512BW-FCP-NEXT: vmovdqa 3200(%rdi), %ymm8
-; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8
-; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm8, %zmm30, %zmm8
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm8 {%k2}
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 448(%rsi)
-; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovaps %zmm12, 384(%rsi)
-; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovaps %zmm12, 320(%rsi)
-; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovaps %zmm12, 256(%rsi)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, 192(%rsi)
-; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovaps %zmm12, 128(%rsi)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 64(%rsi)
-; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512BW-FCP-NEXT: vmovaps %zmm12, (%rsi)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 448(%rdx)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 256(%rdx)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 320(%rdx)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 128(%rdx)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 192(%rdx)
+; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm15, %zmm3, %zmm15
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm15 {%k1}
+; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm19, %xmm17
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm17, %zmm3, %zmm17
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1}
+; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm6, %xmm19
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm19, %zmm3, %zmm19
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm19 {%k1}
+; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm16
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm16, %zmm3, %zmm16
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm16 {%k1}
+; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm27, %xmm20
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm20, %zmm3, %zmm20
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm20 {%k1}
+; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm28, %xmm23
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm23, %zmm3, %zmm23
+; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm23 {%k1}
+; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm22 = ymm24[8,9,10,11,12,13,14,15],ymm22[0,1,2,3,4,5,6,7],ymm24[24,25,26,27,28,29,30,31],ymm22[16,17,18,19,20,21,22,23]
+; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm22, %xmm22
+; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm22, %zmm18, %zmm18
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm18 {%k1}
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 448(%rsi)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 384(%rsi)
+; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovaps %zmm3, 320(%rsi)
+; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovaps %zmm3, 256(%rsi)
+; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovaps %zmm3, 192(%rsi)
+; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovaps %zmm3, 128(%rsi)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, 64(%rsi)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, (%rsi)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 448(%rdx)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, 256(%rdx)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 320(%rdx)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 128(%rdx)
+; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovaps %zmm2, 192(%rdx)
; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovaps %zmm2, (%rdx)
; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovaps %zmm2, 64(%rdx)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 384(%rdx)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, 448(%rcx)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, 256(%rcx)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, 320(%rcx)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, 128(%rcx)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 192(%rcx)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, (%rcx)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 64(%rcx)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 384(%rcx)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 448(%r8)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 256(%r8)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 320(%r8)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 128(%r8)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 192(%r8)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%r8)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%r8)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 384(%r8)
+; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovaps %zmm2, 384(%rdx)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 448(%rcx)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, 256(%rcx)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 320(%rcx)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, 128(%rcx)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 192(%rcx)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, (%rcx)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 64(%rcx)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 384(%rcx)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 448(%r8)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, 256(%r8)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, 320(%r8)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 128(%r8)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 192(%r8)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, (%r8)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, 64(%r8)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 384(%r8)
; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovaps %zmm0, 448(%r9)
; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -23168,18 +23198,21 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovaps %zmm0, 384(%rax)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, 448(%rax)
+; AVX512BW-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovaps %zmm0, 448(%rax)
; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovaps %zmm0, 256(%rax)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, 320(%rax)
-; AVX512BW-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovaps %zmm0, 320(%rax)
+; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovaps %zmm0, 128(%rax)
; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovaps %zmm0, 192(%rax)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, (%rax)
+; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovaps %zmm0, (%rax)
; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovaps %zmm0, 64(%rax)
-; AVX512BW-FCP-NEXT: addq $7624, %rsp # imm = 0x1DC8
+; AVX512BW-FCP-NEXT: addq $6792, %rsp # imm = 0x1A88
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
@@ -24164,932 +24197,949 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
;
; AVX512DQ-BW-FCP-LABEL: load_i64_stride7_vf64:
; AVX512DQ-BW-FCP: # %bb.0:
-; AVX512DQ-BW-FCP-NEXT: subq $7624, %rsp # imm = 0x1DC8
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3328(%rdi), %zmm4
+; AVX512DQ-BW-FCP-NEXT: subq $6792, %rsp # imm = 0x1A88
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3072(%rdi), %zmm19
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3008(%rdi), %zmm16
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm29
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm9
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2688(%rdi), %zmm14
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm4
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3264(%rdi), %zmm6
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm21
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm18
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm17
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm8
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm20
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm27
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm31
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm23
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm13
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm6
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3008(%rdi), %zmm25
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm18
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2880(%rdi), %zmm2
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2816(%rdi), %zmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm30
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2688(%rdi), %zmm5
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm19
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm22
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm25
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm11
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm15
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,7,14,0,0,7,14,0]
+; AVX512DQ-BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm5
+; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm24, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,6,13,4,5,6,13]
+; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm8
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm10, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 2880(%rdi), %ymm0, %ymm2
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 2816(%rdi), %ymm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [4,11]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm12, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm24, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm10, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm2
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 576(%rdi), %ymm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm7
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm12, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm24, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm10, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm2
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm25
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm12, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm2
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 1472(%rdi), %ymm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm9
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm3
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm10
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm12
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, (%rsp) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm11
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm13
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm16
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm14
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm17
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm15
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
-; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 2704(%rdi), %xmm2
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 464(%rdi), %xmm2
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm2
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm0, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 1360(%rdi), %xmm2
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 912(%rdi), %xmm2
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 2256(%rdi), %xmm2
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 1808(%rdi), %xmm2
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm12, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm24, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm2
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 1024(%rdi), %ymm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm30
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm5
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm12, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm28
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm4
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm24, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm4, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 2432(%rdi), %ymm0, %ymm2
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 2368(%rdi), %ymm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2304(%rdi), %zmm21
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2240(%rdi), %zmm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm6, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 3152(%rdi), %xmm1
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm12, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2560(%rdi), %zmm18
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm6
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm24, %zmm6
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2624(%rdi), %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 2880(%rdi), %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 2816(%rdi), %ymm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm9, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,7,14,0,0,7,14,0]
-; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm16
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm10, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3072(%rdi), %zmm2
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13]
-; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm6
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 1984(%rdi), %ymm0, %ymm2
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 1920(%rdi), %ymm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm22
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm6
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm12, %zmm6
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2112(%rdi), %zmm26
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2048(%rdi), %zmm13
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm24, %zmm13
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2176(%rdi), %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm13
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm13, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 3328(%rdi), %ymm0, %ymm13
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 3264(%rdi), %ymm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm0[0,1,2,3,4,5],ymm13[6,7]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3200(%rdi), %zmm11
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3136(%rdi), %zmm6
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm11, %zmm6, %zmm12
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm24, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 576(%rdi), %ymm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm27
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm2
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm9, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm28
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm24, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm24, %zmm5
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm24, %zmm8
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm24, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm24, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm15
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm7
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm24, %zmm7
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm24, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3456(%rdi), %zmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm10, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm2
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3392(%rdi), %zmm4
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm4, %zmm24
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3520(%rdi), %zmm4
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm24
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm24, %zmm10
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 2880(%rdi), %ymm10
+; AVX512DQ-BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT: # ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [5,12]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm12
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm9, %zmm12
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm6
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm31
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm8
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm9, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm18
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm29
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm10, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm2
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [9,0,7,0,9,0,7,0]
+; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm10, %zmm16
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,5,6,14,4,5,6,14]
+; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm12, %zmm16
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm16, %zmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 640(%rdi), %ymm13
+; AVX512DQ-BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm7 # 32-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT: # ymm7 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm13
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 1472(%rdi), %ymm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm26
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm7
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm9, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm9, %zmm13
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm13[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm24
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm13
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm16
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm10, %zmm13
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm12, %zmm13
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm13, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm7
+; AVX512DQ-BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm0 # 32-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm7
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm9, %zmm7
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm10, %zmm7
+; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm7 # 64-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 1536(%rdi), %ymm0
+; AVX512DQ-BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm19
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm9, %zmm1
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm13
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm10, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm2
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm10, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 1024(%rdi), %ymm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm19
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm6
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm9, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 1088(%rdi), %ymm0
+; AVX512DQ-BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm20
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm9, %zmm1
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm22
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm10, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm2
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm10, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 2432(%rdi), %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 2368(%rdi), %ymm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2304(%rdi), %zmm20
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2240(%rdi), %zmm5
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 2432(%rdi), %ymm0
+; AVX512DQ-BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm9, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm9, %zmm1
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2560(%rdi), %zmm15
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm10, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2624(%rdi), %zmm3
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm10, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 1984(%rdi), %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 1920(%rdi), %ymm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm21
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm2
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm9, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 1984(%rdi), %ymm0
+; AVX512DQ-BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm9, %zmm1
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2112(%rdi), %zmm14
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2048(%rdi), %zmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm10, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2176(%rdi), %zmm23
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm4, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm1 # 64-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm1 # 64-byte Folded Reload
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 3328(%rdi), %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 3264(%rdi), %ymm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3200(%rdi), %zmm17
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3136(%rdi), %zmm11
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm17, %zmm11, %zmm9
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm10, %zmm9
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm9
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm10, %zmm9
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm10, %zmm6
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm10, %zmm6
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm10, %zmm9
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm10, %zmm2
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm10, %zmm5
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm9
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm6
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm10, %zmm9
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm10, %zmm11
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm23
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm10, %zmm25
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm10, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm10, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm10, %zmm21
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm10, %zmm22
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3456(%rdi), %zmm24
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3392(%rdi), %zmm2
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm24, %zmm2, %zmm10
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3520(%rdi), %zmm30
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm4, %zmm10
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm15, %zmm10
+; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm10 # 64-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 3328(%rdi), %ymm0
+; AVX512DQ-BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm11, %zmm1, %zmm9
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 2880(%rdi), %ymm0
-; AVX512DQ-BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT: # ymm4 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [5,12]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm9
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm9
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [9,0,7,0,9,0,7,0]
-; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm12
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm4, %zmm12
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,6,14,4,5,6,14]
-; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm5
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 640(%rdi), %ymm9
-; AVX512DQ-BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm9[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm12
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm4, %zmm12
-; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm5
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm9
-; AVX512DQ-BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm12
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm12
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm12
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm4, %zmm12
-; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm5
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 1536(%rdi), %ymm9
-; AVX512DQ-BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm5
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm12
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm16
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm12
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm12
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm4, %zmm12
-; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm7
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 1088(%rdi), %ymm9
-; AVX512DQ-BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm12
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm12
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm12
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm4, %zmm12
-; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm9
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 2432(%rdi), %ymm9
-; AVX512DQ-BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm2 # 32-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm9
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm9
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm9
-; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm9 # 64-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm9 # 64-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm9, %zmm2
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 1984(%rdi), %ymm2
-; AVX512DQ-BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm3 # 64-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm10, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 3328(%rdi), %ymm2
-; AVX512DQ-BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm17, %zmm2, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm6 # 64-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm4, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [10,3,10,3,10,3,10,3]
+; AVX512DQ-BW-FCP-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm29, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm3 = [11,4,11,4,11,4,11,4]
+; AVX512DQ-BW-FCP-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm4, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm3, %zmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm4, %zmm19
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm4, %zmm20
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm4, %zmm21
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm17
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm8, %zmm24, %zmm4
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm10, %zmm4
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [12,5,12,5,12,5,12,5]
+; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm9, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm25 = [13,6,13,6,13,6,13,6]
+; AVX512DQ-BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm25, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [10,3,10,3,10,3,10,3]
-; AVX512DQ-BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm23, %zmm25
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [11,4,11,4,11,4,11,4]
-; AVX512DQ-BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm27, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [12,5,12,5,12,5,12,5]
-; AVX512DQ-BW-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm31, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [13,6,13,6,13,6,13,6]
-; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm12, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9]
; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm1, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm22
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm29, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm23, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm3, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm27, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm9, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm31, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm25, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm12, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm22
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm20
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm29, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm1, %zmm28
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm28
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm27, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm3, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm31, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm9, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm12, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm25, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm1, %zmm26
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm9
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm23, %zmm9
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm27, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm1, %zmm20
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm29, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm31, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm17
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm12, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm1, %zmm29
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm19
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm10
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm27, %zmm10
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm13
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm31, %zmm13
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm12, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm3, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm21
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm2
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm23, %zmm2
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm22
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm4
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm27, %zmm4
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm31, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm20
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm12, %zmm7
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm5
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm23, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm29
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm24, %zmm8, %zmm23
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm0 # 64-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm9, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm2
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm27, %zmm2
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm12, %zmm6
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm12, %zmm8
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm8 # 64-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm24, %zmm29, %zmm27
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm31, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm24, %zmm29, %zmm31
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm8
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm12, %zmm8
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm18
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm24, %zmm29, %zmm12
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm29
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm26
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm25, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm1, %zmm30
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm13
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm29, %zmm13
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm3, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm9, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm25, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm1, %zmm31
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm11
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm29, %zmm11
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm12
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm3, %zmm12
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm9, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm18
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm19
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm25, %zmm19
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm1, %zmm31
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm7
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm29, %zmm7
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm22
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm15, %zmm5, %zmm29
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2880(%rdi), %zmm5
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2816(%rdi), %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm25, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm8
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm4, %zmm8
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm31
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm5
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm25, %zmm5
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm30
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm25, %zmm5
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm26
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm5
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm25, %zmm5
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm24
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm5
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm25, %zmm5
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm23
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm5
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm25, %zmm5
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm10
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm5
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm25, %zmm5
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3328(%rdi), %zmm15
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3264(%rdi), %zmm14
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm5
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm25, %zmm5
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm27
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm22, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm6
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm6
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm22, %zmm9
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm25, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm16
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm3, %zmm22, %zmm25
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm27
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm28
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,9,0,5,6,9]
; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm11
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm25
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm9
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm28
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm22
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm19
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm1, %zmm23
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm1, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm11
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm13
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm7
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm29
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,10,0,5,6,10]
; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm2
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm28
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm2
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm9
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm9
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm10
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm4
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm1, %zmm27
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm8
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm8
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm8
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm1, %zmm8
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm8
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm8
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm12
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm5
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,11,0,5,6,11]
; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm28
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm28
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm9
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm6
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm8
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm8
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm1, %zmm8
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm8
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm5
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm18
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm9
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm17
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm13
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm20
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm1, %zmm31
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,12,0,5,6,12]
; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm18
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm28
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm28
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm28
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm16
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm16
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm9
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm7
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm1, %zmm12
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm8
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm5
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm1, %zmm5
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm5
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm5
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm19
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm25
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15]
; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm19
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm22
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm16
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm17
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm18
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm1, %zmm16
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm20
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm21
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm5
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm1, %zmm29
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm26
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm20
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm1, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm27
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm28
; AVX512DQ-BW-FCP-NEXT: movb $24, %al
-; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1
+; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k1} = zmm1[4,5,4,5],zmm15[4,5,4,5]
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [7,0,9,0,7,0,9,0]
-; AVX512DQ-BW-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k2} = zmm1[4,5,4,5],zmm31[4,5,4,5]
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,4,11]
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm2, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm3[4,5,4,5],zmm1[4,5,4,5]
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [14,0,0,7,14,0,0,7]
+; AVX512DQ-BW-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm30, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm17, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm3[4,5,4,5],zmm1[4,5,4,5]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm30, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [7,0,9,0,7,0,9,0]
+; AVX512DQ-BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm18, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm0 = [0,0,4,11]
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm3[4,5,4,5],zmm1[4,5,4,5]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm30, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm31 {%k2} = zmm1[4,5,4,5],zmm30[4,5,4,5]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm17, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm18, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm0, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 {%k2} = zmm1[4,5,4,5],zmm26[4,5,4,5]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm30
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm17, %zmm30
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm18, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k2} = zmm1[4,5,4,5],zmm24[4,5,4,5]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm17, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm18, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k1} = zmm0[4,5,4,5],zmm6[4,5,4,5]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k2} = zmm1[4,5,4,5],zmm23[4,5,4,5]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm24
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm17, %zmm24
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm18, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm0, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k1} = zmm15[4,5,4,5],zmm6[4,5,4,5]
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm28
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm30, %zmm28
-; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm15, %zmm6, %zmm30
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm15
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm25 {%k1} = zmm13[4,5,4,5],zmm3[4,5,4,5]
-; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm13
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm1[4,5,4,5],zmm10[4,5,4,5]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm17, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm18, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm0, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm14[4,5,4,5],zmm15[4,5,4,5]
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm21
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm17, %zmm21
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm15, %zmm14, %zmm17
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm23
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm18, %zmm23
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm14, %zmm15, %zmm18
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm14
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm1[4,5,4,5],zmm2[4,5,4,5]
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [6,13]
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm15 # 64-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 # 64-byte Folded Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 64-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT: # ymm0 = ymm15[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT: # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm0 # 32-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT: # ymm0 = ymm10[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm16, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload
; AVX512DQ-BW-FCP-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm16, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload
; AVX512DQ-BW-FCP-NEXT: # ymm0 = ymm3[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm17
-; AVX512DQ-BW-FCP-NEXT: vpblendd $240, (%rsp), %ymm1, %ymm0 # 32-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT: # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT: vpblendd $240, (%rsp), %ymm4, %ymm0 # 32-byte Folded Reload
; AVX512DQ-BW-FCP-NEXT: # ymm0 = ymm4[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT: # ymm0 = ymm5[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21
-; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT: # ymm0 = ymm6[0,1,2,3],mem[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm15[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm29
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm13[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm14[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm27, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT: # ymm0 = ymm5[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm28, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: movb $-32, %al
-; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2
+; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm19
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm31 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm24
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm9 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm25
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm22
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm7 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2}
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2}
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k2}
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k2}
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm4 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2}
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2}
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm27 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqa 2752(%rdi), %ymm0
-; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm0, %zmm28, %zmm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 512(%rdi), %ymm0
-; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 2688(%rdi), %ymm9
+; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm3 = ymm9[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm14 = [2,5]
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %ymm0, %ymm14, %ymm9
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm9, %zmm21, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 512(%rdi), %ymm9
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 448(%rdi), %ymm10
+; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm15 = ymm10[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %ymm9, %ymm14, %ymm10
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm10, %zmm1, %zmm10
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm9
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm11
+; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm19 = ymm9[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %ymm11, %ymm14, %ymm9
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm9, %zmm1, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 1408(%rdi), %ymm9
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 1344(%rdi), %ymm11
+; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm6 = ymm11[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %ymm9, %ymm14, %ymm11
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm11, %zmm30, %zmm11
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 960(%rdi), %ymm9
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 896(%rdi), %ymm12
+; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm16 = ymm12[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %ymm9, %ymm14, %ymm12
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm12, %zmm9, %zmm12
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm12 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 2304(%rdi), %ymm9
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2240(%rdi), %ymm21
+; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm27 = ymm21[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm21[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %ymm9, %ymm14, %ymm21
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm21, %zmm24, %zmm9
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm9 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1856(%rdi), %ymm21
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1792(%rdi), %ymm22
+; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm28 = ymm22[8,9,10,11,12,13,14,15],ymm21[0,1,2,3,4,5,6,7],ymm22[24,25,26,27,28,29,30,31],ymm21[16,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %ymm21, %ymm14, %ymm22
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm22, %zmm13, %zmm21
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm21 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3200(%rdi), %ymm22
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3136(%rdi), %ymm24
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %ymm22, %ymm24, %ymm14
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm14, %zmm17, %zmm14
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm13
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm13, %zmm23, %zmm13
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm0, %zmm3, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm3
-; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm3 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23]
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm3, %zmm4, %zmm3
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 1408(%rdi), %ymm4
-; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm5, %zmm4
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm4 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 960(%rdi), %ymm5
-; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm6, %zmm5
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm5 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 2304(%rdi), %ymm6
-; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23]
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm6, %zmm7, %zmm6
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 1856(%rdi), %ymm7
-; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23]
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm7, %zmm8, %zmm7
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 3200(%rdi), %ymm8
-; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm8, %zmm30, %zmm8
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm8 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 448(%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm12, 384(%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm12, 320(%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm12, 256(%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, 192(%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm12, 128(%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 64(%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm12, (%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 448(%rdx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 256(%rdx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 320(%rdx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 128(%rdx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 192(%rdx)
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm15, %zmm3, %zmm15
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm15 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm19, %xmm17
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm17, %zmm3, %zmm17
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm17 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm6, %xmm19
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm19, %zmm3, %zmm19
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm19 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm16
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm16, %zmm3, %zmm16
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm16 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm27, %xmm20
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm20, %zmm3, %zmm20
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm20 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm28, %xmm23
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm23, %zmm3, %zmm23
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm23 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm22 = ymm24[8,9,10,11,12,13,14,15],ymm22[0,1,2,3,4,5,6,7],ymm24[24,25,26,27,28,29,30,31],ymm22[16,17,18,19,20,21,22,23]
+; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm22, %xmm22
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm22, %zmm18, %zmm18
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm18 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 448(%rsi)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 384(%rsi)
+; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm3, 320(%rsi)
+; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm3, 256(%rsi)
+; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm3, 192(%rsi)
+; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm3, 128(%rsi)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, 64(%rsi)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, (%rsi)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 448(%rdx)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, 256(%rdx)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 320(%rdx)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 128(%rdx)
+; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, 192(%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, (%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, 64(%rdx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 384(%rdx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, 448(%rcx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, 256(%rcx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, 320(%rcx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, 128(%rcx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 192(%rcx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, (%rcx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 64(%rcx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 384(%rcx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 448(%r8)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 256(%r8)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 320(%r8)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 128(%r8)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 192(%r8)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%r8)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%r8)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 384(%r8)
+; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, 384(%rdx)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 448(%rcx)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 256(%rcx)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 320(%rcx)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, 128(%rcx)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 192(%rcx)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 64(%rcx)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 384(%rcx)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 448(%r8)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, 256(%r8)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, 320(%r8)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 128(%r8)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 192(%r8)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, (%r8)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, 64(%r8)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 384(%r8)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 448(%r9)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
@@ -25126,18 +25176,21 @@ define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 384(%rax)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, 448(%rax)
+; AVX512DQ-BW-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 448(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 256(%rax)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 320(%rax)
-; AVX512DQ-BW-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 320(%rax)
+; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 128(%rax)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 192(%rax)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, (%rax)
+; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, (%rax)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 64(%rax)
-; AVX512DQ-BW-FCP-NEXT: addq $7624, %rsp # imm = 0x1DC8
+; AVX512DQ-BW-FCP-NEXT: addq $6792, %rsp # imm = 0x1A88
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%wide.vec = load <448 x i64>, ptr %in.vec, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll
index 51b6222077f82..25c1ece991afc 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-8.ll
@@ -196,17 +196,18 @@ define void @load_i64_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-NEXT: vmovaps 96(%rdi), %xmm2
; AVX512-NEXT: vmovlhps {{.*#+}} xmm7 = xmm3[0],xmm2[0]
; AVX512-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1]
-; AVX512-NEXT: vmovaps 112(%rdi), %xmm3
-; AVX512-NEXT: vmovlhps {{.*#+}} xmm8 = xmm4[0],xmm3[0]
-; AVX512-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1]
+; AVX512-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm4[0],mem[0]
+; AVX512-NEXT: vpbroadcastq 120(%rdi), %xmm4
+; AVX512-NEXT: valignq {{.*#+}} zmm4 = mem[7],zmm4[0,1,2,3,4,5,6]
; AVX512-NEXT: vmovaps %xmm5, (%rsi)
; AVX512-NEXT: vmovaps %xmm0, (%rdx)
; AVX512-NEXT: vmovaps %xmm6, (%rcx)
; AVX512-NEXT: vmovaps %xmm1, (%r8)
; AVX512-NEXT: vmovaps %xmm7, (%r9)
; AVX512-NEXT: vmovaps %xmm2, (%r11)
-; AVX512-NEXT: vmovaps %xmm8, (%r10)
-; AVX512-NEXT: vmovaps %xmm3, (%rax)
+; AVX512-NEXT: vmovaps %xmm3, (%r10)
+; AVX512-NEXT: vmovdqa %xmm4, (%rax)
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: load_i64_stride8_vf2:
@@ -214,30 +215,34 @@ define void @load_i64_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX512-FCP-NEXT: vmovaps 64(%rdi), %xmm0
-; AVX512-FCP-NEXT: vmovaps (%rdi), %xmm1
-; AVX512-FCP-NEXT: vmovaps 16(%rdi), %xmm2
-; AVX512-FCP-NEXT: vmovaps 32(%rdi), %xmm3
-; AVX512-FCP-NEXT: vmovaps 48(%rdi), %xmm4
-; AVX512-FCP-NEXT: vmovlhps {{.*#+}} xmm5 = xmm1[0],xmm0[0]
-; AVX512-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX512-FCP-NEXT: vmovaps 80(%rdi), %xmm1
-; AVX512-FCP-NEXT: vmovlhps {{.*#+}} xmm6 = xmm2[0],xmm1[0]
-; AVX512-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1]
-; AVX512-FCP-NEXT: vmovaps 96(%rdi), %xmm2
+; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
+; AVX512-FCP-NEXT: vmovaps 64(%rdi), %xmm2
+; AVX512-FCP-NEXT: vmovaps (%rdi), %xmm3
+; AVX512-FCP-NEXT: vmovaps 16(%rdi), %xmm4
+; AVX512-FCP-NEXT: vmovaps 32(%rdi), %xmm5
+; AVX512-FCP-NEXT: vmovdqa 48(%rdi), %xmm6
; AVX512-FCP-NEXT: vmovlhps {{.*#+}} xmm7 = xmm3[0],xmm2[0]
; AVX512-FCP-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1]
-; AVX512-FCP-NEXT: vmovaps 112(%rdi), %xmm3
+; AVX512-FCP-NEXT: vmovaps 80(%rdi), %xmm3
; AVX512-FCP-NEXT: vmovlhps {{.*#+}} xmm8 = xmm4[0],xmm3[0]
; AVX512-FCP-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1]
-; AVX512-FCP-NEXT: vmovaps %xmm5, (%rsi)
-; AVX512-FCP-NEXT: vmovaps %xmm0, (%rdx)
-; AVX512-FCP-NEXT: vmovaps %xmm6, (%rcx)
-; AVX512-FCP-NEXT: vmovaps %xmm1, (%r8)
-; AVX512-FCP-NEXT: vmovaps %xmm7, (%r9)
-; AVX512-FCP-NEXT: vmovaps %xmm2, (%r11)
-; AVX512-FCP-NEXT: vmovaps %xmm8, (%r10)
-; AVX512-FCP-NEXT: vmovaps %xmm3, (%rax)
+; AVX512-FCP-NEXT: vmovaps 96(%rdi), %xmm4
+; AVX512-FCP-NEXT: vmovlhps {{.*#+}} xmm9 = xmm5[0],xmm4[0]
+; AVX512-FCP-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm4[1]
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm5 = [0,14]
+; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm6
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm5 = [7,15]
+; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm5
+; AVX512-FCP-NEXT: vmovaps %xmm7, (%rsi)
+; AVX512-FCP-NEXT: vmovaps %xmm2, (%rdx)
+; AVX512-FCP-NEXT: vmovaps %xmm8, (%rcx)
+; AVX512-FCP-NEXT: vmovaps %xmm3, (%r8)
+; AVX512-FCP-NEXT: vmovaps %xmm9, (%r9)
+; AVX512-FCP-NEXT: vmovaps %xmm4, (%r11)
+; AVX512-FCP-NEXT: vmovdqa %xmm6, (%r10)
+; AVX512-FCP-NEXT: vmovdqa %xmm5, (%rax)
+; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: load_i64_stride8_vf2:
@@ -258,17 +263,18 @@ define void @load_i64_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-NEXT: vmovaps 96(%rdi), %xmm2
; AVX512DQ-NEXT: vmovlhps {{.*#+}} xmm7 = xmm3[0],xmm2[0]
; AVX512DQ-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1]
-; AVX512DQ-NEXT: vmovaps 112(%rdi), %xmm3
-; AVX512DQ-NEXT: vmovlhps {{.*#+}} xmm8 = xmm4[0],xmm3[0]
-; AVX512DQ-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1]
+; AVX512DQ-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm4[0],mem[0]
+; AVX512DQ-NEXT: vpbroadcastq 120(%rdi), %xmm4
+; AVX512DQ-NEXT: valignq {{.*#+}} zmm4 = mem[7],zmm4[0,1,2,3,4,5,6]
; AVX512DQ-NEXT: vmovaps %xmm5, (%rsi)
; AVX512DQ-NEXT: vmovaps %xmm0, (%rdx)
; AVX512DQ-NEXT: vmovaps %xmm6, (%rcx)
; AVX512DQ-NEXT: vmovaps %xmm1, (%r8)
; AVX512DQ-NEXT: vmovaps %xmm7, (%r9)
; AVX512DQ-NEXT: vmovaps %xmm2, (%r11)
-; AVX512DQ-NEXT: vmovaps %xmm8, (%r10)
-; AVX512DQ-NEXT: vmovaps %xmm3, (%rax)
+; AVX512DQ-NEXT: vmovaps %xmm3, (%r10)
+; AVX512DQ-NEXT: vmovdqa %xmm4, (%rax)
+; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: load_i64_stride8_vf2:
@@ -276,30 +282,34 @@ define void @load_i64_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX512DQ-FCP-NEXT: vmovaps 64(%rdi), %xmm0
-; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %xmm1
-; AVX512DQ-FCP-NEXT: vmovaps 16(%rdi), %xmm2
-; AVX512DQ-FCP-NEXT: vmovaps 32(%rdi), %xmm3
-; AVX512DQ-FCP-NEXT: vmovaps 48(%rdi), %xmm4
-; AVX512DQ-FCP-NEXT: vmovlhps {{.*#+}} xmm5 = xmm1[0],xmm0[0]
-; AVX512DQ-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX512DQ-FCP-NEXT: vmovaps 80(%rdi), %xmm1
-; AVX512DQ-FCP-NEXT: vmovlhps {{.*#+}} xmm6 = xmm2[0],xmm1[0]
-; AVX512DQ-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1]
-; AVX512DQ-FCP-NEXT: vmovaps 96(%rdi), %xmm2
+; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
+; AVX512DQ-FCP-NEXT: vmovaps 64(%rdi), %xmm2
+; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %xmm3
+; AVX512DQ-FCP-NEXT: vmovaps 16(%rdi), %xmm4
+; AVX512DQ-FCP-NEXT: vmovaps 32(%rdi), %xmm5
+; AVX512DQ-FCP-NEXT: vmovdqa 48(%rdi), %xmm6
; AVX512DQ-FCP-NEXT: vmovlhps {{.*#+}} xmm7 = xmm3[0],xmm2[0]
; AVX512DQ-FCP-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1]
-; AVX512DQ-FCP-NEXT: vmovaps 112(%rdi), %xmm3
+; AVX512DQ-FCP-NEXT: vmovaps 80(%rdi), %xmm3
; AVX512DQ-FCP-NEXT: vmovlhps {{.*#+}} xmm8 = xmm4[0],xmm3[0]
; AVX512DQ-FCP-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1]
-; AVX512DQ-FCP-NEXT: vmovaps %xmm5, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovaps %xmm0, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovaps %xmm6, (%rcx)
-; AVX512DQ-FCP-NEXT: vmovaps %xmm1, (%r8)
-; AVX512DQ-FCP-NEXT: vmovaps %xmm7, (%r9)
-; AVX512DQ-FCP-NEXT: vmovaps %xmm2, (%r11)
-; AVX512DQ-FCP-NEXT: vmovaps %xmm8, (%r10)
-; AVX512DQ-FCP-NEXT: vmovaps %xmm3, (%rax)
+; AVX512DQ-FCP-NEXT: vmovaps 96(%rdi), %xmm4
+; AVX512DQ-FCP-NEXT: vmovlhps {{.*#+}} xmm9 = xmm5[0],xmm4[0]
+; AVX512DQ-FCP-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm4[1]
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm5 = [0,14]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm6
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm5 = [7,15]
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm5
+; AVX512DQ-FCP-NEXT: vmovaps %xmm7, (%rsi)
+; AVX512DQ-FCP-NEXT: vmovaps %xmm2, (%rdx)
+; AVX512DQ-FCP-NEXT: vmovaps %xmm8, (%rcx)
+; AVX512DQ-FCP-NEXT: vmovaps %xmm3, (%r8)
+; AVX512DQ-FCP-NEXT: vmovaps %xmm9, (%r9)
+; AVX512DQ-FCP-NEXT: vmovaps %xmm4, (%r11)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, (%r10)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, (%rax)
+; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
; AVX512BW-LABEL: load_i64_stride8_vf2:
@@ -320,17 +330,18 @@ define void @load_i64_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-NEXT: vmovaps 96(%rdi), %xmm2
; AVX512BW-NEXT: vmovlhps {{.*#+}} xmm7 = xmm3[0],xmm2[0]
; AVX512BW-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1]
-; AVX512BW-NEXT: vmovaps 112(%rdi), %xmm3
-; AVX512BW-NEXT: vmovlhps {{.*#+}} xmm8 = xmm4[0],xmm3[0]
-; AVX512BW-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1]
+; AVX512BW-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm4[0],mem[0]
+; AVX512BW-NEXT: vpbroadcastq 120(%rdi), %xmm4
+; AVX512BW-NEXT: valignq {{.*#+}} zmm4 = mem[7],zmm4[0,1,2,3,4,5,6]
; AVX512BW-NEXT: vmovaps %xmm5, (%rsi)
; AVX512BW-NEXT: vmovaps %xmm0, (%rdx)
; AVX512BW-NEXT: vmovaps %xmm6, (%rcx)
; AVX512BW-NEXT: vmovaps %xmm1, (%r8)
; AVX512BW-NEXT: vmovaps %xmm7, (%r9)
; AVX512BW-NEXT: vmovaps %xmm2, (%r11)
-; AVX512BW-NEXT: vmovaps %xmm8, (%r10)
-; AVX512BW-NEXT: vmovaps %xmm3, (%rax)
+; AVX512BW-NEXT: vmovaps %xmm3, (%r10)
+; AVX512BW-NEXT: vmovdqa %xmm4, (%rax)
+; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BW-FCP-LABEL: load_i64_stride8_vf2:
@@ -338,30 +349,34 @@ define void @load_i64_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX512BW-FCP-NEXT: vmovaps 64(%rdi), %xmm0
-; AVX512BW-FCP-NEXT: vmovaps (%rdi), %xmm1
-; AVX512BW-FCP-NEXT: vmovaps 16(%rdi), %xmm2
-; AVX512BW-FCP-NEXT: vmovaps 32(%rdi), %xmm3
-; AVX512BW-FCP-NEXT: vmovaps 48(%rdi), %xmm4
-; AVX512BW-FCP-NEXT: vmovlhps {{.*#+}} xmm5 = xmm1[0],xmm0[0]
-; AVX512BW-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX512BW-FCP-NEXT: vmovaps 80(%rdi), %xmm1
-; AVX512BW-FCP-NEXT: vmovlhps {{.*#+}} xmm6 = xmm2[0],xmm1[0]
-; AVX512BW-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1]
-; AVX512BW-FCP-NEXT: vmovaps 96(%rdi), %xmm2
+; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
+; AVX512BW-FCP-NEXT: vmovaps 64(%rdi), %xmm2
+; AVX512BW-FCP-NEXT: vmovaps (%rdi), %xmm3
+; AVX512BW-FCP-NEXT: vmovaps 16(%rdi), %xmm4
+; AVX512BW-FCP-NEXT: vmovaps 32(%rdi), %xmm5
+; AVX512BW-FCP-NEXT: vmovdqa 48(%rdi), %xmm6
; AVX512BW-FCP-NEXT: vmovlhps {{.*#+}} xmm7 = xmm3[0],xmm2[0]
; AVX512BW-FCP-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1]
-; AVX512BW-FCP-NEXT: vmovaps 112(%rdi), %xmm3
+; AVX512BW-FCP-NEXT: vmovaps 80(%rdi), %xmm3
; AVX512BW-FCP-NEXT: vmovlhps {{.*#+}} xmm8 = xmm4[0],xmm3[0]
; AVX512BW-FCP-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1]
-; AVX512BW-FCP-NEXT: vmovaps %xmm5, (%rsi)
-; AVX512BW-FCP-NEXT: vmovaps %xmm0, (%rdx)
-; AVX512BW-FCP-NEXT: vmovaps %xmm6, (%rcx)
-; AVX512BW-FCP-NEXT: vmovaps %xmm1, (%r8)
-; AVX512BW-FCP-NEXT: vmovaps %xmm7, (%r9)
-; AVX512BW-FCP-NEXT: vmovaps %xmm2, (%r11)
-; AVX512BW-FCP-NEXT: vmovaps %xmm8, (%r10)
-; AVX512BW-FCP-NEXT: vmovaps %xmm3, (%rax)
+; AVX512BW-FCP-NEXT: vmovaps 96(%rdi), %xmm4
+; AVX512BW-FCP-NEXT: vmovlhps {{.*#+}} xmm9 = xmm5[0],xmm4[0]
+; AVX512BW-FCP-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm4[1]
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm5 = [0,14]
+; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm6
+; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm5 = [7,15]
+; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm5
+; AVX512BW-FCP-NEXT: vmovaps %xmm7, (%rsi)
+; AVX512BW-FCP-NEXT: vmovaps %xmm2, (%rdx)
+; AVX512BW-FCP-NEXT: vmovaps %xmm8, (%rcx)
+; AVX512BW-FCP-NEXT: vmovaps %xmm3, (%r8)
+; AVX512BW-FCP-NEXT: vmovaps %xmm9, (%r9)
+; AVX512BW-FCP-NEXT: vmovaps %xmm4, (%r11)
+; AVX512BW-FCP-NEXT: vmovdqa %xmm6, (%r10)
+; AVX512BW-FCP-NEXT: vmovdqa %xmm5, (%rax)
+; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: load_i64_stride8_vf2:
@@ -382,17 +397,18 @@ define void @load_i64_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-NEXT: vmovaps 96(%rdi), %xmm2
; AVX512DQ-BW-NEXT: vmovlhps {{.*#+}} xmm7 = xmm3[0],xmm2[0]
; AVX512DQ-BW-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1]
-; AVX512DQ-BW-NEXT: vmovaps 112(%rdi), %xmm3
-; AVX512DQ-BW-NEXT: vmovlhps {{.*#+}} xmm8 = xmm4[0],xmm3[0]
-; AVX512DQ-BW-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1]
+; AVX512DQ-BW-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm4[0],mem[0]
+; AVX512DQ-BW-NEXT: vpbroadcastq 120(%rdi), %xmm4
+; AVX512DQ-BW-NEXT: valignq {{.*#+}} zmm4 = mem[7],zmm4[0,1,2,3,4,5,6]
; AVX512DQ-BW-NEXT: vmovaps %xmm5, (%rsi)
; AVX512DQ-BW-NEXT: vmovaps %xmm0, (%rdx)
; AVX512DQ-BW-NEXT: vmovaps %xmm6, (%rcx)
; AVX512DQ-BW-NEXT: vmovaps %xmm1, (%r8)
; AVX512DQ-BW-NEXT: vmovaps %xmm7, (%r9)
; AVX512DQ-BW-NEXT: vmovaps %xmm2, (%r11)
-; AVX512DQ-BW-NEXT: vmovaps %xmm8, (%r10)
-; AVX512DQ-BW-NEXT: vmovaps %xmm3, (%rax)
+; AVX512DQ-BW-NEXT: vmovaps %xmm3, (%r10)
+; AVX512DQ-BW-NEXT: vmovdqa %xmm4, (%rax)
+; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: load_i64_stride8_vf2:
@@ -400,30 +416,34 @@ define void @load_i64_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX512DQ-BW-FCP-NEXT: vmovaps 64(%rdi), %xmm0
-; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %xmm1
-; AVX512DQ-BW-FCP-NEXT: vmovaps 16(%rdi), %xmm2
-; AVX512DQ-BW-FCP-NEXT: vmovaps 32(%rdi), %xmm3
-; AVX512DQ-BW-FCP-NEXT: vmovaps 48(%rdi), %xmm4
-; AVX512DQ-BW-FCP-NEXT: vmovlhps {{.*#+}} xmm5 = xmm1[0],xmm0[0]
-; AVX512DQ-BW-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
-; AVX512DQ-BW-FCP-NEXT: vmovaps 80(%rdi), %xmm1
-; AVX512DQ-BW-FCP-NEXT: vmovlhps {{.*#+}} xmm6 = xmm2[0],xmm1[0]
-; AVX512DQ-BW-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1]
-; AVX512DQ-BW-FCP-NEXT: vmovaps 96(%rdi), %xmm2
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
+; AVX512DQ-BW-FCP-NEXT: vmovaps 64(%rdi), %xmm2
+; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %xmm3
+; AVX512DQ-BW-FCP-NEXT: vmovaps 16(%rdi), %xmm4
+; AVX512DQ-BW-FCP-NEXT: vmovaps 32(%rdi), %xmm5
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 48(%rdi), %xmm6
; AVX512DQ-BW-FCP-NEXT: vmovlhps {{.*#+}} xmm7 = xmm3[0],xmm2[0]
; AVX512DQ-BW-FCP-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1]
-; AVX512DQ-BW-FCP-NEXT: vmovaps 112(%rdi), %xmm3
+; AVX512DQ-BW-FCP-NEXT: vmovaps 80(%rdi), %xmm3
; AVX512DQ-BW-FCP-NEXT: vmovlhps {{.*#+}} xmm8 = xmm4[0],xmm3[0]
; AVX512DQ-BW-FCP-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1]
-; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm5, (%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm0, (%rdx)
-; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm6, (%rcx)
-; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm1, (%r8)
-; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm7, (%r9)
-; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm2, (%r11)
-; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm8, (%r10)
-; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm3, (%rax)
+; AVX512DQ-BW-FCP-NEXT: vmovaps 96(%rdi), %xmm4
+; AVX512DQ-BW-FCP-NEXT: vmovlhps {{.*#+}} xmm9 = xmm5[0],xmm4[0]
+; AVX512DQ-BW-FCP-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm4[1]
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm5 = [0,14]
+; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm5, %zmm6
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm5 = [7,15]
+; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm5
+; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm7, (%rsi)
+; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm2, (%rdx)
+; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm8, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm3, (%r8)
+; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm9, (%r9)
+; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm4, (%r11)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm6, (%r10)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm5, (%rax)
+; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%wide.vec = load <16 x i64>, ptr %in.vec, align 64
%strided.vec0 = shufflevector <16 x i64> %wide.vec, <16 x i64> poison, <2 x i32> <i32 0, i32 8>
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll
index d1d7cb0a34332..592f67c72f1d4 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll
@@ -2110,6 +2110,7 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;
; AVX-LABEL: load_i8_stride3_vf64:
; AVX: # %bb.0:
+; AVX-NEXT: pushq %rax
; AVX-NEXT: vmovdqa (%rdi), %xmm6
; AVX-NEXT: vmovdqa 16(%rdi), %xmm0
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -2119,6 +2120,7 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovdqa 96(%rdi), %xmm11
; AVX-NEXT: vmovdqa 112(%rdi), %xmm3
+; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovdqa 144(%rdi), %xmm10
; AVX-NEXT: vmovdqa 160(%rdi), %xmm1
; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -2171,55 +2173,56 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX-NEXT: vpor %xmm0, %xmm8, %xmm2
; AVX-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
; AVX-NEXT: vpshufb %xmm15, %xmm4, %xmm1
-; AVX-NEXT: vpor %xmm1, %xmm5, %xmm1
+; AVX-NEXT: vpor %xmm1, %xmm5, %xmm3
; AVX-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7,8,9,10]
; AVX-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,4,5,6,7,8,9,10,128,128,128,128,128]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vpshufb %xmm6, %xmm0, %xmm15
-; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [128,128,128,128,128,128,128,128,128,128,128,2,5,8,11,14]
-; AVX-NEXT: vpshufb %xmm0, %xmm14, %xmm14
+; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,2,5,8,11,14]
+; AVX-NEXT: vpshufb %xmm1, %xmm14, %xmm14
; AVX-NEXT: vpor %xmm14, %xmm15, %xmm14
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; AVX-NEXT: vpshufb %xmm6, %xmm15, %xmm15
-; AVX-NEXT: vpshufb %xmm0, %xmm10, %xmm10
-; AVX-NEXT: vpor %xmm10, %xmm15, %xmm10
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; AVX-NEXT: vpshufb %xmm6, %xmm15, %xmm15
-; AVX-NEXT: vpshufb %xmm0, %xmm12, %xmm12
-; AVX-NEXT: vpor %xmm12, %xmm15, %xmm12
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; AVX-NEXT: vpshufb %xmm6, %xmm15, %xmm6
-; AVX-NEXT: vpshufb %xmm0, %xmm4, %xmm0
-; AVX-NEXT: vpor %xmm0, %xmm6, %xmm0
+; AVX-NEXT: vpmovsxdq {{.*#+}} xmm15 = [18446744073709551615,16777215]
+; AVX-NEXT: vpand {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: vpshufb %xmm1, %xmm10, %xmm10
+; AVX-NEXT: vpor %xmm0, %xmm10, %xmm0
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX-NEXT: vpshufb %xmm6, %xmm10, %xmm6
+; AVX-NEXT: vpshufb %xmm1, %xmm12, %xmm10
+; AVX-NEXT: vpor %xmm6, %xmm10, %xmm6
+; AVX-NEXT: vpand {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm10 # 16-byte Folded Reload
+; AVX-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX-NEXT: vpor %xmm1, %xmm10, %xmm1
; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4]
-; AVX-NEXT: vpshufb %xmm4, %xmm7, %xmm6
-; AVX-NEXT: vmovdqa {{.*#+}} xmm7 = [128,128,128,128,128,128,2,5,8,11,14,128,128,128,128,128]
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; AVX-NEXT: vpshufb %xmm7, %xmm15, %xmm15
-; AVX-NEXT: vpor %xmm6, %xmm15, %xmm6
+; AVX-NEXT: vpshufb %xmm4, %xmm7, %xmm7
+; AVX-NEXT: vmovdqa {{.*#+}} xmm10 = [128,128,128,128,128,128,2,5,8,11,14,128,128,128,128,128]
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; AVX-NEXT: vpshufb %xmm10, %xmm12, %xmm12
+; AVX-NEXT: vpor %xmm7, %xmm12, %xmm7
; AVX-NEXT: vpshufb %xmm4, %xmm11, %xmm11
-; AVX-NEXT: vpshufb %xmm7, %xmm3, %xmm15
-; AVX-NEXT: vpor %xmm15, %xmm11, %xmm11
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; AVX-NEXT: vpshufb %xmm10, %xmm12, %xmm12
+; AVX-NEXT: vpor %xmm12, %xmm11, %xmm11
; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX-NEXT: vpshufb %xmm7, %xmm3, %xmm3
-; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vpshufb %xmm4, %xmm1, %xmm1
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX-NEXT: vpshufb %xmm7, %xmm3, %xmm3
-; AVX-NEXT: vpor %xmm3, %xmm1, %xmm1
-; AVX-NEXT: vmovdqa %xmm1, (%rsi)
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; AVX-NEXT: vpshufb %xmm10, %xmm12, %xmm12
+; AVX-NEXT: vpor %xmm2, %xmm12, %xmm2
+; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX-NEXT: vpshufb %xmm10, %xmm4, %xmm4
+; AVX-NEXT: vpor %xmm4, %xmm3, %xmm3
+; AVX-NEXT: vmovdqa %xmm3, (%rsi)
; AVX-NEXT: vmovdqa %xmm2, 48(%rsi)
; AVX-NEXT: vmovdqa %xmm11, 32(%rsi)
-; AVX-NEXT: vmovdqa %xmm6, 16(%rsi)
-; AVX-NEXT: vmovdqa %xmm0, (%rdx)
-; AVX-NEXT: vmovdqa %xmm12, 48(%rdx)
-; AVX-NEXT: vmovdqa %xmm10, 32(%rdx)
+; AVX-NEXT: vmovdqa %xmm7, 16(%rsi)
+; AVX-NEXT: vmovdqa %xmm1, (%rdx)
+; AVX-NEXT: vmovdqa %xmm6, 48(%rdx)
+; AVX-NEXT: vmovdqa %xmm0, 32(%rdx)
; AVX-NEXT: vmovdqa %xmm14, 16(%rdx)
; AVX-NEXT: vmovdqa %xmm5, (%rcx)
; AVX-NEXT: vmovdqa %xmm8, 48(%rcx)
; AVX-NEXT: vmovdqa %xmm9, 32(%rcx)
; AVX-NEXT: vmovdqa %xmm13, 16(%rcx)
+; AVX-NEXT: popq %rax
; AVX-NEXT: retq
;
; AVX2-LABEL: load_i8_stride3_vf64:
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll
index ac14f55e3f0ed..11e2509795ad3 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll
@@ -6617,8 +6617,8 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,xmm9[4,9,14],zero,zero,zero,xmm9[2,7,12,u,u,u]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,5,10,15],zero,zero,zero,xmm8[3,8,13],zero,zero,zero,xmm8[u,u,u]
; AVX512-FCP-NEXT: vpor %xmm9, %xmm8, %xmm10
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm20) | ymm7
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm19 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm19) | ymm7
; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm26
; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %ymm8
; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm11
@@ -6629,31 +6629,33 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm12
; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm17 = [0,5,0,5,0,5,0,5]
-; AVX512-FCP-NEXT: vpermd %ymm12, %ymm17, %ymm15
+; AVX512-FCP-NEXT: vpermd %ymm12, %ymm17, %ymm12
; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = (ymm15 & ~ymm16) | ymm11
-; AVX512-FCP-NEXT: vmovdqa 144(%rdi), %xmm12
-; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm12, %xmm6
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = (ymm12 & ~ymm16) | ymm11
+; AVX512-FCP-NEXT: vmovdqa 144(%rdi), %xmm11
+; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm11, %xmm6
; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %xmm13
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
-; AVX512-FCP-NEXT: vpor %xmm6, %xmm11, %xmm6
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
+; AVX512-FCP-NEXT: vpor %xmm6, %xmm14, %xmm6
; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm6, %zmm6
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm6, %zmm6
; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm21 & (zmm6 ^ zmm10))
+; AVX512-FCP-NEXT: vpandnq %zmm6, %zmm21, %zmm6
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = (zmm10 & zmm21) | zmm6
; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %ymm14
-; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %ymm11
-; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm10
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm14 ^ (ymm10 & (ymm11 ^ ymm14))
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm0
+; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %ymm12
+; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm15
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm14 ^ (ymm15 & (ymm12 ^ ymm14))
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm15, %xmm0
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u],zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[1,6,11]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,4,9,14],zero,zero,zero,xmm10[2,7,12],zero,zero,zero
-; AVX512-FCP-NEXT: vpor %xmm0, %xmm10, %xmm0
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,4,9,14],zero,zero,zero,xmm15[2,7,12],zero,zero,zero
+; AVX512-FCP-NEXT: vpor %xmm0, %xmm15, %xmm0
; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm19
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [8,9,10,11,12,21,22,23]
+; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm6, %zmm15
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm10, %zmm20
; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm0
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm14 ^ ymm12))
; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm6
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,0,5,10,15],zero,zero,zero,xmm0[3,8,13],zero,zero,zero
@@ -6684,15 +6686,15 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[0,5,10,15],zero,zero,zero,xmm2[3,8,13,u,u,u]
; AVX512-FCP-NEXT: vpor %xmm7, %xmm2, %xmm2
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm20) | ymm3
-; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm0
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm19) | ymm3
+; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm11, %xmm0
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
; AVX512-FCP-NEXT: vpor %xmm0, %xmm3, %xmm0
; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm2))
; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm18
; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm0
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm14 ^ ymm12))
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,1,6,11],zero,zero,zero,zero,xmm0[4,9,14],zero,zero,zero
; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero,xmm0[3,8,13]
@@ -6720,15 +6722,15 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15],zero,zero,zero,xmm3[u,u,u]
; AVX512-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm20) | ymm2
-; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm0
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm19) | ymm2
+; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm11, %xmm0
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
; AVX512-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm3))
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm19
; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm0
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm14 ^ (ymm0 & (ymm11 ^ ymm14))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm14 ^ (ymm0 & (ymm12 ^ ymm14))
; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,zero,xmm1[1,6,11],zero,zero,zero,zero,xmm1[4,9,14]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,2,7,12],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero
@@ -6757,16 +6759,16 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15,u,u,u]
; AVX512-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3
; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & mem) | ymm2
-; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm0
+; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm11, %xmm0
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
; AVX512-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512-FCP-NEXT: vpmovsxwq {{.*#+}} zmm2 = [0,0,0,18446744073709551360,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615]
; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm3 ^ (zmm2 & (zmm0 ^ zmm3))
; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm14 ^ (ymm4 & (ymm11 ^ ymm14))
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[u,u,u,3,8,13],zero,zero,zero,xmm11[1,6,11],zero,zero,zero,zero
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm11, %xmm3
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm14 ^ (ymm4 & (ymm12 ^ ymm14))
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm12[u,u,u,3,8,13],zero,zero,zero,xmm12[1,6,11],zero,zero,zero,zero
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm3
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15]
; AVX512-FCP-NEXT: vpor %xmm1, %xmm3, %xmm1
; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
@@ -6794,9 +6796,9 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FCP-NEXT: vpermd %ymm4, %ymm17, %ymm4
; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm1 ^ (zmm2 & (zmm4 ^ zmm1))
; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm1
-; AVX512-FCP-NEXT: vmovdqa64 %zmm19, (%rsi)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm20, (%rsi)
; AVX512-FCP-NEXT: vmovdqa64 %zmm18, (%rdx)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm20, (%rcx)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm19, (%rcx)
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%r8)
; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%r9)
; AVX512-FCP-NEXT: vzeroupper
@@ -7027,8 +7029,8 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,zero,xmm9[4,9,14],zero,zero,zero,xmm9[2,7,12,u,u,u]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,5,10,15],zero,zero,zero,xmm8[3,8,13],zero,zero,zero,xmm8[u,u,u]
; AVX512DQ-FCP-NEXT: vpor %xmm9, %xmm8, %xmm10
-; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm20) | ymm7
+; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm19 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm19) | ymm7
; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm26
; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %ymm8
; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm11
@@ -7039,31 +7041,33 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm12
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,5,10,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,25,30,u,u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm17 = [0,5,0,5,0,5,0,5]
-; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm17, %ymm15
+; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm17, %ymm12
; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = (ymm15 & ~ymm16) | ymm11
-; AVX512DQ-FCP-NEXT: vmovdqa 144(%rdi), %xmm12
-; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm12, %xmm6
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = (ymm12 & ~ymm16) | ymm11
+; AVX512DQ-FCP-NEXT: vmovdqa 144(%rdi), %xmm11
+; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm11, %xmm6
; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %xmm13
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm13[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm11, %xmm6
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
+; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm14, %xmm6
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm6, %zmm6
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm6, %zmm6
; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm21 & (zmm6 ^ zmm10))
+; AVX512DQ-FCP-NEXT: vpandnq %zmm6, %zmm21, %zmm6
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = (zmm10 & zmm21) | zmm6
; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %ymm14
-; AVX512DQ-FCP-NEXT: vmovdqa 288(%rdi), %ymm11
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm10
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm14 ^ (ymm10 & (ymm11 ^ ymm14))
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm0
+; AVX512DQ-FCP-NEXT: vmovdqa 288(%rdi), %ymm12
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm15
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm14 ^ (ymm15 & (ymm12 ^ ymm14))
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm15, %xmm0
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u],zero,zero,zero,xmm0[3,8,13],zero,zero,zero,xmm0[1,6,11]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,4,9,14],zero,zero,zero,xmm10[2,7,12],zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm10, %xmm0
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,4,9,14],zero,zero,zero,xmm15[2,7,12],zero,zero,zero
+; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm15, %xmm0
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm19
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [8,9,10,11,12,21,22,23]
+; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm6, %zmm15
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm10, %zmm20
; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm0
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm14 ^ ymm12))
; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm6
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,0,5,10,15],zero,zero,zero,xmm0[3,8,13],zero,zero,zero
@@ -7094,15 +7098,15 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[0,5,10,15],zero,zero,zero,xmm2[3,8,13,u,u,u]
; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm20) | ymm3
-; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm0
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm19) | ymm3
+; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm11, %xmm0
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm3, %xmm0
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm2))
; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm18
; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm0
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm11 ^ (ymm0 & (ymm14 ^ ymm11))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm14 ^ ymm12))
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,u,1,6,11],zero,zero,zero,zero,xmm0[4,9,14],zero,zero,zero
; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero,xmm0[3,8,13]
@@ -7130,15 +7134,15 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15],zero,zero,zero,xmm3[u,u,u]
; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm20) | ymm2
-; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm0
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm19) | ymm2
+; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm11, %xmm0
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm21 & (zmm0 ^ zmm3))
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm19
; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm0
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm14 ^ (ymm0 & (ymm11 ^ ymm14))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm14 ^ (ymm0 & (ymm12 ^ ymm14))
; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,zero,xmm1[1,6,11],zero,zero,zero,zero,xmm1[4,9,14]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,2,7,12],zero,zero,zero,xmm0[0,5,10,15],zero,zero,zero
@@ -7167,16 +7171,16 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15,u,u,u]
; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3
; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & mem) | ymm2
-; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm0
+; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm11, %xmm0
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm13[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512DQ-FCP-NEXT: vpmovsxwq {{.*#+}} zmm2 = [0,0,0,18446744073709551360,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615]
; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm3 ^ (zmm2 & (zmm0 ^ zmm3))
; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm14 ^ (ymm4 & (ymm11 ^ ymm14))
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[u,u,u,3,8,13],zero,zero,zero,xmm11[1,6,11],zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm3
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm14 ^ (ymm4 & (ymm12 ^ ymm14))
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm12[u,u,u,3,8,13],zero,zero,zero,xmm12[1,6,11],zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm3
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,zero,xmm3[2,7,12],zero,zero,zero,xmm3[0,5,10,15]
; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm3, %xmm1
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
@@ -7204,9 +7208,9 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm17, %ymm4
; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm1 ^ (zmm2 & (zmm4 ^ zmm1))
; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, (%rsi)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, (%rsi)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, (%rcx)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, (%rcx)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%r8)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%r9)
; AVX512DQ-FCP-NEXT: vzeroupper
@@ -7472,52 +7476,52 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FCP-NEXT: movl $127, %eax
; AVX512BW-FCP-NEXT: kmovd %eax, %k4
; AVX512BW-FCP-NEXT: vmovdqu8 %ymm8, %ymm7 {%k4}
-; AVX512BW-FCP-NEXT: vmovdqa 144(%rdi), %xmm11
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[1,6,11]
-; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %xmm12
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
-; AVX512BW-FCP-NEXT: vpor %xmm8, %xmm10, %xmm8
+; AVX512BW-FCP-NEXT: vmovdqa 144(%rdi), %xmm10
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm10[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm10[1,6,11]
+; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %xmm11
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
+; AVX512BW-FCP-NEXT: vpor %xmm8, %xmm12, %xmm8
; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm7, %zmm9 {%k5}
-; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm9, %ymm10
; AVX512BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm8
; AVX512BW-FCP-NEXT: vmovdqa 288(%rdi), %ymm7
-; AVX512BW-FCP-NEXT: vpblendmw %ymm8, %ymm7, %ymm13 {%k2}
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm14
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u],zero,zero,zero,xmm14[3,8,13],zero,zero,zero,xmm14[1,6,11]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,4,9,14],zero,zero,zero,xmm13[2,7,12],zero,zero,zero
-; AVX512BW-FCP-NEXT: vpor %xmm14, %xmm13, %xmm13
-; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm13[5,6,7]
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm20
+; AVX512BW-FCP-NEXT: vpblendmw %ymm8, %ymm7, %ymm12 {%k2}
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u],zero,zero,zero,xmm13[3,8,13],zero,zero,zero,xmm13[1,6,11]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,4,9,14],zero,zero,zero,xmm12[2,7,12],zero,zero,zero
+; AVX512BW-FCP-NEXT: vpor %xmm13, %xmm12, %xmm12
+; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [8,9,10,11,12,21,22,23]
+; AVX512BW-FCP-NEXT: vpermi2d %zmm12, %zmm9, %zmm13
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm9, %zmm20
; AVX512BW-FCP-NEXT: movw $10570, %ax # imm = 0x294A
; AVX512BW-FCP-NEXT: kmovd %eax, %k3
-; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm10 {%k3}
-; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1]
+; AVX512BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm12 {%k3}
+; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm12[2,3,0,1]
; AVX512BW-FCP-NEXT: movl $-2078212096, %eax # imm = 0x84210000
; AVX512BW-FCP-NEXT: kmovd %eax, %k6
-; AVX512BW-FCP-NEXT: vmovdqu8 %ymm13, %ymm10 {%k6}
+; AVX512BW-FCP-NEXT: vmovdqu8 %ymm13, %ymm12 {%k6}
; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm13 {%k2}
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[1,6,11],zero,zero,zero,zero,xmm13[4,9,14],zero,zero,zero,xmm13[u,u,u]
; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm13
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[0,5,10,15],zero,zero,zero,xmm13[3,8,13,u,u,u]
; AVX512BW-FCP-NEXT: vpor %xmm14, %xmm13, %xmm14
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm5, %ymm10 {%k2}
-; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm5, %ymm12 {%k2}
+; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm12[2,3,0,1]
; AVX512BW-FCP-NEXT: movl $8456, %eax # imm = 0x2108
; AVX512BW-FCP-NEXT: kmovd %eax, %k6
-; AVX512BW-FCP-NEXT: vmovdqu8 %ymm13, %ymm10 {%k6}
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm10[u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %xmm10
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm10[1,6,11],zero,zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vmovdqu8 %ymm13, %ymm12 {%k6}
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm12[u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %xmm12
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm12[1,6,11],zero,zero,zero,zero,xmm12[u,u,u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vmovdqa 176(%rdi), %xmm13
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[0,5,10,15,u,u,u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm17, %xmm16
; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k4}
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[2,7,12]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm12[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm10[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm10[2,7,12]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm11[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm17, %xmm16
; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16
; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm16, %zmm15
@@ -7551,12 +7555,12 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm15 {%k6}
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[1,6,11,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm10[2,7,12],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm12[2,7,12],zero,zero,zero,xmm12[u,u,u,u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vporq %xmm17, %xmm18, %xmm4
; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm15[3,4,5,6,7]
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[3,8,13]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm12[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm10[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm10[3,8,13]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm11[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm17, %xmm15
; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm15, %zmm4
@@ -7590,26 +7594,26 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm4 {%k5}
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[2,7,12,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm10[3,8,13],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm12[3,8,13],zero,zero,zero,xmm12[u,u,u,u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vporq %xmm17, %xmm18, %xmm9
; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm4[3,4,5,6,7]
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm11[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm11[4,9,14]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
-; AVX512BW-FCP-NEXT: vpor %xmm9, %xmm11, %xmm9
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm10[4,9,14]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm11[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
+; AVX512BW-FCP-NEXT: vpor %xmm9, %xmm10, %xmm9
; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm9, %zmm4
; AVX512BW-FCP-NEXT: movl $33554431, %eax # imm = 0x1FFFFFF
; AVX512BW-FCP-NEXT: kmovq %rax, %k5
; AVX512BW-FCP-NEXT: vmovdqu8 %zmm16, %zmm4 {%k5}
; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm4, %ymm9
-; AVX512BW-FCP-NEXT: vpblendmw %ymm8, %ymm7, %ymm11 {%k3}
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u],zero,zero,zero,xmm12[1,6,11],zero,zero,zero,zero,xmm12[4,9,14]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,2,7,12],zero,zero,zero,xmm11[0,5,10,15],zero,zero,zero
-; AVX512BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11
-; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512BW-FCP-NEXT: vmovdqu8 %ymm11, %ymm9 {%k4}
+; AVX512BW-FCP-NEXT: vpblendmw %ymm8, %ymm7, %ymm10 {%k3}
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm10, %xmm11
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u],zero,zero,zero,xmm11[1,6,11],zero,zero,zero,zero,xmm11[4,9,14]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,2,7,12],zero,zero,zero,xmm10[0,5,10,15],zero,zero,zero
+; AVX512BW-FCP-NEXT: vpor %xmm11, %xmm10, %xmm10
+; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512BW-FCP-NEXT: vmovdqu8 %ymm10, %ymm9 {%k4}
; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4
; AVX512BW-FCP-NEXT: vmovdqu16 %ymm3, %ymm2 {%k2}
; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
@@ -7630,7 +7634,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm5 {%k2}
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm13[3,8,13,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[4,9,14],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm12[4,9,14],zero,zero,zero,xmm12[u,u,u,u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7]
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
@@ -7916,52 +7920,52 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-FCP-NEXT: movl $127, %eax
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k4
; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm8, %ymm7 {%k4}
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 144(%rdi), %xmm11
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[1,6,11]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %xmm12
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm12[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm8, %xmm10, %xmm8
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 144(%rdi), %xmm10
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm10[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm10[1,6,11]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %xmm11
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[u,u,u,u,u,u,u,u,u,u,2,7,12],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm8, %xmm12, %xmm8
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm7, %zmm9 {%k5}
-; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm9, %ymm10
; AVX512DQ-BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm8
; AVX512DQ-BW-FCP-NEXT: vmovdqa 288(%rdi), %ymm7
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm8, %ymm7, %ymm13 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm14
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u],zero,zero,zero,xmm14[3,8,13],zero,zero,zero,xmm14[1,6,11]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,4,9,14],zero,zero,zero,xmm13[2,7,12],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm14, %xmm13, %xmm13
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm13[5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm20
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm8, %ymm7, %ymm12 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u],zero,zero,zero,xmm13[3,8,13],zero,zero,zero,xmm13[1,6,11]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,4,9,14],zero,zero,zero,xmm12[2,7,12],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm13, %xmm12, %xmm12
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [8,9,10,11,12,21,22,23]
+; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm12, %zmm9, %zmm13
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm9, %zmm20
; AVX512DQ-BW-FCP-NEXT: movw $10570, %ax # imm = 0x294A
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k3
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm10 {%k3}
-; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1]
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm0, %ymm1, %ymm12 {%k3}
+; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm12[2,3,0,1]
; AVX512DQ-BW-FCP-NEXT: movl $-2078212096, %eax # imm = 0x84210000
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k6
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm13, %ymm10 {%k6}
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm13, %ymm12 {%k6}
; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm13 {%k2}
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[1,6,11],zero,zero,zero,zero,xmm13[4,9,14],zero,zero,zero,xmm13[u,u,u]
; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm13
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[0,5,10,15],zero,zero,zero,xmm13[3,8,13,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpor %xmm14, %xmm13, %xmm14
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm5, %ymm10 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm10[2,3,0,1]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,2,7,12,17,22,27,16,21,26,31,20,25,30,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm5, %ymm12 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm12[2,3,0,1]
; AVX512DQ-BW-FCP-NEXT: movl $8456, %eax # imm = 0x2108
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k6
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm13, %ymm10 {%k6}
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm10[u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %xmm10
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm10[1,6,11],zero,zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm13, %ymm12 {%k6}
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm12[u,u,u,u,u,u,u,4,9,14,3,8,13,2,7,12,17,22,27,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %xmm12
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm12[1,6,11],zero,zero,zero,zero,xmm12[u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vmovdqa 176(%rdi), %xmm13
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[0,5,10,15,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm17, %xmm16
; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k4}
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[2,7,12]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm12[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm10[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm10[2,7,12]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm11[u,u,u,u,u,u,u,u,u,u,3,8,13],zero,zero,zero
; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm17, %xmm16
; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm16, %zmm15
@@ -7995,12 +7999,12 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm15 {%k6}
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,0,5,10,15,4,9,14,3,8,13,18,23,28,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[1,6,11,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm10[2,7,12],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm12[2,7,12],zero,zero,zero,xmm12[u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vporq %xmm17, %xmm18, %xmm4
; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm15[3,4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[3,8,13]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm12[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm10[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm10[3,8,13]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm11[u,u,u,u,u,u,u,u,u,u,4,9,14],zero,zero,zero
; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm17, %xmm15
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm15, %zmm4
@@ -8034,26 +8038,26 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm4 {%k5}
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,1,6,11,0,5,10,15,4,9,14,19,24,29,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,zero,xmm13[2,7,12,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm10[3,8,13],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm12[3,8,13],zero,zero,zero,xmm12[u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vporq %xmm17, %xmm18, %xmm9
; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm4[3,4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm11[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm11[4,9,14]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm12[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm9, %xmm11, %xmm9
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm10[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,xmm10[4,9,14]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm11[u,u,u,u,u,u,u,u,u,0,5,10,15],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm9, %xmm10, %xmm9
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm9, %zmm4
; AVX512DQ-BW-FCP-NEXT: movl $33554431, %eax # imm = 0x1FFFFFF
; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k5
; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm16, %zmm4 {%k5}
; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm4, %ymm9
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm8, %ymm7, %ymm11 {%k3}
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u],zero,zero,zero,xmm12[1,6,11],zero,zero,zero,zero,xmm12[4,9,14]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,2,7,12],zero,zero,zero,xmm11[0,5,10,15],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm11, %ymm9 {%k4}
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm8, %ymm7, %ymm10 {%k3}
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm10, %xmm11
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u],zero,zero,zero,xmm11[1,6,11],zero,zero,zero,zero,xmm11[4,9,14]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,2,7,12],zero,zero,zero,xmm10[0,5,10,15],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm11, %xmm10, %xmm10
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm10, %ymm9 {%k4}
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm3, %ymm2 {%k2}
; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
@@ -8074,7 +8078,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm1, %ymm5 {%k2}
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,2,7,12,1,6,11,0,5,10,15,20,25,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm13[3,8,13,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[4,9,14],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm12[4,9,14],zero,zero,zero,xmm12[u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm1[3,4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
index 5ab09194c5b83..489433aeaf16e 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll
@@ -3297,99 +3297,104 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,128,5,12,128,128,1,8,15,128,128,4,11,128,128]
+; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
+; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,5,12,128,128,1,8,15,128,128,4,11,128,128]
; AVX512BW-FCP-NEXT: vmovdqa 80(%rdi), %xmm0
-; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm4
-; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm2
-; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512BW-FCP-NEXT: movw $-28382, %r11w # imm = 0x9122
-; AVX512BW-FCP-NEXT: kmovd %r11d, %k1
-; AVX512BW-FCP-NEXT: vpblendmw %ymm1, %ymm2, %ymm5 {%k1}
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6
-; AVX512BW-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm3
-; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1,2,3,4],xmm4[5,6,7]
-; AVX512BW-FCP-NEXT: vmovdqa 96(%rdi), %xmm3
-; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm4
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm3[0],xmm4[1],xmm3[2],xmm4[3]
-; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [0,7,14,128,128,3,10,128,128,128,6,13,128,128,2,9]
-; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm7
-; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm5
-; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm7[5,6,7]
-; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5
+; AVX512BW-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm5
+; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm3
+; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2
+; AVX512BW-FCP-NEXT: movw $-28382, %di # imm = 0x9122
+; AVX512BW-FCP-NEXT: kmovd %edi, %k1
+; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm6 {%k1}
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7
+; AVX512BW-FCP-NEXT: vpshufb %xmm4, %xmm7, %xmm4
+; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7]
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [8,1,10,3,4,5,6,7]
+; AVX512BW-FCP-NEXT: vpermd %zmm1, %zmm5, %zmm5
+; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [0,7,14,128,128,3,10,128,128,128,6,13,128,128,2,9]
+; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm5, %xmm5
+; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm6
+; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4],xmm5[5,6,7]
+; AVX512BW-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4
; AVX512BW-FCP-NEXT: movw $4644, %di # imm = 0x1224
; AVX512BW-FCP-NEXT: kmovd %edi, %k2
-; AVX512BW-FCP-NEXT: vpblendmw %ymm1, %ymm2, %ymm6 {%k2}
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[1,8,15],zero,zero,xmm6[4,11],zero,zero,xmm6[u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm5 {%k2}
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[6,13],zero,zero,xmm6[2,9,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,8,15],zero,zero,xmm5[4,11],zero,zero,xmm5[u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm6 = [0,17,2,3,4,21,6,7]
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm6, %zmm6
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm6[3,10]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero
; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6
-; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0],xmm3[1],xmm4[2,3,4],xmm3[5],xmm4[6,7]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm7[3,10]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero
-; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7
; AVX512BW-FCP-NEXT: movw $-512, %di # imm = 0xFE00
; AVX512BW-FCP-NEXT: kmovd %edi, %k1
-; AVX512BW-FCP-NEXT: vmovdqu8 %xmm7, %xmm6 {%k1}
+; AVX512BW-FCP-NEXT: vmovdqu8 %xmm6, %xmm5 {%k1}
; AVX512BW-FCP-NEXT: movw $8772, %di # imm = 0x2244
; AVX512BW-FCP-NEXT: kmovd %edi, %k3
-; AVX512BW-FCP-NEXT: vpblendmw %ymm1, %ymm2, %ymm7 {%k3}
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[2,9],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm6 {%k3}
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[2,9],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[0,7,14],zero,zero,xmm6[3,10,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm7 = [0,1,18,3,4,21,6,7]
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm7, %zmm7
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm7[4,11]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[6,13],zero,zero
; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7
-; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm8[4,11]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[6,13],zero,zero
-; AVX512BW-FCP-NEXT: vpor %xmm9, %xmm8, %xmm8
-; AVX512BW-FCP-NEXT: vmovdqu8 %xmm8, %xmm7 {%k1}
+; AVX512BW-FCP-NEXT: vmovdqu8 %xmm7, %xmm6 {%k1}
; AVX512BW-FCP-NEXT: movw $9288, %di # imm = 0x2448
; AVX512BW-FCP-NEXT: kmovd %edi, %k4
-; AVX512BW-FCP-NEXT: vpblendmw %ymm1, %ymm2, %ymm8 {%k4}
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpor %xmm9, %xmm8, %xmm8
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm4[0],xmm3[1],xmm4[2],xmm3[3]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm9[5,12]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,7,14],zero,zero
+; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm7 {%k4}
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[3,10],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,11,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [0,9,2,11]
+; AVX512BW-FCP-NEXT: vpermd %zmm1, %zmm8, %zmm8
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm8[5,12]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,7,14],zero,zero
+; AVX512BW-FCP-NEXT: vpor %xmm10, %xmm9, %xmm9
+; AVX512BW-FCP-NEXT: vmovdqu8 %xmm9, %xmm7 {%k1}
+; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm9 {%k2}
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[2,9],zero,zero,zero,xmm10[5,12,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,11],zero,zero,xmm9[0,7,14],zero,zero,xmm9[u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpor %xmm10, %xmm9, %xmm9
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm8[6,13]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero
+; AVX512BW-FCP-NEXT: vpor %xmm10, %xmm8, %xmm8
+; AVX512BW-FCP-NEXT: vmovdqu8 %xmm8, %xmm9 {%k1}
+; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm8 {%k3}
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm8, %xmm10
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[3,10],zero,zero,zero,xmm10[6,13,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[5,12],zero,zero,xmm8[1,8,15],zero,zero,xmm8[u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpor %xmm10, %xmm8, %xmm8
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm10 = [16,1,2,19,4,5,6,23]
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm10, %zmm10
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm10[0,7,14]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[2,9],zero,zero,zero
; AVX512BW-FCP-NEXT: vpor %xmm11, %xmm10, %xmm10
; AVX512BW-FCP-NEXT: vmovdqu8 %xmm10, %xmm8 {%k1}
-; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm1, %ymm10 {%k2}
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm10, %xmm11
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[4,11],zero,zero,xmm10[0,7,14],zero,zero,xmm10[u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpor %xmm11, %xmm10, %xmm10
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm9[6,13]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero
-; AVX512BW-FCP-NEXT: vpor %xmm11, %xmm9, %xmm9
-; AVX512BW-FCP-NEXT: vmovdqu8 %xmm9, %xmm10 {%k1}
-; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm1, %ymm9 {%k3}
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm9, %xmm11
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[3,10],zero,zero,zero,xmm11[6,13,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[5,12],zero,zero,xmm9[1,8,15],zero,zero,xmm9[u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpor %xmm11, %xmm9, %xmm9
-; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm3[0],xmm4[1,2],xmm3[3],xmm4[4,5,6],xmm3[7]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm11[0,7,14]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[2,9],zero,zero,zero
-; AVX512BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11
-; AVX512BW-FCP-NEXT: vmovdqu8 %xmm11, %xmm9 {%k1}
-; AVX512BW-FCP-NEXT: vmovdqu16 %ymm2, %ymm1 {%k4}
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1
-; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6],xmm3[7]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15]
+; AVX512BW-FCP-NEXT: vmovdqu16 %ymm3, %ymm2 {%k4}
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [16,1,2,3,20,5,6,23]
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm1
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm1[1,8,15]
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[3,10],zero,zero,zero
-; AVX512BW-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0
-; AVX512BW-FCP-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1}
-; AVX512BW-FCP-NEXT: vmovdqa %xmm5, (%rsi)
-; AVX512BW-FCP-NEXT: vmovdqa %xmm6, (%rdx)
-; AVX512BW-FCP-NEXT: vmovdqa %xmm7, (%rcx)
-; AVX512BW-FCP-NEXT: vmovdqa %xmm8, (%r8)
-; AVX512BW-FCP-NEXT: vmovdqa %xmm10, (%r9)
-; AVX512BW-FCP-NEXT: vmovdqa %xmm9, (%r10)
-; AVX512BW-FCP-NEXT: vmovdqa %xmm1, (%rax)
+; AVX512BW-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512BW-FCP-NEXT: vmovdqu8 %xmm0, %xmm2 {%k1}
+; AVX512BW-FCP-NEXT: vmovdqa %xmm4, (%rsi)
+; AVX512BW-FCP-NEXT: vmovdqa %xmm5, (%rdx)
+; AVX512BW-FCP-NEXT: vmovdqa %xmm6, (%rcx)
+; AVX512BW-FCP-NEXT: vmovdqa %xmm7, (%r8)
+; AVX512BW-FCP-NEXT: vmovdqa %xmm9, (%r9)
+; AVX512BW-FCP-NEXT: vmovdqa %xmm8, (%r10)
+; AVX512BW-FCP-NEXT: vmovdqa %xmm2, (%rax)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
@@ -3497,99 +3502,104 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,128,5,12,128,128,1,8,15,128,128,4,11,128,128]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,5,12,128,128,1,8,15,128,128,4,11,128,128]
; AVX512DQ-BW-FCP-NEXT: vmovdqa 80(%rdi), %xmm0
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm4
-; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm2
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512DQ-BW-FCP-NEXT: movw $-28382, %r11w # imm = 0x9122
-; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k1
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm1, %ymm2, %ymm5 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm3
-; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0,1,2,3,4],xmm4[5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 96(%rdi), %xmm3
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm4
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm3[0],xmm4[1],xmm3[2],xmm4[3]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [0,7,14,128,128,3,10,128,128,128,6,13,128,128,2,9]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm7
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm5
-; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm7[5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm5
+; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm3
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2
+; AVX512DQ-BW-FCP-NEXT: movw $-28382, %di # imm = 0x9122
+; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm6 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm4, %xmm7, %xmm4
+; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4],xmm5[5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [8,1,10,3,4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vpermd %zmm1, %zmm5, %zmm5
+; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [0,7,14,128,128,3,10,128,128,128,6,13,128,128,2,9]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm5, %xmm5
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm6
+; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4],xmm5[5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4
; AVX512DQ-BW-FCP-NEXT: movw $4644, %di # imm = 0x1224
; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k2
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm1, %ymm2, %ymm6 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[1,8,15],zero,zero,xmm6[4,11],zero,zero,xmm6[u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm5 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[6,13],zero,zero,xmm6[2,9,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,8,15],zero,zero,xmm5[4,11],zero,zero,xmm5[u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm6 = [0,17,2,3,4,21,6,7]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm6, %zmm6
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm6[3,10]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero
; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6
-; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm4[0],xmm3[1],xmm4[2,3,4],xmm3[5],xmm4[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,0,7,14],zero,zero,xmm7[3,10]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7
; AVX512DQ-BW-FCP-NEXT: movw $-512, %di # imm = 0xFE00
; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm7, %xmm6 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm6, %xmm5 {%k1}
; AVX512DQ-BW-FCP-NEXT: movw $8772, %di # imm = 0x2244
; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k3
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm1, %ymm2, %ymm7 {%k3}
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[2,9],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm6 {%k3}
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[2,9],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[0,7,14],zero,zero,xmm6[3,10,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm7 = [0,1,18,3,4,21,6,7]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm7, %zmm7
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm7[4,11]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[6,13],zero,zero
; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7
-; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,1,8,15],zero,zero,xmm8[4,11]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm0[6,13],zero,zero
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm9, %xmm8, %xmm8
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm8, %xmm7 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm7, %xmm6 {%k1}
; AVX512DQ-BW-FCP-NEXT: movw $9288, %di # imm = 0x2448
; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k4
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm1, %ymm2, %ymm8 {%k4}
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm9, %xmm8, %xmm8
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm4[0],xmm3[1],xmm4[2],xmm3[3]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm9[5,12]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,7,14],zero,zero
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm7 {%k4}
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[3,10],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,11,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [0,9,2,11]
+; AVX512DQ-BW-FCP-NEXT: vpermd %zmm1, %zmm8, %zmm8
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm8[5,12]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[0,7,14],zero,zero
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm10, %xmm9, %xmm9
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm9, %xmm7 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm9 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[2,9],zero,zero,zero,xmm10[5,12,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,11],zero,zero,xmm9[0,7,14],zero,zero,xmm9[u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm10, %xmm9, %xmm9
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm8[6,13]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm10, %xmm8, %xmm8
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm8, %xmm9 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm8 {%k3}
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm8, %xmm10
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[3,10],zero,zero,zero,xmm10[6,13,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[5,12],zero,zero,xmm8[1,8,15],zero,zero,xmm8[u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm10, %xmm8, %xmm8
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm10 = [16,1,2,19,4,5,6,23]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm10, %zmm10
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm10[0,7,14]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[2,9],zero,zero,zero
; AVX512DQ-BW-FCP-NEXT: vpor %xmm11, %xmm10, %xmm10
; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm10, %xmm8 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm1, %ymm10 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm10, %xmm11
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[4,11],zero,zero,xmm10[0,7,14],zero,zero,xmm10[u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm11, %xmm10, %xmm10
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm9[6,13]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm11, %xmm9, %xmm9
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm9, %xmm10 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm1, %ymm9 {%k3}
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm9, %xmm11
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[3,10],zero,zero,zero,xmm11[6,13,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[5,12],zero,zero,xmm9[1,8,15],zero,zero,xmm9[u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm11, %xmm9, %xmm9
-; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm3[0],xmm4[1,2],xmm3[3],xmm4[4,5,6],xmm3[7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,4,11],zero,zero,xmm11[0,7,14]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[2,9],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm11, %xmm9 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm2, %ymm1 {%k4}
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1
-; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6],xmm3[7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15]
+; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm3, %ymm2 {%k4}
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [16,1,2,3,20,5,6,23]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u,u,u,u,5,12],zero,zero,xmm1[1,8,15]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u],zero,zero,xmm0[3,10],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm5, (%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm6, (%rdx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm7, (%rcx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm8, (%r8)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm10, (%r9)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm9, (%r10)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm1, (%rax)
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm0, %xmm2 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm4, (%rsi)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm5, (%rdx)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm6, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm7, (%r8)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm9, (%r9)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm8, (%r10)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, (%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%wide.vec = load <112 x i8>, ptr %in.vec, align 64
@@ -5629,202 +5639,207 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535]
-; AVX512-NEXT: vmovdqa 128(%rdi), %ymm2
+; AVX512-NEXT: vmovdqa 128(%rdi), %ymm1
; AVX512-NEXT: vmovdqa 160(%rdi), %ymm3
-; AVX512-NEXT: vmovdqa %ymm0, %ymm1
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm2 ^ (ymm1 & (ymm3 ^ ymm2))
-; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm4
+; AVX512-NEXT: vmovdqa %ymm0, %ymm2
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm1 ^ (ymm2 & (ymm3 ^ ymm1))
+; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm4
; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u,u,u]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u]
-; AVX512-NEXT: vpor %xmm4, %xmm1, %xmm1
-; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-NEXT: vmovdqa 192(%rdi), %xmm4
-; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11]
-; AVX512-NEXT: vpshufb %xmm5, %xmm4, %xmm6
-; AVX512-NEXT: vmovdqa64 %xmm5, %xmm20
-; AVX512-NEXT: vmovdqa 208(%rdi), %xmm5
-; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1,2,3,4,5,6],ymm6[7]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm14 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
-; AVX512-NEXT: vmovdqa (%rdi), %ymm6
-; AVX512-NEXT: vmovdqa 32(%rdi), %ymm7
-; AVX512-NEXT: vmovdqa 64(%rdi), %ymm1
-; AVX512-NEXT: vmovdqa %ymm14, %ymm9
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = ymm7 ^ (ymm9 & (ymm6 ^ ymm7))
-; AVX512-NEXT: vextracti128 $1, %ymm9, %xmm10
-; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[5,12],zero,zero,xmm10[1,8,15,u,u,u,u,u,u]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,7,14],zero,zero,xmm9[3,10],zero,zero,zero,xmm9[u,u,u,u,u,u]
-; AVX512-NEXT: vpor %xmm10, %xmm9, %xmm13
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512-NEXT: vmovdqa 96(%rdi), %ymm9
-; AVX512-NEXT: vmovdqa %ymm11, %ymm15
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm15 = ymm1 ^ (ymm15 & (ymm9 ^ ymm1))
-; AVX512-NEXT: vmovdqa 80(%rdi), %xmm10
-; AVX512-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0,1],ymm10[2],ymm15[3,4],ymm10[5],ymm15[6,7,8,9],ymm10[10],ymm15[11,12],ymm10[13],ymm15[14,15]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm13 & mem)
+; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero,xmm2[u,u,u,u]
+; AVX512-NEXT: vpor %xmm4, %xmm2, %xmm2
+; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512-NEXT: vmovdqa 192(%rdi), %xmm5
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm6 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11]
+; AVX512-NEXT: vpshufb %xmm6, %xmm5, %xmm4
+; AVX512-NEXT: vmovdqa64 %xmm6, %xmm19
+; AVX512-NEXT: vmovdqa 208(%rdi), %xmm6
+; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3]
+; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1,2,3,4,5,6],ymm4[7]
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm15 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
+; AVX512-NEXT: vmovdqa (%rdi), %ymm2
+; AVX512-NEXT: vmovdqa 32(%rdi), %ymm4
+; AVX512-NEXT: vmovdqa 64(%rdi), %ymm7
+; AVX512-NEXT: vmovdqa %ymm15, %ymm8
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm4 ^ (ymm8 & (ymm2 ^ ymm4))
+; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm9
+; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[5,12],zero,zero,xmm9[1,8,15,u,u,u,u,u,u]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,7,14],zero,zero,xmm8[3,10],zero,zero,zero,xmm8[u,u,u,u,u,u]
+; AVX512-NEXT: vpor %xmm9, %xmm8, %xmm12
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512-NEXT: vmovdqa 96(%rdi), %ymm10
+; AVX512-NEXT: vmovdqa %ymm9, %ymm13
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm13 = ymm7 ^ (ymm13 & (ymm10 ^ ymm7))
+; AVX512-NEXT: vmovdqa 80(%rdi), %xmm8
+; AVX512-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm8[2],ymm13[3,4],ymm8[5],ymm13[6,7,8,9],ymm8[10],ymm13[11,12],ymm8[13],ymm13[14,15]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 | (ymm12 & mem)
; AVX512-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,16777215,0]
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm12 ^ (ymm16 & (ymm8 ^ ymm12))
-; AVX512-NEXT: vmovdqa64 %ymm8, %ymm18
-; AVX512-NEXT: vmovdqa %ymm11, %ymm12
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = ymm2 ^ (ymm12 & (ymm3 ^ ymm2))
-; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[u,u,u,6,13],zero,zero,xmm12[2,9],zero,zero,zero,xmm12[u,u,u,u]
-; AVX512-NEXT: vextracti128 $1, %ymm12, %xmm12
-; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u],zero,zero,xmm12[4,11],zero,zero,xmm12[0,7,14,u,u,u,u]
-; AVX512-NEXT: vpor %xmm13, %xmm12, %xmm12
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm13 = ymm11 ^ (ymm16 & (ymm13 ^ ymm11))
+; AVX512-NEXT: vmovdqa64 %ymm13, %ymm18
+; AVX512-NEXT: vmovdqa %ymm9, %ymm11
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = ymm1 ^ (ymm11 & (ymm3 ^ ymm1))
+; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[u,u,u,6,13],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[u,u,u,u]
+; AVX512-NEXT: vextracti128 $1, %ymm11, %xmm11
+; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u],zero,zero,xmm11[4,11],zero,zero,xmm11[0,7,14,u,u,u,u]
+; AVX512-NEXT: vpor %xmm12, %xmm11, %xmm11
+; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm6[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3]
; AVX512-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3]
-; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5,6],ymm8[7]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
-; AVX512-NEXT: vmovdqa %ymm13, %ymm12
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = ymm7 ^ (ymm12 & (ymm6 ^ ymm7))
-; AVX512-NEXT: vextracti128 $1, %ymm12, %xmm15
-; AVX512-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[1,8,15],zero,zero,xmm12[4,11],zero,zero,xmm12[u,u,u,u,u,u,u]
-; AVX512-NEXT: vpor %xmm15, %xmm12, %xmm15
-; AVX512-NEXT: vmovdqa %ymm14, %ymm12
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = ymm9 ^ (ymm12 & (ymm1 ^ ymm9))
-; AVX512-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm10[2],ymm12[3,4,5],ymm10[6],ymm12[7,8,9],ymm10[10],ymm12[11,12,13],ymm10[14],ymm12[15]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = ymm12 | (ymm15 & ~mem)
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = ymm8 ^ (ymm16 & (ymm12 ^ ymm8))
-; AVX512-NEXT: vmovdqa64 %ymm12, %ymm19
-; AVX512-NEXT: vmovdqa %ymm0, %ymm8
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm7 ^ (ymm8 & (ymm6 ^ ymm7))
-; AVX512-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[2,9],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[u,u,u,u,u,u,u]
-; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm8
-; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpor %xmm15, %xmm8, %xmm8
-; AVX512-NEXT: vmovdqa %ymm13, %ymm15
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm15 = ymm9 ^ (ymm15 & (ymm1 ^ ymm9))
-; AVX512-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0,1,2],ymm10[3],ymm15[4,5],ymm10[6],ymm15[7,8,9,10],ymm10[11],ymm15[12,13],ymm10[14],ymm15[15]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vpblendd {{.*#+}} ymm14 = ymm11[0,1,2,3,4,5,6],ymm12[7]
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
+; AVX512-NEXT: vmovdqa %ymm12, %ymm11
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = ymm4 ^ (ymm11 & (ymm2 ^ ymm4))
+; AVX512-NEXT: vextracti128 $1, %ymm11, %xmm13
+; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[6,13],zero,zero,xmm13[2,9,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[1,8,15],zero,zero,xmm11[4,11],zero,zero,xmm11[u,u,u,u,u,u,u]
+; AVX512-NEXT: vpor %xmm13, %xmm11, %xmm13
+; AVX512-NEXT: vmovdqa %ymm15, %ymm11
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = ymm10 ^ (ymm11 & (ymm7 ^ ymm10))
+; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0,1],ymm8[2],ymm11[3,4,5],ymm8[6],ymm11[7,8,9],ymm8[10],ymm11[11,12,13],ymm8[14],ymm11[15]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm13 & ~mem)
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = ymm14 ^ (ymm16 & (ymm11 ^ ymm14))
+; AVX512-NEXT: vmovdqa %ymm12, %ymm8
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm10 ^ (ymm8 & (ymm7 ^ ymm10))
+; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm8[2,3,0,1]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2],ymm13[3],ymm8[4,5],ymm13[6],ymm8[7,8,9,10],ymm13[11],ymm8[12,13],ymm13[14],ymm8[15]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vmovdqa %ymm0, %ymm13
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm13 = ymm4 ^ (ymm13 & (ymm2 ^ ymm4))
+; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u,u,u]
+; AVX512-NEXT: vextracti128 $1, %ymm13, %xmm13
+; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[0,7,14],zero,zero,xmm13[3,10,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpor %xmm14, %xmm13, %xmm13
; AVX512-NEXT: vpmovsxdq {{.*#+}} ymm17 = [18446744073709551615,255,18446744073709486080,18446744073709551615]
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm8 & ymm17)
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm14 = ymm3 ^ (ymm14 & (ymm2 ^ ymm3))
-; AVX512-NEXT: vextracti128 $1, %ymm14, %xmm8
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm13 = (ymm13 & ymm17) | ymm8
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm15 = ymm3 ^ (ymm15 & (ymm1 ^ ymm3))
+; AVX512-NEXT: vextracti128 $1, %ymm15, %xmm8
; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[1,8,15,u,u,u,u]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm15[u,u,0,7,14],zero,zero,xmm15[3,10],zero,zero,zero,xmm15[u,u,u,u]
; AVX512-NEXT: vpor %xmm8, %xmm14, %xmm8
; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512-NEXT: vmovdqa64 %xmm20, %xmm12
-; AVX512-NEXT: vpshufb %xmm12, %xmm5, %xmm14
-; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3]
-; AVX512-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm12[7]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm15[0],ymm8[1,2,3,4,5,6,7],ymm15[8],ymm8[9,10,11,12,13,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1,2,3],ymm8[4,5,6,7]
-; AVX512-NEXT: vmovdqa64 %ymm8, %ymm20
-; AVX512-NEXT: vmovdqa %ymm13, %ymm8
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm3 ^ (ymm8 & (ymm2 ^ ymm3))
-; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm12
-; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[2,9,u,u,u,u,u]
+; AVX512-NEXT: vmovdqa64 %xmm19, %xmm14
+; AVX512-NEXT: vpshufb %xmm14, %xmm6, %xmm14
+; AVX512-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
+; AVX512-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm14[7]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm13[0],ymm8[1,2,3,4,5,6,7],ymm13[8],ymm8[9,10,11,12,13,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
+; AVX512-NEXT: vmovdqa64 %ymm8, %ymm19
+; AVX512-NEXT: vmovdqa %ymm12, %ymm8
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm3 ^ (ymm8 & (ymm1 ^ ymm3))
+; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm13
+; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[2,9,u,u,u,u,u]
; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,1,8,15],zero,zero,xmm8[4,11],zero,zero,xmm8[u,u,u,u,u]
-; AVX512-NEXT: vpor %xmm12, %xmm8, %xmm8
+; AVX512-NEXT: vpor %xmm13, %xmm8, %xmm8
; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[5,12]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
-; AVX512-NEXT: vpor %xmm12, %xmm14, %xmm12
-; AVX512-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = xmm6[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[5,12]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
+; AVX512-NEXT: vpor %xmm13, %xmm14, %xmm13
+; AVX512-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
; AVX512-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215]
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = ymm12 ^ (ymm16 & (ymm12 ^ ymm8))
-; AVX512-NEXT: vmovdqa %ymm11, %ymm8
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm7 ^ (ymm8 & (ymm6 ^ ymm7))
-; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u,u,u]
-; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm8
-; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpor %xmm14, %xmm8, %xmm8
-; AVX512-NEXT: vmovdqa %ymm0, %ymm14
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm14 = ymm9 ^ (ymm14 & (ymm1 ^ ymm9))
-; AVX512-NEXT: vpblendw {{.*#+}} ymm14 = ymm10[0],ymm14[1,2],ymm10[3],ymm14[4,5,6],ymm10[7,8],ymm14[9,10],ymm10[11],ymm14[12,13,14],ymm10[15]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm14 = ymm14 | (ymm8 & ymm17)
-; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm14[0],ymm12[1,2,3,4,5,6,7],ymm14[8],ymm12[9,10,11,12,13,14,15]
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 ^ (ymm16 & (ymm13 ^ ymm8))
+; AVX512-NEXT: vmovdqa %ymm0, %ymm8
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm10 ^ (ymm8 & (ymm7 ^ ymm10))
+; AVX512-NEXT: vpermq {{.*#+}} ymm14 = ymm8[2,3,0,1]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm14[0],ymm8[1,2],ymm14[3],ymm8[4,5,6],ymm14[7,8],ymm8[9,10],ymm14[11],ymm8[12,13,14],ymm14[15]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vmovdqa %ymm9, %ymm14
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm14 = ymm4 ^ (ymm14 & (ymm2 ^ ymm4))
+; AVX512-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[3,10],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[u,u,u,u,u,u,u]
+; AVX512-NEXT: vextracti128 $1, %ymm14, %xmm14
+; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpor %xmm15, %xmm14, %xmm14
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm14 = (ymm14 & ymm17) | ymm8
+; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15]
; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7]
-; AVX512-NEXT: vmovdqa64 %ymm8, %ymm21
+; AVX512-NEXT: vmovdqa64 %ymm8, %ymm20
; AVX512-NEXT: vmovdqa %ymm0, %ymm8
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm3 ^ (ymm8 & (ymm2 ^ ymm3))
-; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,2,9],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[u,u,u,u,u]
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm3 ^ (ymm8 & (ymm1 ^ ymm3))
+; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = xmm8[u,u,2,9],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[u,u,u,u,u]
; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm8
; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10,u,u,u,u,u]
-; AVX512-NEXT: vpor %xmm12, %xmm8, %xmm8
+; AVX512-NEXT: vpor %xmm13, %xmm8, %xmm8
; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[6,13]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
-; AVX512-NEXT: vpor %xmm12, %xmm14, %xmm12
-; AVX512-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = ymm12 ^ (ymm16 & (ymm12 ^ ymm8))
-; AVX512-NEXT: vmovdqa %ymm13, %ymm8
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm6 ^ (ymm8 & (ymm7 ^ ymm6))
-; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm14
-; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[2,9],zero,zero,zero,xmm14[5,12,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u,u,u]
-; AVX512-NEXT: vpor %xmm14, %xmm8, %xmm8
-; AVX512-NEXT: vmovdqa %ymm11, %ymm14
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm14 = ymm9 ^ (ymm14 & (ymm1 ^ ymm9))
-; AVX512-NEXT: vpblendw {{.*#+}} ymm14 = ymm10[0],ymm14[1,2,3],ymm10[4],ymm14[5,6],ymm10[7,8],ymm14[9,10,11],ymm10[12],ymm14[13,14],ymm10[15]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm14 = ymm14 | (ymm8 & ymm17)
-; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm14[0],ymm12[1,2,3,4,5,6,7],ymm14[8],ymm12[9,10,11,12,13,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm8[4,5,6,7]
-; AVX512-NEXT: vmovdqa %ymm11, %ymm8
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm3 ^ (ymm8 & (ymm2 ^ ymm3))
-; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = xmm6[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[6,13]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
+; AVX512-NEXT: vpor %xmm13, %xmm14, %xmm13
+; AVX512-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 ^ (ymm16 & (ymm13 ^ ymm8))
+; AVX512-NEXT: vmovdqa %ymm9, %ymm8
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm10 ^ (ymm8 & (ymm7 ^ ymm10))
+; AVX512-NEXT: vpermq {{.*#+}} ymm14 = ymm8[2,3,0,1]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm14[0],ymm8[1,2,3],ymm14[4],ymm8[5,6],ymm14[7,8],ymm8[9,10,11],ymm14[12],ymm8[13,14],ymm14[15]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vmovdqa %ymm12, %ymm14
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm14 = ymm2 ^ (ymm14 & (ymm4 ^ ymm2))
+; AVX512-NEXT: vextracti128 $1, %ymm14, %xmm15
+; AVX512-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u,u,u,u,u,u]
+; AVX512-NEXT: vpor %xmm15, %xmm14, %xmm14
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm14 = (ymm14 & ymm17) | ymm8
+; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7]
+; AVX512-NEXT: vmovdqa64 %ymm8, %ymm21
+; AVX512-NEXT: vmovdqa %ymm9, %ymm8
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm3 ^ (ymm8 & (ymm1 ^ ymm3))
+; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = xmm8[u,u,3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u]
; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm8
; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u]
-; AVX512-NEXT: vpor %xmm12, %xmm8, %xmm8
+; AVX512-NEXT: vpor %xmm13, %xmm8, %xmm8
; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
-; AVX512-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[0,7,14]
-; AVX512-NEXT: vpor %xmm12, %xmm15, %xmm12
-; AVX512-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = ymm12 ^ (ymm16 & (ymm12 ^ ymm8))
-; AVX512-NEXT: vmovdqa %ymm0, %ymm8
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm6 ^ (ymm8 & (ymm7 ^ ymm6))
-; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm15
-; AVX512-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[5,12],zero,zero,xmm8[1,8,15],zero,zero,xmm8[u,u,u,u,u,u,u]
-; AVX512-NEXT: vpor %xmm15, %xmm8, %xmm8
-; AVX512-NEXT: vmovdqa %ymm13, %ymm15
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm15 = ymm1 ^ (ymm15 & (ymm9 ^ ymm1))
-; AVX512-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0],ymm10[1],ymm15[2,3],ymm10[4],ymm15[5,6,7,8],ymm10[9],ymm15[10,11],ymm10[12],ymm15[13,14,15]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm8 & ymm17)
-; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm15[0],ymm12[1,2,3,4,5,6,7],ymm15[8],ymm12[9,10,11,12,13,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1,2,3],ymm8[4,5,6,7]
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm13 = ymm2 ^ (ymm13 & (ymm3 ^ ymm2))
-; AVX512-NEXT: vextracti128 $1, %ymm13, %xmm2
-; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[u,u,4,11],zero,zero,xmm13[0,7,14],zero,zero,xmm13[u,u,u,u,u]
-; AVX512-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
-; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[1,8,15]
-; AVX512-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
+; AVX512-NEXT: vpshufb {{.*#+}} xmm15 = xmm6[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm6[0,7,14]
+; AVX512-NEXT: vpor %xmm13, %xmm15, %xmm13
+; AVX512-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 ^ (ymm16 & (ymm13 ^ ymm8))
+; AVX512-NEXT: vmovdqa %ymm12, %ymm8
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm7 ^ (ymm8 & (ymm10 ^ ymm7))
+; AVX512-NEXT: vpermq {{.*#+}} ymm15 = ymm8[2,3,0,1]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm15[1],ymm8[2,3],ymm15[4],ymm8[5,6,7,8],ymm15[9],ymm8[10,11],ymm15[12],ymm8[13,14,15]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vmovdqa %ymm0, %ymm15
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm15 = ymm2 ^ (ymm15 & (ymm4 ^ ymm2))
+; AVX512-NEXT: vextracti128 $1, %ymm15, %xmm14
+; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[3,10],zero,zero,zero,xmm14[6,13,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[5,12],zero,zero,xmm15[1,8,15],zero,zero,xmm15[u,u,u,u,u,u,u]
+; AVX512-NEXT: vpor %xmm14, %xmm15, %xmm14
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm14 = (ymm14 & ymm17) | ymm8
+; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7]
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = ymm1 ^ (ymm12 & (ymm3 ^ ymm1))
+; AVX512-NEXT: vextracti128 $1, %ymm12, %xmm1
+; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[5,12,u,u,u,u,u]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm12[u,u,4,11],zero,zero,xmm12[0,7,14],zero,zero,xmm12[u,u,u,u,u]
+; AVX512-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
+; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm6[1,8,15]
+; AVX512-NEXT: vpor %xmm3, %xmm5, %xmm3
; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm16 & (ymm3 ^ ymm2))
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = ymm6 ^ (ymm11 & (ymm7 ^ ymm6))
-; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm11[6,13],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[u,u,u,u,u,u,u]
-; AVX512-NEXT: vextracti128 $1, %ymm11, %xmm4
-; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[4,11],zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpor %xmm2, %xmm4, %xmm2
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm1 ^ (ymm0 & (ymm9 ^ ymm1))
-; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7,8],ymm10[9],ymm0[10,11,12],ymm10[13],ymm0[14,15]
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm16 & (ymm3 ^ ymm1))
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm7 ^ (ymm0 & (ymm10 ^ ymm7))
+; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7,8],ymm1[9],ymm0[10,11,12],ymm1[13],ymm0[14,15]
; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm2 & ymm17)
-; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm3[1,2,3,4,5,6,7],ymm0[8],ymm3[9,10,11,12,13,14,15]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = ymm2 ^ (ymm9 & (ymm4 ^ ymm2))
+; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[6,13],zero,zero,xmm9[2,9],zero,zero,zero,xmm9[u,u,u,u,u,u,u]
+; AVX512-NEXT: vextracti128 $1, %ymm9, %xmm2
+; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpor %xmm1, %xmm2, %xmm1
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & ymm17) | ymm0
+; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm3[1,2,3,4,5,6,7],ymm1[8],ymm3[9,10,11,12,13,14,15]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512-NEXT: vmovdqa64 %ymm18, (%rsi)
-; AVX512-NEXT: vmovdqa64 %ymm19, (%rdx)
-; AVX512-NEXT: vmovdqa64 %ymm20, (%rcx)
-; AVX512-NEXT: vmovdqa64 %ymm21, (%r8)
-; AVX512-NEXT: vmovdqa %ymm14, (%r9)
+; AVX512-NEXT: vmovdqa %ymm11, (%rdx)
+; AVX512-NEXT: vmovdqa64 %ymm19, (%rcx)
+; AVX512-NEXT: vmovdqa64 %ymm20, (%r8)
+; AVX512-NEXT: vmovdqa64 %ymm21, (%r9)
; AVX512-NEXT: vmovdqa %ymm8, (%r10)
; AVX512-NEXT: vmovdqa %ymm0, (%rax)
; AVX512-NEXT: vzeroupper
@@ -5835,198 +5850,202 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535]
-; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm2
-; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm3
+; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm3
+; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm4
; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm1
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm2 ^ (ymm1 & (ymm3 ^ ymm2))
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u,u,u]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm3 ^ (ymm1 & (ymm4 ^ ymm3))
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[6,13,u,u,u,u]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm4, %xmm1, %xmm1
+; AVX512-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,1,2,4,6]
-; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm12
-; AVX512-FCP-NEXT: vpermd %ymm12, %ymm4, %ymm4
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5,6],ymm4[7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
-; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm4
-; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm5
-; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm1
-; AVX512-FCP-NEXT: vmovdqa %ymm13, %ymm7
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm5 ^ (ymm7 & (ymm4 ^ ymm5))
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[5,12],zero,zero,xmm8[1,8,15,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,7,14],zero,zero,xmm7[3,10],zero,zero,zero,xmm7[u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm7, %xmm8, %xmm10
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm7
-; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm11
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm1 ^ (ymm11 & (ymm7 ^ ymm1))
-; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm8
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm8[2],ymm11[3,4],ymm8[5],ymm11[6,7,8,9],ymm8[10],ymm11[11,12],ymm8[13],ymm11[14,15]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm10 & mem)
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,1,2,4,6]
+; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm11
+; AVX512-FCP-NEXT: vpermd %ymm11, %ymm2, %ymm2
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3,4,5,6],ymm2[7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
+; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm18
+; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm2
+; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm5
+; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm6
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm2 ^ (ymm6 & (ymm18 ^ ymm2))
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,7,14],zero,zero,xmm6[3,10],zero,zero,zero,xmm6[u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm7, %xmm6, %xmm10
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm8
+; AVX512-FCP-NEXT: vmovdqa %ymm7, %ymm6
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm5 ^ (ymm6 & (ymm8 ^ ymm5))
+; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm13
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm13[2],ymm6[3,4],ymm13[5],ymm6[6,7,8,9],ymm13[10],ymm6[11,12],ymm13[13],ymm6[14,15]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 | (ymm10 & mem)
; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,16777215,0]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm6 ^ (ymm16 & (ymm11 ^ ymm6))
-; AVX512-FCP-NEXT: vmovdqa64 %ymm11, %ymm18
-; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm6
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm2 ^ (ymm6 & (ymm3 ^ ymm2))
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm6[u,u,u,6,13],zero,zero,xmm6[2,9],zero,zero,zero,xmm6[u,u,u,u]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,xmm6[4,11],zero,zero,xmm6[0,7,14,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm6, %xmm10, %xmm6
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm9 ^ (ymm16 & (ymm6 ^ ymm9))
+; AVX512-FCP-NEXT: vmovdqa %ymm7, %ymm9
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm3 ^ (ymm9 & (ymm4 ^ ymm3))
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,6,13],zero,zero,xmm9[2,9],zero,zero,zero,xmm9[u,u,u,u]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm9
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u],zero,zero,xmm9[4,11],zero,zero,xmm9[0,7,14,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm10, %xmm9, %xmm9
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,1,3,4,6]
-; AVX512-FCP-NEXT: vpermd %ymm12, %ymm10, %ymm10
+; AVX512-FCP-NEXT: vpermd %ymm11, %ymm10, %ymm10
; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm10[7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
-; AVX512-FCP-NEXT: vmovdqa %ymm11, %ymm10
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm5 ^ (ymm10 & (ymm4 ^ ymm5))
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm14
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm9[0,1,2,3,4,5,6],ymm10[7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
+; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm9
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm2 ^ (ymm9 & (ymm18 ^ ymm2))
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm14
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm14[6,13],zero,zero,xmm14[2,9,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[1,8,15],zero,zero,xmm10[4,11],zero,zero,xmm10[u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm14, %xmm10, %xmm14
-; AVX512-FCP-NEXT: vmovdqa %ymm13, %ymm10
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm7 ^ (ymm10 & (ymm1 ^ ymm7))
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm8[2],ymm10[3,4,5],ymm8[6],ymm10[7,8,9],ymm8[10],ymm10[11,12,13],ymm8[14],ymm10[15]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm10 | (ymm14 & ~mem)
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm6 ^ (ymm16 & (ymm10 ^ ymm6))
-; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm6
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm5 ^ (ymm6 & (ymm4 ^ ymm5))
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm6[2,9],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[0,7,14],zero,zero,xmm6[3,10,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm6, %xmm14, %xmm6
-; AVX512-FCP-NEXT: vmovdqa %ymm11, %ymm14
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = ymm7 ^ (ymm14 & (ymm1 ^ ymm7))
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0,1,2],ymm8[3],ymm14[4,5],ymm8[6],ymm14[7,8,9,10],ymm8[11],ymm14[12,13],ymm8[14],ymm14[15]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm17 = [18446744073709551615,255,18446744073709486080,18446744073709551615]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = ymm14 | (ymm6 & ymm17)
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm3 ^ (ymm13 & (ymm2 ^ ymm3))
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm6
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[1,8,15,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,0,7,14],zero,zero,xmm13[3,10],zero,zero,zero,xmm13[u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm6, %xmm13, %xmm6
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[1,8,15],zero,zero,xmm9[4,11],zero,zero,xmm9[u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm14, %xmm9, %xmm14
+; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm9
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm8 ^ (ymm9 & (ymm5 ^ ymm8))
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm13[2],ymm9[3,4,5],ymm13[6],ymm9[7,8,9],ymm13[10],ymm9[11,12,13],ymm13[14],ymm9[15]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm14 & ~mem)
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm15 ^ (ymm16 & (ymm9 ^ ymm15))
+; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm13
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm8 ^ (ymm13 & (ymm5 ^ ymm8))
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm13[2,3,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3],ymm13[4,5],ymm14[6],ymm13[7,8,9,10],ymm14[11],ymm13[12,13],ymm14[14],ymm13[15]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm14
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = ymm2 ^ (ymm14 & (ymm18 ^ ymm2))
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[2,9],zero,zero,zero,xmm14[5,12],zero,zero,xmm14[u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[0,7,14],zero,zero,xmm14[3,10,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14
+; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,255,18446744073709486080,18446744073709551615]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = (ymm14 & ymm16) | ymm13
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm4 ^ (ymm12 & (ymm3 ^ ymm4))
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[1,8,15,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,0,7,14],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm13, %xmm12, %xmm12
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,0,0,0,1,3,5,6]
-; AVX512-FCP-NEXT: vpermd %ymm12, %ymm13, %ymm12
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm12[7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm14[0],ymm6[1,2,3,4,5,6,7],ymm14[8],ymm6[9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm6[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm19
-; AVX512-FCP-NEXT: vmovdqa %ymm11, %ymm6
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm3 ^ (ymm6 & (ymm2 ^ ymm3))
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm12
+; AVX512-FCP-NEXT: vpermd %ymm11, %ymm13, %ymm11
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5,6],ymm11[7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1,2,3,4,5,6,7],ymm14[8],ymm11[9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm11[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm19
+; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm11
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm4 ^ (ymm11 & (ymm3 ^ ymm4))
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[2,9,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,1,8,15],zero,zero,xmm6[4,11],zero,zero,xmm6[u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm6, %xmm12, %xmm6
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512-FCP-NEXT: vmovdqa 208(%rdi), %xmm14
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm14[5,12]
-; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %xmm12
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
-; AVX512-FCP-NEXT: vpor %xmm13, %xmm15, %xmm13
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 ^ (ymm16 & (ymm13 ^ ymm6))
-; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm6
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm5 ^ (ymm6 & (ymm4 ^ ymm5))
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm6[3,10],zero,zero,zero,xmm6[6,13],zero,zero,xmm6[u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,11,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm6, %xmm15, %xmm6
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,1,8,15],zero,zero,xmm11[4,11],zero,zero,xmm11[u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm12
+; AVX512-FCP-NEXT: vmovdqa 208(%rdi), %xmm13
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm13[5,12]
+; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %xmm11
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
+; AVX512-FCP-NEXT: vpor %xmm14, %xmm15, %xmm14
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm17 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = ymm14 ^ (ymm17 & (ymm14 ^ ymm12))
+; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm12
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm8 ^ (ymm12 & (ymm5 ^ ymm8))
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm12[2,3,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm15[0],ymm12[1,2],ymm15[3],ymm12[4,5,6],ymm15[7,8],ymm12[9,10],ymm15[11],ymm12[12,13,14],ymm15[15]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT: vmovdqa %ymm7, %ymm15
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm2 ^ (ymm15 & (ymm18 ^ ymm2))
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm15[3,10],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[1,8,15],zero,zero,xmm15[4,11,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm1, %xmm15, %xmm1
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & ymm16) | ymm12
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm1[0],ymm14[1,2,3,4,5,6,7],ymm1[8],ymm14[9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm20
+; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm1
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm4 ^ (ymm1 & (ymm3 ^ ymm4))
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm1[u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[u,u,u,u,u]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm1, %xmm12, %xmm1
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm13[6,13]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
+; AVX512-FCP-NEXT: vpor %xmm12, %xmm15, %xmm12
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm12 ^ (ymm17 & (ymm12 ^ ymm1))
+; AVX512-FCP-NEXT: vmovdqa %ymm7, %ymm1
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm5 ^ ymm8))
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm1[2,3,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0],ymm1[1,2,3],ymm15[4],ymm1[5,6],ymm15[7,8],ymm1[9,10,11],ymm15[12],ymm1[13,14],ymm15[15]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm15
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm18 ^ (ymm15 & (ymm2 ^ ymm18))
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm15, %xmm14
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[2,9],zero,zero,zero,xmm14[5,12,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[4,11],zero,zero,xmm15[0,7,14],zero,zero,xmm15[u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm14, %xmm15, %xmm14
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = (ymm14 & ymm16) | ymm1
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0],ymm12[1,2,3,4,5,6,7],ymm14[8],ymm12[9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm21
+; AVX512-FCP-NEXT: vmovdqa %ymm7, %ymm1
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm4 ^ (ymm1 & (ymm3 ^ ymm4))
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm1[u,u,3,10],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[u,u,u,u,u]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,xmm1[1,8,15],zero,zero,xmm1[4,11,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm1, %xmm14, %xmm1
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm11[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm13[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm13[0,7,14]
+; AVX512-FCP-NEXT: vpor %xmm14, %xmm15, %xmm14
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = ymm14 ^ (ymm17 & (ymm14 ^ ymm1))
+; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm1
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm5 ^ (ymm1 & (ymm8 ^ ymm5))
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm1[2,3,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3],ymm15[4],ymm1[5,6,7,8],ymm15[9],ymm1[10,11],ymm15[12],ymm1[13,14,15]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm15
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm7 ^ (ymm15 & (ymm1 ^ ymm7))
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm8[0],ymm15[1,2],ymm8[3],ymm15[4,5,6],ymm8[7,8],ymm15[9,10],ymm8[11],ymm15[12,13,14],ymm8[15]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm6 & ymm17)
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm15[0],ymm13[1,2,3,4,5,6,7],ymm15[8],ymm13[9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3],ymm6[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm20
-; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm6
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm3 ^ (ymm6 & (ymm2 ^ ymm3))
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm6[u,u,2,9],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[u,u,u,u,u]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[0,7,14],zero,zero,xmm6[3,10,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm6, %xmm13, %xmm6
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm14[6,13]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
-; AVX512-FCP-NEXT: vpor %xmm13, %xmm15, %xmm13
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 ^ (ymm16 & (ymm13 ^ ymm6))
-; AVX512-FCP-NEXT: vmovdqa %ymm11, %ymm6
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm4 ^ (ymm6 & (ymm5 ^ ymm4))
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm15
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm6, %xmm15, %xmm6
-; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm15
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm7 ^ (ymm15 & (ymm1 ^ ymm7))
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm8[0],ymm15[1,2,3],ymm8[4],ymm15[5,6],ymm8[7,8],ymm15[9,10,11],ymm8[12],ymm15[13,14],ymm8[15]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm6 & ymm17)
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm15[0],ymm13[1,2,3,4,5,6,7],ymm15[8],ymm13[9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3],ymm6[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm21
-; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm6
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm3 ^ (ymm6 & (ymm2 ^ ymm3))
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm6[u,u,3,10],zero,zero,zero,xmm6[6,13],zero,zero,xmm6[u,u,u,u,u]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,11,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm6, %xmm15, %xmm6
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm14[0,7,14]
-; AVX512-FCP-NEXT: vpor %xmm15, %xmm13, %xmm13
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 ^ (ymm16 & (ymm13 ^ ymm6))
-; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm6
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm4 ^ (ymm6 & (ymm5 ^ ymm4))
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm15
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[5,12],zero,zero,xmm6[1,8,15],zero,zero,xmm6[u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm6, %xmm15, %xmm6
-; AVX512-FCP-NEXT: vmovdqa %ymm11, %ymm15
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm1 ^ (ymm15 & (ymm7 ^ ymm1))
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0],ymm8[1],ymm15[2,3],ymm8[4],ymm15[5,6,7,8],ymm8[9],ymm15[10,11],ymm8[12],ymm15[13,14,15]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm6 & ymm17)
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm15[0],ymm13[1,2,3,4,5,6,7],ymm15[8],ymm13[9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3],ymm6[4,5,6,7]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm2 ^ (ymm11 & (ymm3 ^ ymm2))
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm11, %xmm2
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[u,u,4,11],zero,zero,xmm11[0,7,14],zero,zero,xmm11[u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm12[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm14[1,8,15]
-; AVX512-FCP-NEXT: vpor %xmm3, %xmm11, %xmm3
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm18 ^ (ymm15 & (ymm2 ^ ymm18))
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm15, %xmm12
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[3,10],zero,zero,zero,xmm12[6,13,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[5,12],zero,zero,xmm15[1,8,15],zero,zero,xmm15[u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm12, %xmm15, %xmm12
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = (ymm12 & ymm16) | ymm1
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0],ymm14[1,2,3,4,5,6,7],ymm12[8],ymm14[9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm3 ^ (ymm10 & (ymm4 ^ ymm3))
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm1
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[5,12,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[u,u,4,11],zero,zero,xmm10[0,7,14],zero,zero,xmm10[u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm13[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm13[1,8,15]
+; AVX512-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3
; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm16 & (ymm3 ^ ymm2))
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm4 ^ (ymm9 & (ymm5 ^ ymm4))
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[6,13],zero,zero,xmm9[2,9],zero,zero,zero,xmm9[u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm4
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[4,11],zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm2, %xmm4, %xmm2
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm1 ^ (ymm0 & (ymm7 ^ ymm1))
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2,3,4],ymm8[5],ymm0[6,7,8],ymm8[9],ymm0[10,11,12],ymm8[13],ymm0[14,15]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm17 & (ymm3 ^ ymm1))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm5 ^ (ymm0 & (ymm8 ^ ymm5))
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7,8],ymm1[9],ymm0[10,11,12],ymm1[13],ymm0[14,15]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm2 & ymm17)
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm3[1,2,3,4,5,6,7],ymm0[8],ymm3[9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm18, (%rsi)
-; AVX512-FCP-NEXT: vmovdqa %ymm10, (%rdx)
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm18 ^ (ymm7 & (ymm2 ^ ymm18))
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[6,13],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm2
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm1, %xmm2, %xmm1
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & ymm16) | ymm0
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm3[1,2,3,4,5,6,7],ymm1[8],ymm3[9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa %ymm6, (%rsi)
+; AVX512-FCP-NEXT: vmovdqa %ymm9, (%rdx)
; AVX512-FCP-NEXT: vmovdqa64 %ymm19, (%rcx)
; AVX512-FCP-NEXT: vmovdqa64 %ymm20, (%r8)
; AVX512-FCP-NEXT: vmovdqa64 %ymm21, (%r9)
-; AVX512-FCP-NEXT: vmovdqa %ymm6, (%r10)
+; AVX512-FCP-NEXT: vmovdqa %ymm15, (%r10)
; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rax)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
@@ -6036,202 +6055,207 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535]
-; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm2
+; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm1
; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm3
-; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm2 ^ (ymm1 & (ymm3 ^ ymm2))
-; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm4
+; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm2
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm1 ^ (ymm2 & (ymm3 ^ ymm1))
+; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm4
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u,u,u]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u]
-; AVX512DQ-NEXT: vpor %xmm4, %xmm1, %xmm1
-; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-NEXT: vmovdqa 192(%rdi), %xmm4
-; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11]
-; AVX512DQ-NEXT: vpshufb %xmm5, %xmm4, %xmm6
-; AVX512DQ-NEXT: vmovdqa64 %xmm5, %xmm20
-; AVX512DQ-NEXT: vmovdqa 208(%rdi), %xmm5
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0,1,2,3,4,5,6],ymm6[7]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm14 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
-; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm6
-; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm7
-; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm1
-; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm9
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm7 ^ (ymm9 & (ymm6 ^ ymm7))
-; AVX512DQ-NEXT: vextracti128 $1, %ymm9, %xmm10
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[5,12],zero,zero,xmm10[1,8,15,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,7,14],zero,zero,xmm9[3,10],zero,zero,zero,xmm9[u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpor %xmm10, %xmm9, %xmm13
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512DQ-NEXT: vmovdqa 96(%rdi), %ymm9
-; AVX512DQ-NEXT: vmovdqa %ymm11, %ymm15
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm15 = ymm1 ^ (ymm15 & (ymm9 ^ ymm1))
-; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm10
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0,1],ymm10[2],ymm15[3,4],ymm10[5],ymm15[6,7,8,9],ymm10[10],ymm15[11,12],ymm10[13],ymm15[14,15]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm13 & mem)
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero,xmm2[u,u,u,u]
+; AVX512DQ-NEXT: vpor %xmm4, %xmm2, %xmm2
+; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-NEXT: vmovdqa 192(%rdi), %xmm5
+; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm6 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11]
+; AVX512DQ-NEXT: vpshufb %xmm6, %xmm5, %xmm4
+; AVX512DQ-NEXT: vmovdqa64 %xmm6, %xmm19
+; AVX512DQ-NEXT: vmovdqa 208(%rdi), %xmm6
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1,2,3,4,5,6],ymm4[7]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm15 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
+; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm2
+; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm4
+; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm7
+; AVX512DQ-NEXT: vmovdqa %ymm15, %ymm8
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm4 ^ (ymm8 & (ymm2 ^ ymm4))
+; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm9
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[5,12],zero,zero,xmm9[1,8,15,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,7,14],zero,zero,xmm8[3,10],zero,zero,zero,xmm8[u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpor %xmm9, %xmm8, %xmm12
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512DQ-NEXT: vmovdqa 96(%rdi), %ymm10
+; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm13
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm13 = ymm7 ^ (ymm13 & (ymm10 ^ ymm7))
+; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm8
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm8[2],ymm13[3,4],ymm8[5],ymm13[6,7,8,9],ymm8[10],ymm13[11,12],ymm8[13],ymm13[14,15]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 | (ymm12 & mem)
; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,16777215,0]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm12 ^ (ymm16 & (ymm8 ^ ymm12))
-; AVX512DQ-NEXT: vmovdqa64 %ymm8, %ymm18
-; AVX512DQ-NEXT: vmovdqa %ymm11, %ymm12
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = ymm2 ^ (ymm12 & (ymm3 ^ ymm2))
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[u,u,u,6,13],zero,zero,xmm12[2,9],zero,zero,zero,xmm12[u,u,u,u]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm12, %xmm12
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u],zero,zero,xmm12[4,11],zero,zero,xmm12[0,7,14,u,u,u,u]
-; AVX512DQ-NEXT: vpor %xmm13, %xmm12, %xmm12
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm13 = ymm11 ^ (ymm16 & (ymm13 ^ ymm11))
+; AVX512DQ-NEXT: vmovdqa64 %ymm13, %ymm18
+; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm11
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = ymm1 ^ (ymm11 & (ymm3 ^ ymm1))
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[u,u,u,6,13],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[u,u,u,u]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm11, %xmm11
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u],zero,zero,xmm11[4,11],zero,zero,xmm11[0,7,14,u,u,u,u]
+; AVX512DQ-NEXT: vpor %xmm12, %xmm11, %xmm11
+; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm6[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3]
; AVX512DQ-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5,6],ymm8[7]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
-; AVX512DQ-NEXT: vmovdqa %ymm13, %ymm12
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = ymm7 ^ (ymm12 & (ymm6 ^ ymm7))
-; AVX512DQ-NEXT: vextracti128 $1, %ymm12, %xmm15
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[1,8,15],zero,zero,xmm12[4,11],zero,zero,xmm12[u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpor %xmm15, %xmm12, %xmm15
-; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm12
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = ymm9 ^ (ymm12 & (ymm1 ^ ymm9))
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm10[2],ymm12[3,4,5],ymm10[6],ymm12[7,8,9],ymm10[10],ymm12[11,12,13],ymm10[14],ymm12[15]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = ymm12 | (ymm15 & ~mem)
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = ymm8 ^ (ymm16 & (ymm12 ^ ymm8))
-; AVX512DQ-NEXT: vmovdqa64 %ymm12, %ymm19
-; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm8
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm7 ^ (ymm8 & (ymm6 ^ ymm7))
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[2,9],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm8
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpor %xmm15, %xmm8, %xmm8
-; AVX512DQ-NEXT: vmovdqa %ymm13, %ymm15
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm15 = ymm9 ^ (ymm15 & (ymm1 ^ ymm9))
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0,1,2],ymm10[3],ymm15[4,5],ymm10[6],ymm15[7,8,9,10],ymm10[11],ymm15[12,13],ymm10[14],ymm15[15]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm14 = ymm11[0,1,2,3,4,5,6],ymm12[7]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
+; AVX512DQ-NEXT: vmovdqa %ymm12, %ymm11
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = ymm4 ^ (ymm11 & (ymm2 ^ ymm4))
+; AVX512DQ-NEXT: vextracti128 $1, %ymm11, %xmm13
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[6,13],zero,zero,xmm13[2,9,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[1,8,15],zero,zero,xmm11[4,11],zero,zero,xmm11[u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpor %xmm13, %xmm11, %xmm13
+; AVX512DQ-NEXT: vmovdqa %ymm15, %ymm11
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = ymm10 ^ (ymm11 & (ymm7 ^ ymm10))
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0,1],ymm8[2],ymm11[3,4,5],ymm8[6],ymm11[7,8,9],ymm8[10],ymm11[11,12,13],ymm8[14],ymm11[15]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm13 & ~mem)
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = ymm14 ^ (ymm16 & (ymm11 ^ ymm14))
+; AVX512DQ-NEXT: vmovdqa %ymm12, %ymm8
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm10 ^ (ymm8 & (ymm7 ^ ymm10))
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm13 = ymm8[2,3,0,1]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2],ymm13[3],ymm8[4,5],ymm13[6],ymm8[7,8,9,10],ymm13[11],ymm8[12,13],ymm13[14],ymm8[15]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm13
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm13 = ymm4 ^ (ymm13 & (ymm2 ^ ymm4))
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm13, %xmm13
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[0,7,14],zero,zero,xmm13[3,10,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpor %xmm14, %xmm13, %xmm13
; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm17 = [18446744073709551615,255,18446744073709486080,18446744073709551615]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm8 & ymm17)
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm14 = ymm3 ^ (ymm14 & (ymm2 ^ ymm3))
-; AVX512DQ-NEXT: vextracti128 $1, %ymm14, %xmm8
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm13 = (ymm13 & ymm17) | ymm8
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm15 = ymm3 ^ (ymm15 & (ymm1 ^ ymm3))
+; AVX512DQ-NEXT: vextracti128 $1, %ymm15, %xmm8
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[1,8,15,u,u,u,u]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm15[u,u,0,7,14],zero,zero,xmm15[3,10],zero,zero,zero,xmm15[u,u,u,u]
; AVX512DQ-NEXT: vpor %xmm8, %xmm14, %xmm8
; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm12
-; AVX512DQ-NEXT: vpshufb %xmm12, %xmm5, %xmm14
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm14[0],xmm12[1],xmm14[1],xmm12[2],xmm14[2],xmm12[3],xmm14[3]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm12[7]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm15[0],ymm8[1,2,3,4,5,6,7],ymm15[8],ymm8[9,10,11,12,13,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1,2,3],ymm8[4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa64 %ymm8, %ymm20
-; AVX512DQ-NEXT: vmovdqa %ymm13, %ymm8
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm3 ^ (ymm8 & (ymm2 ^ ymm3))
-; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm12
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[2,9,u,u,u,u,u]
+; AVX512DQ-NEXT: vmovdqa64 %xmm19, %xmm14
+; AVX512DQ-NEXT: vpshufb %xmm14, %xmm6, %xmm14
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm14[7]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm13[0],ymm8[1,2,3,4,5,6,7],ymm13[8],ymm8[9,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-NEXT: vmovdqa64 %ymm8, %ymm19
+; AVX512DQ-NEXT: vmovdqa %ymm12, %ymm8
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm3 ^ (ymm8 & (ymm1 ^ ymm3))
+; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm13
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[2,9,u,u,u,u,u]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,1,8,15],zero,zero,xmm8[4,11],zero,zero,xmm8[u,u,u,u,u]
-; AVX512DQ-NEXT: vpor %xmm12, %xmm8, %xmm8
+; AVX512DQ-NEXT: vpor %xmm13, %xmm8, %xmm8
; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[5,12]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
-; AVX512DQ-NEXT: vpor %xmm12, %xmm14, %xmm12
-; AVX512DQ-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm13 = xmm6[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[5,12]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
+; AVX512DQ-NEXT: vpor %xmm13, %xmm14, %xmm13
+; AVX512DQ-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = ymm12 ^ (ymm16 & (ymm12 ^ ymm8))
-; AVX512DQ-NEXT: vmovdqa %ymm11, %ymm8
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm7 ^ (ymm8 & (ymm6 ^ ymm7))
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm8
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpor %xmm14, %xmm8, %xmm8
-; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm14
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm14 = ymm9 ^ (ymm14 & (ymm1 ^ ymm9))
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm14 = ymm10[0],ymm14[1,2],ymm10[3],ymm14[4,5,6],ymm10[7,8],ymm14[9,10],ymm10[11],ymm14[12,13,14],ymm10[15]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm14 = ymm14 | (ymm8 & ymm17)
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm14[0],ymm12[1,2,3,4,5,6,7],ymm14[8],ymm12[9,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 ^ (ymm16 & (ymm13 ^ ymm8))
+; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm8
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm10 ^ (ymm8 & (ymm7 ^ ymm10))
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm14 = ymm8[2,3,0,1]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm14[0],ymm8[1,2],ymm14[3],ymm8[4,5,6],ymm14[7,8],ymm8[9,10],ymm14[11],ymm8[12,13,14],ymm14[15]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm14
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm14 = ymm4 ^ (ymm14 & (ymm2 ^ ymm4))
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[3,10],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm14, %xmm14
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpor %xmm15, %xmm14, %xmm14
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm14 = (ymm14 & ymm17) | ymm8
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa64 %ymm8, %ymm21
+; AVX512DQ-NEXT: vmovdqa64 %ymm8, %ymm20
; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm8
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm3 ^ (ymm8 & (ymm2 ^ ymm3))
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,2,9],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[u,u,u,u,u]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm3 ^ (ymm8 & (ymm1 ^ ymm3))
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm13 = xmm8[u,u,2,9],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[u,u,u,u,u]
; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm8
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10,u,u,u,u,u]
-; AVX512DQ-NEXT: vpor %xmm12, %xmm8, %xmm8
+; AVX512DQ-NEXT: vpor %xmm13, %xmm8, %xmm8
; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm5[6,13]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
-; AVX512DQ-NEXT: vpor %xmm12, %xmm14, %xmm12
-; AVX512DQ-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = ymm12 ^ (ymm16 & (ymm12 ^ ymm8))
-; AVX512DQ-NEXT: vmovdqa %ymm13, %ymm8
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm6 ^ (ymm8 & (ymm7 ^ ymm6))
-; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm14
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[2,9],zero,zero,zero,xmm14[5,12,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpor %xmm14, %xmm8, %xmm8
-; AVX512DQ-NEXT: vmovdqa %ymm11, %ymm14
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm14 = ymm9 ^ (ymm14 & (ymm1 ^ ymm9))
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm14 = ymm10[0],ymm14[1,2,3],ymm10[4],ymm14[5,6],ymm10[7,8],ymm14[9,10,11],ymm10[12],ymm14[13,14],ymm10[15]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm14 = ymm14 | (ymm8 & ymm17)
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm14[0],ymm12[1,2,3,4,5,6,7],ymm14[8],ymm12[9,10,11,12,13,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm8[4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa %ymm11, %ymm8
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm3 ^ (ymm8 & (ymm2 ^ ymm3))
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm13 = xmm6[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm6[6,13]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
+; AVX512DQ-NEXT: vpor %xmm13, %xmm14, %xmm13
+; AVX512DQ-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 ^ (ymm16 & (ymm13 ^ ymm8))
+; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm8
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm10 ^ (ymm8 & (ymm7 ^ ymm10))
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm14 = ymm8[2,3,0,1]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm14[0],ymm8[1,2,3],ymm14[4],ymm8[5,6],ymm14[7,8],ymm8[9,10,11],ymm14[12],ymm8[13,14],ymm14[15]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT: vmovdqa %ymm12, %ymm14
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm14 = ymm2 ^ (ymm14 & (ymm4 ^ ymm2))
+; AVX512DQ-NEXT: vextracti128 $1, %ymm14, %xmm15
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpor %xmm15, %xmm14, %xmm14
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm14 = (ymm14 & ymm17) | ymm8
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-NEXT: vmovdqa64 %ymm8, %ymm21
+; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm8
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm3 ^ (ymm8 & (ymm1 ^ ymm3))
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm13 = xmm8[u,u,3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u]
; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm8
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u]
-; AVX512DQ-NEXT: vpor %xmm12, %xmm8, %xmm8
+; AVX512DQ-NEXT: vpor %xmm13, %xmm8, %xmm8
; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[0,7,14]
-; AVX512DQ-NEXT: vpor %xmm12, %xmm15, %xmm12
-; AVX512DQ-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = ymm12 ^ (ymm16 & (ymm12 ^ ymm8))
-; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm8
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm6 ^ (ymm8 & (ymm7 ^ ymm6))
-; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm15
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[5,12],zero,zero,xmm8[1,8,15],zero,zero,xmm8[u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpor %xmm15, %xmm8, %xmm8
-; AVX512DQ-NEXT: vmovdqa %ymm13, %ymm15
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm15 = ymm1 ^ (ymm15 & (ymm9 ^ ymm1))
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0],ymm10[1],ymm15[2,3],ymm10[4],ymm15[5,6,7,8],ymm10[9],ymm15[10,11],ymm10[12],ymm15[13,14,15]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm8 & ymm17)
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm15[0],ymm12[1,2,3,4,5,6,7],ymm15[8],ymm12[9,10,11,12,13,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1,2,3],ymm8[4,5,6,7]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm13 = ymm2 ^ (ymm13 & (ymm3 ^ ymm2))
-; AVX512DQ-NEXT: vextracti128 $1, %ymm13, %xmm2
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[u,u,4,11],zero,zero,xmm13[0,7,14],zero,zero,xmm13[u,u,u,u,u]
-; AVX512DQ-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm5[1,8,15]
-; AVX512DQ-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm15 = xmm6[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm6[0,7,14]
+; AVX512DQ-NEXT: vpor %xmm13, %xmm15, %xmm13
+; AVX512DQ-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 ^ (ymm16 & (ymm13 ^ ymm8))
+; AVX512DQ-NEXT: vmovdqa %ymm12, %ymm8
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm7 ^ (ymm8 & (ymm10 ^ ymm7))
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm8[2,3,0,1]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm15[1],ymm8[2,3],ymm15[4],ymm8[5,6,7,8],ymm15[9],ymm8[10,11],ymm15[12],ymm8[13,14,15]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm15
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm15 = ymm2 ^ (ymm15 & (ymm4 ^ ymm2))
+; AVX512DQ-NEXT: vextracti128 $1, %ymm15, %xmm14
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[3,10],zero,zero,zero,xmm14[6,13,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[5,12],zero,zero,xmm15[1,8,15],zero,zero,xmm15[u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpor %xmm14, %xmm15, %xmm14
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm14 = (ymm14 & ymm17) | ymm8
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = ymm1 ^ (ymm12 & (ymm3 ^ ymm1))
+; AVX512DQ-NEXT: vextracti128 $1, %ymm12, %xmm1
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[5,12,u,u,u,u,u]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm12[u,u,4,11],zero,zero,xmm12[0,7,14],zero,zero,xmm12[u,u,u,u,u]
+; AVX512DQ-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm6[1,8,15]
+; AVX512DQ-NEXT: vpor %xmm3, %xmm5, %xmm3
; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm16 & (ymm3 ^ ymm2))
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = ymm6 ^ (ymm11 & (ymm7 ^ ymm6))
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm11[6,13],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm11, %xmm4
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[4,11],zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpor %xmm2, %xmm4, %xmm2
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm1 ^ (ymm0 & (ymm9 ^ ymm1))
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7,8],ymm10[9],ymm0[10,11,12],ymm10[13],ymm0[14,15]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm16 & (ymm3 ^ ymm1))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm7 ^ (ymm0 & (ymm10 ^ ymm7))
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7,8],ymm1[9],ymm0[10,11,12],ymm1[13],ymm0[14,15]
; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm2 & ymm17)
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm3[1,2,3,4,5,6,7],ymm0[8],ymm3[9,10,11,12,13,14,15]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm2 ^ (ymm9 & (ymm4 ^ ymm2))
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[6,13],zero,zero,xmm9[2,9],zero,zero,zero,xmm9[u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm9, %xmm2
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpor %xmm1, %xmm2, %xmm1
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & ymm17) | ymm0
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm3[1,2,3,4,5,6,7],ymm1[8],ymm3[9,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512DQ-NEXT: vmovdqa64 %ymm18, (%rsi)
-; AVX512DQ-NEXT: vmovdqa64 %ymm19, (%rdx)
-; AVX512DQ-NEXT: vmovdqa64 %ymm20, (%rcx)
-; AVX512DQ-NEXT: vmovdqa64 %ymm21, (%r8)
-; AVX512DQ-NEXT: vmovdqa %ymm14, (%r9)
+; AVX512DQ-NEXT: vmovdqa %ymm11, (%rdx)
+; AVX512DQ-NEXT: vmovdqa64 %ymm19, (%rcx)
+; AVX512DQ-NEXT: vmovdqa64 %ymm20, (%r8)
+; AVX512DQ-NEXT: vmovdqa64 %ymm21, (%r9)
; AVX512DQ-NEXT: vmovdqa %ymm8, (%r10)
; AVX512DQ-NEXT: vmovdqa %ymm0, (%rax)
; AVX512DQ-NEXT: vzeroupper
@@ -6242,198 +6266,202 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535]
-; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm3
+; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm3
+; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm4
; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm2 ^ (ymm1 & (ymm3 ^ ymm2))
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm3 ^ (ymm1 & (ymm4 ^ ymm3))
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[6,13,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,1,2,4,6]
-; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm12
-; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm4, %ymm4
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5,6],ymm4[7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm4
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm5
-; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm13, %ymm7
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm5 ^ (ymm7 & (ymm4 ^ ymm5))
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[5,12],zero,zero,xmm8[1,8,15,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,7,14],zero,zero,xmm7[3,10],zero,zero,zero,xmm7[u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm10
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm7
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm11
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm1 ^ (ymm11 & (ymm7 ^ ymm1))
-; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm8
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm8[2],ymm11[3,4],ymm8[5],ymm11[6,7,8,9],ymm8[10],ymm11[11,12],ymm8[13],ymm11[14,15]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm10 & mem)
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,1,2,4,6]
+; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm11
+; AVX512DQ-FCP-NEXT: vpermd %ymm11, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3,4,5,6],ymm2[7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
+; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %ymm18
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm2
+; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm5
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm6
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm2 ^ (ymm6 & (ymm18 ^ ymm2))
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0,7,14],zero,zero,xmm6[3,10],zero,zero,zero,xmm6[u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm6, %xmm10
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm8
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, %ymm6
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm5 ^ (ymm6 & (ymm8 ^ ymm5))
+; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm13
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm13[2],ymm6[3,4],ymm13[5],ymm6[6,7,8,9],ymm13[10],ymm6[11,12],ymm13[13],ymm6[14,15]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 | (ymm10 & mem)
; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,16777215,0]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm6 ^ (ymm16 & (ymm11 ^ ymm6))
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm11, %ymm18
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm6
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm2 ^ (ymm6 & (ymm3 ^ ymm2))
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm6[u,u,u,6,13],zero,zero,xmm6[2,9],zero,zero,zero,xmm6[u,u,u,u]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,xmm6[4,11],zero,zero,xmm6[0,7,14,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm10, %xmm6
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm9 ^ (ymm16 & (ymm6 ^ ymm9))
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, %ymm9
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm3 ^ (ymm9 & (ymm4 ^ ymm3))
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,6,13],zero,zero,xmm9[2,9],zero,zero,zero,xmm9[u,u,u,u]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm9, %xmm9
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u],zero,zero,xmm9[4,11],zero,zero,xmm9[0,7,14,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm10, %xmm9, %xmm9
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,1,3,4,6]
-; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm10, %ymm10
+; AVX512DQ-FCP-NEXT: vpermd %ymm11, %ymm10, %ymm10
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm10[7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm11, %ymm10
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm5 ^ (ymm10 & (ymm4 ^ ymm5))
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm14
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm9[0,1,2,3,4,5,6],ymm10[7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm9
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm2 ^ (ymm9 & (ymm18 ^ ymm2))
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm9, %xmm14
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm14[6,13],zero,zero,xmm14[2,9,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[1,8,15],zero,zero,xmm10[4,11],zero,zero,xmm10[u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm14, %xmm10, %xmm14
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm13, %ymm10
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm7 ^ (ymm10 & (ymm1 ^ ymm7))
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm8[2],ymm10[3,4,5],ymm8[6],ymm10[7,8,9],ymm8[10],ymm10[11,12,13],ymm8[14],ymm10[15]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm10 | (ymm14 & ~mem)
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm6 ^ (ymm16 & (ymm10 ^ ymm6))
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm6
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm5 ^ (ymm6 & (ymm4 ^ ymm5))
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm6[2,9],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[0,7,14],zero,zero,xmm6[3,10,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm14, %xmm6
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm11, %ymm14
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = ymm7 ^ (ymm14 & (ymm1 ^ ymm7))
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0,1,2],ymm8[3],ymm14[4,5],ymm8[6],ymm14[7,8,9,10],ymm8[11],ymm14[12,13],ymm8[14],ymm14[15]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm14[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm17 = [18446744073709551615,255,18446744073709486080,18446744073709551615]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = ymm14 | (ymm6 & ymm17)
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm3 ^ (ymm13 & (ymm2 ^ ymm3))
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm6
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[1,8,15,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,0,7,14],zero,zero,xmm13[3,10],zero,zero,zero,xmm13[u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm13, %xmm6
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[1,8,15],zero,zero,xmm9[4,11],zero,zero,xmm9[u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm14, %xmm9, %xmm14
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm9
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm8 ^ (ymm9 & (ymm5 ^ ymm8))
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm13[2],ymm9[3,4,5],ymm13[6],ymm9[7,8,9],ymm13[10],ymm9[11,12,13],ymm13[14],ymm9[15]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm14 & ~mem)
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm15 ^ (ymm16 & (ymm9 ^ ymm15))
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm13
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm8 ^ (ymm13 & (ymm5 ^ ymm8))
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm13[2,3,0,1]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3],ymm13[4,5],ymm14[6],ymm13[7,8,9,10],ymm14[11],ymm13[12,13],ymm14[14],ymm13[15]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm14
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = ymm2 ^ (ymm14 & (ymm18 ^ ymm2))
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[2,9],zero,zero,zero,xmm14[5,12],zero,zero,xmm14[u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[0,7,14],zero,zero,xmm14[3,10,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14
+; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,255,18446744073709486080,18446744073709551615]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = (ymm14 & ymm16) | ymm13
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm4 ^ (ymm12 & (ymm3 ^ ymm4))
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[1,8,15,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,0,7,14],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm13, %xmm12, %xmm12
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,0,0,0,1,3,5,6]
-; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm13, %ymm12
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm12[7]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm14[0],ymm6[1,2,3,4,5,6,7],ymm14[8],ymm6[9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm19
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm11, %ymm6
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm3 ^ (ymm6 & (ymm2 ^ ymm3))
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm12
+; AVX512DQ-FCP-NEXT: vpermd %ymm11, %ymm13, %ymm11
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5,6],ymm11[7]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1,2,3,4,5,6,7],ymm14[8],ymm11[9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm19
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm11
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm4 ^ (ymm11 & (ymm3 ^ ymm4))
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[2,9,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,1,8,15],zero,zero,xmm6[4,11],zero,zero,xmm6[u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm12, %xmm6
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512DQ-FCP-NEXT: vmovdqa 208(%rdi), %xmm14
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm14[5,12]
-; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %xmm12
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
-; AVX512DQ-FCP-NEXT: vpor %xmm13, %xmm15, %xmm13
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm16 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 ^ (ymm16 & (ymm13 ^ ymm6))
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm6
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm5 ^ (ymm6 & (ymm4 ^ ymm5))
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm6[3,10],zero,zero,zero,xmm6[6,13],zero,zero,xmm6[u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,11,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm15, %xmm6
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,1,8,15],zero,zero,xmm11[4,11],zero,zero,xmm11[u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm12
+; AVX512DQ-FCP-NEXT: vmovdqa 208(%rdi), %xmm13
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm13[5,12]
+; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %xmm11
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
+; AVX512DQ-FCP-NEXT: vpor %xmm14, %xmm15, %xmm14
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm17 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = ymm14 ^ (ymm17 & (ymm14 ^ ymm12))
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm12
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm8 ^ (ymm12 & (ymm5 ^ ymm8))
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm12[2,3,0,1]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm15[0],ymm12[1,2],ymm15[3],ymm12[4,5,6],ymm15[7,8],ymm12[9,10],ymm15[11],ymm12[12,13,14],ymm15[15]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, %ymm15
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm2 ^ (ymm15 & (ymm18 ^ ymm2))
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm15[3,10],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[1,8,15],zero,zero,xmm15[4,11,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm15, %xmm1
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & ymm16) | ymm12
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm1[0],ymm14[1,2,3,4,5,6,7],ymm1[8],ymm14[9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm20
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm1
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm4 ^ (ymm1 & (ymm3 ^ ymm4))
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm1[u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm12, %xmm1
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm13[6,13]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
+; AVX512DQ-FCP-NEXT: vpor %xmm12, %xmm15, %xmm12
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm12 ^ (ymm17 & (ymm12 ^ ymm1))
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, %ymm1
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm8 ^ (ymm1 & (ymm5 ^ ymm8))
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm1[2,3,0,1]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0],ymm1[1,2,3],ymm15[4],ymm1[5,6],ymm15[7,8],ymm1[9,10,11],ymm15[12],ymm1[13,14],ymm15[15]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm15
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm18 ^ (ymm15 & (ymm2 ^ ymm18))
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm15, %xmm14
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[2,9],zero,zero,zero,xmm14[5,12,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[4,11],zero,zero,xmm15[0,7,14],zero,zero,xmm15[u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm14, %xmm15, %xmm14
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = (ymm14 & ymm16) | ymm1
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0],ymm12[1,2,3,4,5,6,7],ymm14[8],ymm12[9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm21
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, %ymm1
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm4 ^ (ymm1 & (ymm3 ^ ymm4))
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm1[u,u,3,10],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,xmm1[1,8,15],zero,zero,xmm1[4,11,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm14, %xmm1
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm11[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm13[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm13[0,7,14]
+; AVX512DQ-FCP-NEXT: vpor %xmm14, %xmm15, %xmm14
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = ymm14 ^ (ymm17 & (ymm14 ^ ymm1))
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm1
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm5 ^ (ymm1 & (ymm8 ^ ymm5))
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm1[2,3,0,1]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3],ymm15[4],ymm1[5,6,7,8],ymm15[9],ymm1[10,11],ymm15[12],ymm1[13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm15
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm7 ^ (ymm15 & (ymm1 ^ ymm7))
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm8[0],ymm15[1,2],ymm8[3],ymm15[4,5,6],ymm8[7,8],ymm15[9,10],ymm8[11],ymm15[12,13,14],ymm8[15]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm6 & ymm17)
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm15[0],ymm13[1,2,3,4,5,6,7],ymm15[8],ymm13[9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm20
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm6
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm3 ^ (ymm6 & (ymm2 ^ ymm3))
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm6[u,u,2,9],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[0,7,14],zero,zero,xmm6[3,10,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm13, %xmm6
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm14[6,13]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
-; AVX512DQ-FCP-NEXT: vpor %xmm13, %xmm15, %xmm13
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 ^ (ymm16 & (ymm13 ^ ymm6))
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm11, %ymm6
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm4 ^ (ymm6 & (ymm5 ^ ymm4))
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm15
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm15, %xmm6
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm15
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm7 ^ (ymm15 & (ymm1 ^ ymm7))
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm8[0],ymm15[1,2,3],ymm8[4],ymm15[5,6],ymm8[7,8],ymm15[9,10,11],ymm8[12],ymm15[13,14],ymm8[15]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm6 & ymm17)
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm15[0],ymm13[1,2,3,4,5,6,7],ymm15[8],ymm13[9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm21
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm6
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm3 ^ (ymm6 & (ymm2 ^ ymm3))
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm6[u,u,3,10],zero,zero,zero,xmm6[6,13],zero,zero,xmm6[u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,11,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm15, %xmm6
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm14[0,7,14]
-; AVX512DQ-FCP-NEXT: vpor %xmm15, %xmm13, %xmm13
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 ^ (ymm16 & (ymm13 ^ ymm6))
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm6
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm4 ^ (ymm6 & (ymm5 ^ ymm4))
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm15
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[5,12],zero,zero,xmm6[1,8,15],zero,zero,xmm6[u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm15, %xmm6
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm11, %ymm15
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm1 ^ (ymm15 & (ymm7 ^ ymm1))
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0],ymm8[1],ymm15[2,3],ymm8[4],ymm15[5,6,7,8],ymm8[9],ymm15[10,11],ymm8[12],ymm15[13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm6 & ymm17)
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm15[0],ymm13[1,2,3,4,5,6,7],ymm15[8],ymm13[9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm2 ^ (ymm11 & (ymm3 ^ ymm2))
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm2
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[u,u,4,11],zero,zero,xmm11[0,7,14],zero,zero,xmm11[u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm12[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm14[1,8,15]
-; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm11, %xmm3
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm18 ^ (ymm15 & (ymm2 ^ ymm18))
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm15, %xmm12
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[3,10],zero,zero,zero,xmm12[6,13,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[5,12],zero,zero,xmm15[1,8,15],zero,zero,xmm15[u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm12, %xmm15, %xmm12
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = (ymm12 & ymm16) | ymm1
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0],ymm14[1,2,3,4,5,6,7],ymm12[8],ymm14[9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm12[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm3 ^ (ymm10 & (ymm4 ^ ymm3))
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm1
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[5,12,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[u,u,4,11],zero,zero,xmm10[0,7,14],zero,zero,xmm10[u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm11[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm13[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm13[1,8,15]
+; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm16 & (ymm3 ^ ymm2))
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm4 ^ (ymm9 & (ymm5 ^ ymm4))
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[6,13],zero,zero,xmm9[2,9],zero,zero,zero,xmm9[u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm9, %xmm4
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[4,11],zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm4, %xmm2
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm1 ^ (ymm0 & (ymm7 ^ ymm1))
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2,3,4],ymm8[5],ymm0[6,7,8],ymm8[9],ymm0[10,11,12],ymm8[13],ymm0[14,15]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm17 & (ymm3 ^ ymm1))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm5 ^ (ymm0 & (ymm8 ^ ymm5))
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7,8],ymm1[9],ymm0[10,11,12],ymm1[13],ymm0[14,15]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm2 & ymm17)
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm3[1,2,3,4,5,6,7],ymm0[8],ymm3[9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, (%rdx)
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm18 ^ (ymm7 & (ymm2 ^ ymm18))
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[6,13],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm2
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm2, %xmm1
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & ymm16) | ymm0
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm3[1,2,3,4,5,6,7],ymm1[8],ymm3[9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, (%rsi)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, (%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, (%rcx)
; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, (%r8)
; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, (%r9)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, (%r10)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm15, (%r10)
; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rax)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
@@ -6442,186 +6470,194 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [16,9,2,19,20,13,6,23,24,0,26,27,28,0,30,31]
-; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1
-; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,9,2,19,12,5,22,23,24,0,26,27,0,29,30,31]
-; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm4
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [8,1,2,19,12,5,22,15,0,9,26,11,0,29,14,0]
-; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm5
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [8,1,18,11,4,5,22,15,0,25,10,0,12,29,14,0]
-; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm11
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,1,18,11,4,21,14,7,8,25,10,0,28,13,0,15]
-; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm12
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,17,10,3,4,21,14,7,24,9,0,11,28,13,0,31]
-; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm10
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,17,10,3,20,13,6,23,24,25,0,27,28,0,30,31]
-; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm6
-; AVX512BW-NEXT: vmovdqa (%rdi), %ymm3
-; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2
+; AVX512BW-NEXT: vmovdqa (%rdi), %ymm2
+; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm4
+; AVX512BW-NEXT: vmovdqa 96(%rdi), %ymm3
; AVX512BW-NEXT: movw $-28382, %r11w # imm = 0x9122
; AVX512BW-NEXT: kmovd %r11d, %k5
-; AVX512BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm1 {%k5}
-; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm7
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u]
-; AVX512BW-NEXT: vpor %xmm7, %xmm1, %xmm1
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: movw $992, %r11w # imm = 0x3E0
+; AVX512BW-NEXT: vpblendmw %ymm1, %ymm2, %ymm0 {%k5}
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm5
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u]
+; AVX512BW-NEXT: vpor %xmm5, %xmm0, %xmm0
+; AVX512BW-NEXT: movw $9288, %r11w # imm = 0x2448
; AVX512BW-NEXT: kmovd %r11d, %k1
-; AVX512BW-NEXT: vmovdqu16 %ymm6, %ymm1 {%k1}
-; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm7
-; AVX512BW-NEXT: vmovdqa 160(%rdi), %ymm6
+; AVX512BW-NEXT: vpblendmw %ymm4, %ymm3, %ymm5 {%k1}
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1]
+; AVX512BW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15]
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: movw $992, %r11w # imm = 0x3E0
+; AVX512BW-NEXT: kmovd %r11d, %k2
+; AVX512BW-NEXT: vmovdqu16 %ymm5, %ymm0 {%k2}
+; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm6
+; AVX512BW-NEXT: vmovdqa 160(%rdi), %ymm5
; AVX512BW-NEXT: movw $8772, %r11w # imm = 0x2244
-; AVX512BW-NEXT: kmovd %r11d, %k1
-; AVX512BW-NEXT: vpblendmw %ymm7, %ymm6, %ymm8 {%k1}
-; AVX512BW-NEXT: vextracti128 $1, %ymm8, %xmm9
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u],zero,zero,xmm9[3,10],zero,zero,zero,xmm9[6,13,u,u,u,u]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,5,12],zero,zero,xmm8[1,8,15],zero,zero,xmm8[u,u,u,u]
-; AVX512BW-NEXT: vpor %xmm9, %xmm8, %xmm8
-; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm13
-; AVX512BW-NEXT: vmovdqa 192(%rdi), %xmm8
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm14 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
-; AVX512BW-NEXT: vpshufb %xmm14, %xmm8, %xmm15
-; AVX512BW-NEXT: vmovdqa 208(%rdi), %xmm9
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm9[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm16[0],xmm15[1],xmm16[1],xmm15[2],xmm16[2],xmm15[3],xmm16[3]
-; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7]
+; AVX512BW-NEXT: kmovd %r11d, %k2
+; AVX512BW-NEXT: vpblendmw %ymm6, %ymm5, %ymm7 {%k2}
+; AVX512BW-NEXT: vextracti128 $1, %ymm7, %xmm8
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u],zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13,u,u,u,u]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,5,12],zero,zero,xmm7[1,8,15],zero,zero,xmm7[u,u,u,u]
+; AVX512BW-NEXT: vpor %xmm7, %xmm8, %xmm7
+; AVX512BW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm9
+; AVX512BW-NEXT: vmovdqa 192(%rdi), %xmm7
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
+; AVX512BW-NEXT: vpshufb %xmm10, %xmm7, %xmm11
+; AVX512BW-NEXT: vmovdqa 208(%rdi), %xmm8
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
+; AVX512BW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm11[7]
; AVX512BW-NEXT: movl $-524288, %edi # imm = 0xFFF80000
; AVX512BW-NEXT: kmovd %edi, %k4
-; AVX512BW-NEXT: vmovdqu8 %ymm13, %ymm1 {%k4}
+; AVX512BW-NEXT: vmovdqu8 %ymm9, %ymm0 {%k4}
; AVX512BW-NEXT: movw $4644, %di # imm = 0x1224
-; AVX512BW-NEXT: kmovd %edi, %k2
-; AVX512BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm13 {%k2}
-; AVX512BW-NEXT: vextracti128 $1, %ymm13, %xmm15
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpor %xmm15, %xmm13, %xmm13
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: movl $511, %edi # imm = 0x1FF
-; AVX512BW-NEXT: kmovd %edi, %k3
-; AVX512BW-NEXT: vmovdqu8 %ymm13, %ymm10 {%k3}
-; AVX512BW-NEXT: movw $9288, %di # imm = 0x2448
; AVX512BW-NEXT: kmovd %edi, %k3
-; AVX512BW-NEXT: vpblendmw %ymm7, %ymm6, %ymm13 {%k3}
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm13[u,u,u,6,13],zero,zero,xmm13[2,9],zero,zero,zero,xmm13[u,u,u,u]
-; AVX512BW-NEXT: vextracti128 $1, %ymm13, %xmm13
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u],zero,zero,xmm13[4,11],zero,zero,xmm13[0,7,14,u,u,u,u]
-; AVX512BW-NEXT: vpor %xmm15, %xmm13, %xmm13
-; AVX512BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm9[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm8[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3]
-; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7]
-; AVX512BW-NEXT: vmovdqu8 %ymm13, %ymm10 {%k4}
-; AVX512BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm13 {%k1}
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm13[2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vextracti128 $1, %ymm13, %xmm13
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[0,7,14],zero,zero,xmm13[3,10,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpor %xmm15, %xmm13, %xmm13
+; AVX512BW-NEXT: vpblendmw %ymm1, %ymm2, %ymm9 {%k3}
+; AVX512BW-NEXT: vextracti128 $1, %ymm9, %xmm11
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm11[6,13],zero,zero,xmm11[2,9,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[1,8,15],zero,zero,xmm9[4,11],zero,zero,xmm9[u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpor %xmm11, %xmm9, %xmm11
+; AVX512BW-NEXT: vpblendmw %ymm3, %ymm4, %ymm9 {%k5}
+; AVX512BW-NEXT: vextracti128 $1, %ymm9, %xmm12
+; AVX512BW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm12[2],ymm9[3,4,5],ymm12[6],ymm9[7,8,9],ymm12[10],ymm9[11,12,13],ymm12[14],ymm9[15]
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: movl $511, %edi # imm = 0x1FF
+; AVX512BW-NEXT: kmovd %edi, %k6
+; AVX512BW-NEXT: vmovdqu8 %ymm11, %ymm9 {%k6}
+; AVX512BW-NEXT: vpblendmw %ymm6, %ymm5, %ymm11 {%k1}
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[u,u,u,6,13],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[u,u,u,u]
+; AVX512BW-NEXT: vextracti128 $1, %ymm11, %xmm11
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u],zero,zero,xmm11[4,11],zero,zero,xmm11[0,7,14,u,u,u,u]
+; AVX512BW-NEXT: vpor %xmm12, %xmm11, %xmm11
+; AVX512BW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm7[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
+; AVX512BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7]
+; AVX512BW-NEXT: vmovdqu8 %ymm11, %ymm9 {%k4}
+; AVX512BW-NEXT: vpblendmw %ymm3, %ymm4, %ymm11 {%k3}
+; AVX512BW-NEXT: vextracti128 $1, %ymm11, %xmm12
+; AVX512BW-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7,8,9,10],ymm12[11],ymm11[12,13],ymm12[14],ymm11[15]
+; AVX512BW-NEXT: vpblendmw %ymm1, %ymm2, %ymm12 {%k2}
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vextracti128 $1, %ymm12, %xmm12
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[0,7,14],zero,zero,xmm12[3,10,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpor %xmm13, %xmm12, %xmm12
; AVX512BW-NEXT: movl $261632, %edi # imm = 0x3FE00
; AVX512BW-NEXT: kmovd %edi, %k4
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm13 {%k4} = ymm12[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpblendmw %ymm6, %ymm7, %ymm12 {%k5}
-; AVX512BW-NEXT: vextracti128 $1, %ymm12, %xmm15
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[5,12],zero,zero,xmm15[1,8,15,u,u,u,u]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,0,7,14],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[u,u,u,u]
-; AVX512BW-NEXT: vpor %xmm15, %xmm12, %xmm12
-; AVX512BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX512BW-NEXT: vpshufb %xmm14, %xmm9, %xmm14
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
-; AVX512BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm14[7]
-; AVX512BW-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4,5,6,7],ymm13[8],ymm12[9,10,11,12,13,14,15]
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7]
-; AVX512BW-NEXT: vpblendmw %ymm6, %ymm7, %ymm13 {%k2}
-; AVX512BW-NEXT: vextracti128 $1, %ymm13, %xmm14
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[2,9,u,u,u,u,u]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u]
-; AVX512BW-NEXT: vpor %xmm14, %xmm13, %xmm13
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm12 {%k4} = ymm11[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpblendmw %ymm5, %ymm6, %ymm11 {%k5}
+; AVX512BW-NEXT: vextracti128 $1, %ymm11, %xmm13
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[1,8,15,u,u,u,u]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,0,7,14],zero,zero,xmm11[3,10],zero,zero,zero,xmm11[u,u,u,u]
+; AVX512BW-NEXT: vpor %xmm13, %xmm11, %xmm11
+; AVX512BW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512BW-NEXT: vpshufb %xmm10, %xmm8, %xmm10
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm7[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3]
+; AVX512BW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5,6],ymm10[7]
+; AVX512BW-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0],ymm10[1,2,3,4,5,6,7],ymm12[8],ymm10[9,10,11,12,13,14,15]
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7]
+; AVX512BW-NEXT: vpblendmw %ymm3, %ymm4, %ymm11 {%k2}
+; AVX512BW-NEXT: vextracti128 $1, %ymm11, %xmm12
+; AVX512BW-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1,2],ymm12[3],ymm11[4,5,6],ymm12[7,8],ymm11[9,10],ymm12[11],ymm11[12,13,14],ymm12[15]
+; AVX512BW-NEXT: vpblendmw %ymm1, %ymm2, %ymm12 {%k1}
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[3,10],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vextracti128 $1, %ymm12, %xmm12
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[1,8,15],zero,zero,xmm12[4,11,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpor %xmm13, %xmm12, %xmm12
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm12 {%k4} = ymm11[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpblendmw %ymm5, %ymm6, %ymm11 {%k3}
+; AVX512BW-NEXT: vextracti128 $1, %ymm11, %xmm13
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[2,9,u,u,u,u,u]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,1,8,15],zero,zero,xmm11[4,11],zero,zero,xmm11[u,u,u,u,u]
+; AVX512BW-NEXT: vpor %xmm13, %xmm11, %xmm11
+; AVX512BW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[5,12]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm7[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
+; AVX512BW-NEXT: vpor %xmm13, %xmm14, %xmm13
; AVX512BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[5,12]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
-; AVX512BW-NEXT: vpor %xmm14, %xmm15, %xmm14
-; AVX512BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
; AVX512BW-NEXT: movl $-134217728, %edi # imm = 0xF8000000
; AVX512BW-NEXT: kmovd %edi, %k5
-; AVX512BW-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5}
-; AVX512BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm14 {%k3}
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[3,10],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vextracti128 $1, %ymm14, %xmm14
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpor %xmm15, %xmm14, %xmm14
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm11[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15]
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm11[4,5,6,7]
-; AVX512BW-NEXT: vpblendmw %ymm6, %ymm7, %ymm13 {%k1}
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u]
-; AVX512BW-NEXT: vextracti128 $1, %ymm13, %xmm13
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[0,7,14],zero,zero,xmm13[3,10,u,u,u,u,u]
+; AVX512BW-NEXT: vmovdqu8 %ymm13, %ymm11 {%k5}
+; AVX512BW-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1,2,3,4,5,6,7],ymm12[8],ymm11[9,10,11,12,13,14,15]
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7]
+; AVX512BW-NEXT: vpblendmw %ymm3, %ymm4, %ymm12 {%k1}
+; AVX512BW-NEXT: vextracti128 $1, %ymm12, %xmm13
+; AVX512BW-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3],ymm13[4],ymm12[5,6],ymm13[7,8],ymm12[9,10,11],ymm13[12],ymm12[13,14],ymm13[15]
+; AVX512BW-NEXT: vpblendmw %ymm2, %ymm1, %ymm13 {%k3}
+; AVX512BW-NEXT: vextracti128 $1, %ymm13, %xmm14
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[2,9],zero,zero,zero,xmm14[5,12,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[4,11],zero,zero,xmm13[0,7,14],zero,zero,xmm13[u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpor %xmm14, %xmm13, %xmm13
-; AVX512BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[6,13]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm13 {%k4} = ymm12[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpblendmw %ymm5, %ymm6, %ymm12 {%k2}
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u,u,u]
+; AVX512BW-NEXT: vextracti128 $1, %ymm12, %xmm12
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,xmm12[0,7,14],zero,zero,xmm12[3,10,u,u,u,u,u]
+; AVX512BW-NEXT: vpor %xmm14, %xmm12, %xmm12
+; AVX512BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[6,13]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
; AVX512BW-NEXT: vpor %xmm14, %xmm15, %xmm14
; AVX512BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512BW-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5}
-; AVX512BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm14 {%k2}
-; AVX512BW-NEXT: vextracti128 $1, %ymm14, %xmm15
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpor %xmm15, %xmm14, %xmm14
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm5[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpblendw {{.*#+}} ymm5 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15]
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm5[4,5,6,7]
-; AVX512BW-NEXT: vpblendmw %ymm6, %ymm7, %ymm13 {%k3}
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,3,10],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[u,u,u,u,u]
-; AVX512BW-NEXT: vextracti128 $1, %ymm13, %xmm13
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[1,8,15],zero,zero,xmm13[4,11,u,u,u,u,u]
-; AVX512BW-NEXT: vpor %xmm14, %xmm13, %xmm13
-; AVX512BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[0,7,14]
-; AVX512BW-NEXT: vpor %xmm14, %xmm15, %xmm14
-; AVX512BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512BW-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5}
-; AVX512BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm14 {%k1}
+; AVX512BW-NEXT: vmovdqu8 %ymm14, %ymm12 {%k5}
+; AVX512BW-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4,5,6,7],ymm13[8],ymm12[9,10,11,12,13,14,15]
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7]
+; AVX512BW-NEXT: vpblendmw %ymm4, %ymm3, %ymm13 {%k3}
+; AVX512BW-NEXT: vextracti128 $1, %ymm13, %xmm14
+; AVX512BW-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5,6,7,8],ymm14[9],ymm13[10,11],ymm14[12],ymm13[13,14,15]
+; AVX512BW-NEXT: vpblendmw %ymm2, %ymm1, %ymm14 {%k2}
; AVX512BW-NEXT: vextracti128 $1, %ymm14, %xmm15
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[5,12],zero,zero,xmm14[1,8,15],zero,zero,xmm14[u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpor %xmm15, %xmm14, %xmm14
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm4[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15]
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-NEXT: vmovdqu16 %ymm7, %ymm6 {%k2}
-; AVX512BW-NEXT: vextracti128 $1, %ymm6, %xmm7
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12,u,u,u,u,u]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u]
-; AVX512BW-NEXT: vpor %xmm7, %xmm6, %xmm6
-; AVX512BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[1,8,15]
-; AVX512BW-NEXT: vpor %xmm7, %xmm8, %xmm7
-; AVX512BW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512BW-NEXT: vmovdqu8 %ymm7, %ymm6 {%k5}
-; AVX512BW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k3}
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm2
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 {%k4} = ymm0[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm6[1,2,3,4,5,6,7],ymm2[8],ymm6[9,10,11,12,13,14,15]
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-NEXT: vmovdqa %ymm1, (%rsi)
-; AVX512BW-NEXT: vmovdqa %ymm10, (%rdx)
-; AVX512BW-NEXT: vmovdqa %ymm12, (%rcx)
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm13[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpblendmw %ymm5, %ymm6, %ymm13 {%k1}
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm13[u,u,3,10],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[u,u,u,u,u]
+; AVX512BW-NEXT: vextracti128 $1, %ymm13, %xmm13
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[1,8,15],zero,zero,xmm13[4,11,u,u,u,u,u]
+; AVX512BW-NEXT: vpor %xmm15, %xmm13, %xmm13
+; AVX512BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm8[0,7,14]
+; AVX512BW-NEXT: vporq %xmm15, %xmm16, %xmm15
+; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512BW-NEXT: vmovdqu8 %ymm15, %ymm13 {%k5}
+; AVX512BW-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15]
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7]
+; AVX512BW-NEXT: vmovdqu16 %ymm4, %ymm3 {%k2}
+; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX512BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15]
+; AVX512BW-NEXT: vmovdqu16 %ymm2, %ymm1 {%k1}
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpor %xmm2, %xmm1, %xmm1
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 {%k4} = ymm3[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vmovdqu16 %ymm6, %ymm5 {%k3}
+; AVX512BW-NEXT: vextracti128 $1, %ymm5, %xmm2
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[u,u,4,11],zero,zero,xmm5[0,7,14],zero,zero,xmm5[u,u,u,u,u]
+; AVX512BW-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm8[1,8,15]
+; AVX512BW-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512BW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k5}
+; AVX512BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi)
+; AVX512BW-NEXT: vmovdqa %ymm9, (%rdx)
+; AVX512BW-NEXT: vmovdqa %ymm10, (%rcx)
; AVX512BW-NEXT: vmovdqa %ymm11, (%r8)
-; AVX512BW-NEXT: vmovdqa %ymm5, (%r9)
-; AVX512BW-NEXT: vmovdqa %ymm4, (%r10)
-; AVX512BW-NEXT: vmovdqa %ymm0, (%rax)
+; AVX512BW-NEXT: vmovdqa %ymm12, (%r9)
+; AVX512BW-NEXT: vmovdqa %ymm13, (%r10)
+; AVX512BW-NEXT: vmovdqa %ymm1, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
@@ -6629,182 +6665,182 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31]
-; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
-; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
-; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31]
-; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm4
-; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7]
-; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm5
-; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7]
-; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm9
-; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15]
-; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm10
-; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31]
-; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm8
-; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31]
-; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm6
+; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0
; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm3
; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2
; AVX512BW-FCP-NEXT: movw $-28382, %r11w # imm = 0x9122
; AVX512BW-FCP-NEXT: kmovd %r11d, %k5
; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm1 {%k5}
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm7
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm1, %xmm1
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpor %xmm4, %xmm1, %xmm1
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,17,10,3,20,13,6,0,24,0,0,27,0,0,0,31]
+; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm4, %zmm4
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: movw $992, %r11w # imm = 0x3E0
; AVX512BW-FCP-NEXT: kmovd %r11d, %k1
-; AVX512BW-FCP-NEXT: vmovdqu16 %ymm6, %ymm1 {%k1}
-; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm7
-; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm6
+; AVX512BW-FCP-NEXT: vmovdqu16 %ymm4, %ymm1 {%k1}
+; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm5
+; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm4
; AVX512BW-FCP-NEXT: movw $8772, %r11w # imm = 0x2244
; AVX512BW-FCP-NEXT: kmovd %r11d, %k1
-; AVX512BW-FCP-NEXT: vpblendmw %ymm7, %ymm6, %ymm11 {%k1}
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[6,13,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,5,12],zero,zero,xmm11[1,8,15],zero,zero,xmm11[u,u,u,u]
-; AVX512BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11
-; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,0,0,0,1,2,4,6]
-; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm13
-; AVX512BW-FCP-NEXT: vpermd %ymm13, %ymm12, %ymm12
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7]
+; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm4, %ymm6 {%k1}
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u],zero,zero,xmm7[3,10],zero,zero,zero,xmm7[6,13,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,5,12],zero,zero,xmm6[1,8,15],zero,zero,xmm6[u,u,u,u]
+; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6
+; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,1,2,4,6]
+; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm7
+; AVX512BW-FCP-NEXT: vpermd %ymm7, %ymm8, %ymm8
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7]
; AVX512BW-FCP-NEXT: movl $-524288, %r11d # imm = 0xFFF80000
; AVX512BW-FCP-NEXT: kmovd %r11d, %k4
-; AVX512BW-FCP-NEXT: vmovdqu8 %ymm11, %ymm1 {%k4}
+; AVX512BW-FCP-NEXT: vmovdqu8 %ymm6, %ymm1 {%k4}
; AVX512BW-FCP-NEXT: movw $4644, %r11w # imm = 0x1224
; AVX512BW-FCP-NEXT: kmovd %r11d, %k2
-; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm11 {%k2}
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm12[6,13],zero,zero,xmm12[2,9,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[1,8,15],zero,zero,xmm11[4,11],zero,zero,xmm11[u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm6 {%k2}
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm8
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[6,13],zero,zero,xmm8[2,9,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[1,8,15],zero,zero,xmm6[4,11],zero,zero,xmm6[u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm8, %xmm8
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,17,10,3,0,21,14,7,24,0,0,0,28,0,0,31]
+; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm6, %zmm6
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: movl $511, %r11d # imm = 0x1FF
; AVX512BW-FCP-NEXT: kmovd %r11d, %k3
-; AVX512BW-FCP-NEXT: vmovdqu8 %ymm11, %ymm8 {%k3}
+; AVX512BW-FCP-NEXT: vmovdqu8 %ymm8, %ymm6 {%k3}
; AVX512BW-FCP-NEXT: movw $9288, %r11w # imm = 0x2448
; AVX512BW-FCP-NEXT: kmovd %r11d, %k3
-; AVX512BW-FCP-NEXT: vpblendmw %ymm7, %ymm6, %ymm11 {%k3}
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[u,u,u,6,13],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[u,u,u,u]
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm11
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u],zero,zero,xmm11[4,11],zero,zero,xmm11[0,7,14,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11
-; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,0,0,0,1,3,4,6]
-; AVX512BW-FCP-NEXT: vpermd %ymm13, %ymm12, %ymm12
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7]
-; AVX512BW-FCP-NEXT: vmovdqu8 %ymm11, %ymm8 {%k4}
-; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm11 {%k1}
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[2,9],zero,zero,zero,xmm11[5,12],zero,zero,xmm11[u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm11
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[0,7,14],zero,zero,xmm11[3,10,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11
+; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm4, %ymm8 {%k3}
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[u,u,u,6,13],zero,zero,xmm8[2,9],zero,zero,zero,xmm8[u,u,u,u]
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u],zero,zero,xmm8[4,11],zero,zero,xmm8[0,7,14,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpor %xmm9, %xmm8, %xmm8
+; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,1,3,4,6]
+; AVX512BW-FCP-NEXT: vpermd %ymm7, %ymm9, %ymm9
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7]
+; AVX512BW-FCP-NEXT: vmovdqu8 %ymm8, %ymm6 {%k4}
+; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm8 {%k1}
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[2,9],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpor %xmm9, %xmm8, %xmm8
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,0,18,11,4,21,14,7,0,25,0,0,28,0,0,0]
+; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm9, %zmm9
; AVX512BW-FCP-NEXT: movl $261632, %r11d # imm = 0x3FE00
; AVX512BW-FCP-NEXT: kmovd %r11d, %k4
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm11 {%k4} = ymm10[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm7, %ymm10 {%k5}
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm10, %xmm12
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[1,8,15,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,0,7,14],zero,zero,xmm10[3,10],zero,zero,zero,xmm10[u,u,u,u]
-; AVX512BW-FCP-NEXT: vpor %xmm12, %xmm10, %xmm10
-; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,0,0,0,1,3,5,6]
-; AVX512BW-FCP-NEXT: vpermd %ymm13, %ymm12, %ymm12
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm12[7]
-; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1,2,3,4,5,6,7],ymm11[8],ymm10[9,10,11,12,13,14,15]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
-; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm7, %ymm11 {%k2}
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[2,9,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,1,8,15],zero,zero,xmm11[4,11],zero,zero,xmm11[u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11
-; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm13
-; AVX512BW-FCP-NEXT: vmovdqa 208(%rdi), %xmm11
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[5,12]
-; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %xmm12
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
-; AVX512BW-FCP-NEXT: vpor %xmm14, %xmm15, %xmm14
-; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 {%k4} = ymm9[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm5, %ymm9 {%k5}
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,zero,xmm10[5,12],zero,zero,xmm10[1,8,15,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,0,7,14],zero,zero,xmm9[3,10],zero,zero,zero,xmm9[u,u,u,u]
+; AVX512BW-FCP-NEXT: vpor %xmm10, %xmm9, %xmm9
+; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,1,3,5,6]
+; AVX512BW-FCP-NEXT: vpermd %ymm7, %ymm10, %ymm7
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5,6],ymm7[7]
+; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
+; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm5, %ymm8 {%k2}
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u],zero,zero,zero,xmm9[6,13],zero,zero,xmm9[2,9,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,1,8,15],zero,zero,xmm8[4,11],zero,zero,xmm8[u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpor %xmm9, %xmm8, %xmm8
+; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm9
+; AVX512BW-FCP-NEXT: vmovdqa 208(%rdi), %xmm8
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[5,12]
+; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %xmm10
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm10[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
+; AVX512BW-FCP-NEXT: vpor %xmm11, %xmm12, %xmm11
+; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
; AVX512BW-FCP-NEXT: movl $-134217728, %edi # imm = 0xF8000000
; AVX512BW-FCP-NEXT: kmovd %edi, %k5
-; AVX512BW-FCP-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5}
-; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm14 {%k3}
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[3,10],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm9[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7]
-; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm7, %ymm13 {%k1}
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm13
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[0,7,14],zero,zero,xmm13[3,10,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpor %xmm14, %xmm13, %xmm13
+; AVX512BW-FCP-NEXT: vmovdqu8 %ymm11, %ymm9 {%k5}
+; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm11 {%k3}
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[3,10],zero,zero,zero,xmm11[6,13],zero,zero,xmm11[u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm11
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[1,8,15],zero,zero,xmm11[4,11,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [8,1,18,11,4,0,22,15,0,25,0,0,0,29,0,0]
+; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm12, %zmm12
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm11 {%k4} = ymm12[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0],ymm9[1,2,3,4,5,6,7],ymm11[8],ymm9[9,10,11,12,13,14,15]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7]
+; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm5, %ymm11 {%k1}
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[u,u,2,9],zero,zero,zero,xmm11[5,12],zero,zero,xmm11[u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm11
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,xmm11[0,7,14],zero,zero,xmm11[3,10,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11
+; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[6,13]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm10[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
+; AVX512BW-FCP-NEXT: vpor %xmm12, %xmm13, %xmm12
+; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512BW-FCP-NEXT: vmovdqu8 %ymm12, %ymm11 {%k5}
+; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm12 {%k2}
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[2,9],zero,zero,zero,xmm13[5,12,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[4,11],zero,zero,xmm12[0,7,14],zero,zero,xmm12[u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpor %xmm13, %xmm12, %xmm12
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [8,1,0,19,12,5,22,15,0,0,26,0,0,29,0,0]
+; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm13, %zmm13
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm12 {%k4} = ymm13[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1,2,3,4,5,6,7],ymm12[8],ymm11[9,10,11,12,13,14,15]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7]
+; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm5, %ymm12 {%k3}
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[u,u,3,10],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm12, %xmm12
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,xmm12[1,8,15],zero,zero,xmm12[4,11,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpor %xmm13, %xmm12, %xmm12
+; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm10[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm8[0,7,14]
+; AVX512BW-FCP-NEXT: vpor %xmm13, %xmm14, %xmm13
; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[6,13]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
-; AVX512BW-FCP-NEXT: vpor %xmm14, %xmm15, %xmm14
-; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512BW-FCP-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5}
-; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm14 {%k2}
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm5[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm5[4,5,6,7]
-; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm7, %ymm13 {%k3}
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,3,10],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm13
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[1,8,15],zero,zero,xmm13[4,11,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vmovdqu8 %ymm13, %ymm12 {%k5}
+; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm13 {%k1}
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm14
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[3,10],zero,zero,zero,xmm14[6,13,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[5,12],zero,zero,xmm13[1,8,15],zero,zero,xmm13[u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vpor %xmm14, %xmm13, %xmm13
-; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm11[0,7,14]
-; AVX512BW-FCP-NEXT: vpor %xmm14, %xmm15, %xmm14
-; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512BW-FCP-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5}
-; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm14 {%k1}
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[5,12],zero,zero,xmm14[1,8,15],zero,zero,xmm14[u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm4[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm4[4,5,6,7]
-; AVX512BW-FCP-NEXT: vmovdqu16 %ymm7, %ymm6 {%k2}
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6
-; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm12[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm11[1,8,15]
-; AVX512BW-FCP-NEXT: vpor %xmm7, %xmm11, %xmm7
-; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512BW-FCP-NEXT: vmovdqu8 %ymm7, %ymm6 {%k5}
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [16,9,2,19,12,5,0,23,0,0,26,0,0,0,30,0]
+; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm14, %zmm14
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm13 {%k4} = ymm14[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4,5,6,7],ymm13[8],ymm12[9,10,11,12,13,14,15]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7]
+; AVX512BW-FCP-NEXT: vmovdqu16 %ymm5, %ymm4 {%k2}
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,xmm5[2,9],zero,zero,zero,xmm5[5,12,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,4,11],zero,zero,xmm4[0,7,14],zero,zero,xmm4[u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4
+; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm10[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm8[1,8,15]
+; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm8, %xmm5
+; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512BW-FCP-NEXT: vmovdqu8 %ymm5, %ymm4 {%k5}
; AVX512BW-FCP-NEXT: vmovdqu16 %ymm3, %ymm2 {%k3}
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [16,9,2,0,20,13,6,23,0,0,0,27,0,0,30,0]
+; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm3, %zmm0
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 {%k4} = ymm0[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm6[1,2,3,4,5,6,7],ymm2[8],ymm6[9,10,11,12,13,14,15]
+; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm4[1,2,3,4,5,6,7],ymm2[8],ymm4[9,10,11,12,13,14,15]
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX512BW-FCP-NEXT: vmovdqa %ymm1, (%rsi)
-; AVX512BW-FCP-NEXT: vmovdqa %ymm8, (%rdx)
-; AVX512BW-FCP-NEXT: vmovdqa %ymm10, (%rcx)
+; AVX512BW-FCP-NEXT: vmovdqa %ymm6, (%rdx)
+; AVX512BW-FCP-NEXT: vmovdqa %ymm7, (%rcx)
; AVX512BW-FCP-NEXT: vmovdqa %ymm9, (%r8)
-; AVX512BW-FCP-NEXT: vmovdqa %ymm5, (%r9)
-; AVX512BW-FCP-NEXT: vmovdqa %ymm4, (%r10)
+; AVX512BW-FCP-NEXT: vmovdqa %ymm11, (%r9)
+; AVX512BW-FCP-NEXT: vmovdqa %ymm12, (%r10)
; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rax)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
@@ -6813,186 +6849,194 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [16,9,2,19,20,13,6,23,24,0,26,27,28,0,30,31]
-; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1
-; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
-; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,9,2,19,12,5,22,23,24,0,26,27,0,29,30,31]
-; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm4
-; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [8,1,2,19,12,5,22,15,0,9,26,11,0,29,14,0]
-; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm5
-; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [8,1,18,11,4,5,22,15,0,25,10,0,12,29,14,0]
-; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm11
-; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,1,18,11,4,21,14,7,8,25,10,0,28,13,0,15]
-; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm12
-; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,17,10,3,4,21,14,7,24,9,0,11,28,13,0,31]
-; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm10
-; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,17,10,3,20,13,6,23,24,25,0,27,28,0,30,31]
-; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm6
-; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm3
-; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm2
+; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm2
+; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %ymm4
+; AVX512DQ-BW-NEXT: vmovdqa 96(%rdi), %ymm3
; AVX512DQ-BW-NEXT: movw $-28382, %r11w # imm = 0x9122
; AVX512DQ-BW-NEXT: kmovd %r11d, %k5
-; AVX512DQ-BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm1 {%k5}
-; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm1, %xmm7
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm1, %xmm1
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: movw $992, %r11w # imm = 0x3E0
+; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm2, %ymm0 {%k5}
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm0, %xmm5
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpor %xmm5, %xmm0, %xmm0
+; AVX512DQ-BW-NEXT: movw $9288, %r11w # imm = 0x2448
; AVX512DQ-BW-NEXT: kmovd %r11d, %k1
-; AVX512DQ-BW-NEXT: vmovdqu16 %ymm6, %ymm1 {%k1}
-; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm7
-; AVX512DQ-BW-NEXT: vmovdqa 160(%rdi), %ymm6
+; AVX512DQ-BW-NEXT: vpblendmw %ymm4, %ymm3, %ymm5 {%k1}
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1]
+; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: movw $992, %r11w # imm = 0x3E0
+; AVX512DQ-BW-NEXT: kmovd %r11d, %k2
+; AVX512DQ-BW-NEXT: vmovdqu16 %ymm5, %ymm0 {%k2}
+; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm6
+; AVX512DQ-BW-NEXT: vmovdqa 160(%rdi), %ymm5
; AVX512DQ-BW-NEXT: movw $8772, %r11w # imm = 0x2244
-; AVX512DQ-BW-NEXT: kmovd %r11d, %k1
-; AVX512DQ-BW-NEXT: vpblendmw %ymm7, %ymm6, %ymm8 {%k1}
-; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm8, %xmm9
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u],zero,zero,xmm9[3,10],zero,zero,zero,xmm9[6,13,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,5,12],zero,zero,xmm8[1,8,15],zero,zero,xmm8[u,u,u,u]
-; AVX512DQ-BW-NEXT: vpor %xmm9, %xmm8, %xmm8
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm13
-; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %xmm8
-; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm14 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
-; AVX512DQ-BW-NEXT: vpshufb %xmm14, %xmm8, %xmm15
-; AVX512DQ-BW-NEXT: vmovdqa 208(%rdi), %xmm9
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm9[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm16[0],xmm15[1],xmm16[1],xmm15[2],xmm16[2],xmm15[3],xmm16[3]
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7]
+; AVX512DQ-BW-NEXT: kmovd %r11d, %k2
+; AVX512DQ-BW-NEXT: vpblendmw %ymm6, %ymm5, %ymm7 {%k2}
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm7, %xmm8
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u],zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,5,12],zero,zero,xmm7[1,8,15],zero,zero,xmm7[u,u,u,u]
+; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm8, %xmm7
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm9
+; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %xmm7
+; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
+; AVX512DQ-BW-NEXT: vpshufb %xmm10, %xmm7, %xmm11
+; AVX512DQ-BW-NEXT: vmovdqa 208(%rdi), %xmm8
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm11[7]
; AVX512DQ-BW-NEXT: movl $-524288, %edi # imm = 0xFFF80000
; AVX512DQ-BW-NEXT: kmovd %edi, %k4
-; AVX512DQ-BW-NEXT: vmovdqu8 %ymm13, %ymm1 {%k4}
+; AVX512DQ-BW-NEXT: vmovdqu8 %ymm9, %ymm0 {%k4}
; AVX512DQ-BW-NEXT: movw $4644, %di # imm = 0x1224
-; AVX512DQ-BW-NEXT: kmovd %edi, %k2
-; AVX512DQ-BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm13 {%k2}
-; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm13, %xmm15
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpor %xmm15, %xmm13, %xmm13
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: movl $511, %edi # imm = 0x1FF
; AVX512DQ-BW-NEXT: kmovd %edi, %k3
-; AVX512DQ-BW-NEXT: vmovdqu8 %ymm13, %ymm10 {%k3}
-; AVX512DQ-BW-NEXT: movw $9288, %di # imm = 0x2448
-; AVX512DQ-BW-NEXT: kmovd %edi, %k3
-; AVX512DQ-BW-NEXT: vpblendmw %ymm7, %ymm6, %ymm13 {%k3}
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm13[u,u,u,6,13],zero,zero,xmm13[2,9],zero,zero,zero,xmm13[u,u,u,u]
-; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm13, %xmm13
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u],zero,zero,xmm13[4,11],zero,zero,xmm13[0,7,14,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpor %xmm15, %xmm13, %xmm13
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm9[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm8[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3]
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7]
-; AVX512DQ-BW-NEXT: vmovdqu8 %ymm13, %ymm10 {%k4}
-; AVX512DQ-BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm13 {%k1}
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm13[2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm13, %xmm13
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[0,7,14],zero,zero,xmm13[3,10,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpor %xmm15, %xmm13, %xmm13
+; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm2, %ymm9 {%k3}
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm9, %xmm11
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm11[6,13],zero,zero,xmm11[2,9,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[1,8,15],zero,zero,xmm9[4,11],zero,zero,xmm9[u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpor %xmm11, %xmm9, %xmm11
+; AVX512DQ-BW-NEXT: vpblendmw %ymm3, %ymm4, %ymm9 {%k5}
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm9, %xmm12
+; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm12[2],ymm9[3,4,5],ymm12[6],ymm9[7,8,9],ymm12[10],ymm9[11,12,13],ymm12[14],ymm9[15]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: movl $511, %edi # imm = 0x1FF
+; AVX512DQ-BW-NEXT: kmovd %edi, %k6
+; AVX512DQ-BW-NEXT: vmovdqu8 %ymm11, %ymm9 {%k6}
+; AVX512DQ-BW-NEXT: vpblendmw %ymm6, %ymm5, %ymm11 {%k1}
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[u,u,u,6,13],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[u,u,u,u]
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm11, %xmm11
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u],zero,zero,xmm11[4,11],zero,zero,xmm11[0,7,14,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpor %xmm12, %xmm11, %xmm11
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm7[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7]
+; AVX512DQ-BW-NEXT: vmovdqu8 %ymm11, %ymm9 {%k4}
+; AVX512DQ-BW-NEXT: vpblendmw %ymm3, %ymm4, %ymm11 {%k3}
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm11, %xmm12
+; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7,8,9,10],ymm12[11],ymm11[12,13],ymm12[14],ymm11[15]
+; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm2, %ymm12 {%k2}
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm12, %xmm12
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[0,7,14],zero,zero,xmm12[3,10,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpor %xmm13, %xmm12, %xmm12
; AVX512DQ-BW-NEXT: movl $261632, %edi # imm = 0x3FE00
; AVX512DQ-BW-NEXT: kmovd %edi, %k4
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm13 {%k4} = ymm12[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpblendmw %ymm6, %ymm7, %ymm12 {%k5}
-; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm12, %xmm15
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[5,12],zero,zero,xmm15[1,8,15,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,0,7,14],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[u,u,u,u]
-; AVX512DQ-BW-NEXT: vpor %xmm15, %xmm12, %xmm12
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX512DQ-BW-NEXT: vpshufb %xmm14, %xmm9, %xmm14
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm14[7]
-; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4,5,6,7],ymm13[8],ymm12[9,10,11,12,13,14,15]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7]
-; AVX512DQ-BW-NEXT: vpblendmw %ymm6, %ymm7, %ymm13 {%k2}
-; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm13, %xmm14
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[2,9,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpor %xmm14, %xmm13, %xmm13
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm12 {%k4} = ymm11[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpblendmw %ymm5, %ymm6, %ymm11 {%k5}
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm11, %xmm13
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[1,8,15,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,0,7,14],zero,zero,xmm11[3,10],zero,zero,zero,xmm11[u,u,u,u]
+; AVX512DQ-BW-NEXT: vpor %xmm13, %xmm11, %xmm11
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512DQ-BW-NEXT: vpshufb %xmm10, %xmm8, %xmm10
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm7[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3]
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5,6],ymm10[7]
+; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0],ymm10[1,2,3,4,5,6,7],ymm12[8],ymm10[9,10,11,12,13,14,15]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5,6,7]
+; AVX512DQ-BW-NEXT: vpblendmw %ymm3, %ymm4, %ymm11 {%k2}
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm11, %xmm12
+; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1,2],ymm12[3],ymm11[4,5,6],ymm12[7,8],ymm11[9,10],ymm12[11],ymm11[12,13,14],ymm12[15]
+; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm2, %ymm12 {%k1}
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[3,10],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm12, %xmm12
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[1,8,15],zero,zero,xmm12[4,11,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpor %xmm13, %xmm12, %xmm12
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm12 {%k4} = ymm11[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpblendmw %ymm5, %ymm6, %ymm11 {%k3}
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm11, %xmm13
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[2,9,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,1,8,15],zero,zero,xmm11[4,11],zero,zero,xmm11[u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpor %xmm13, %xmm11, %xmm11
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[5,12]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm7[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
+; AVX512DQ-BW-NEXT: vpor %xmm13, %xmm14, %xmm13
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[5,12]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
-; AVX512DQ-BW-NEXT: vpor %xmm14, %xmm15, %xmm14
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
; AVX512DQ-BW-NEXT: movl $-134217728, %edi # imm = 0xF8000000
; AVX512DQ-BW-NEXT: kmovd %edi, %k5
-; AVX512DQ-BW-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5}
-; AVX512DQ-BW-NEXT: vpblendmw %ymm2, %ymm3, %ymm14 {%k3}
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[3,10],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm14, %xmm14
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpor %xmm15, %xmm14, %xmm14
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm11[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3],ymm11[4,5,6,7]
-; AVX512DQ-BW-NEXT: vpblendmw %ymm6, %ymm7, %ymm13 {%k1}
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm13, %xmm13
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[0,7,14],zero,zero,xmm13[3,10,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vmovdqu8 %ymm13, %ymm11 {%k5}
+; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1,2,3,4,5,6,7],ymm12[8],ymm11[9,10,11,12,13,14,15]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQ-BW-NEXT: vpblendmw %ymm3, %ymm4, %ymm12 {%k1}
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm12, %xmm13
+; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3],ymm13[4],ymm12[5,6],ymm13[7,8],ymm12[9,10,11],ymm13[12],ymm12[13,14],ymm13[15]
+; AVX512DQ-BW-NEXT: vpblendmw %ymm2, %ymm1, %ymm13 {%k3}
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm13, %xmm14
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[2,9],zero,zero,zero,xmm14[5,12,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[4,11],zero,zero,xmm13[0,7,14],zero,zero,xmm13[u,u,u,u,u,u,u]
; AVX512DQ-BW-NEXT: vpor %xmm14, %xmm13, %xmm13
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[6,13]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm13 {%k4} = ymm12[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpblendmw %ymm5, %ymm6, %ymm12 {%k2}
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm12, %xmm12
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,xmm12[0,7,14],zero,zero,xmm12[3,10,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpor %xmm14, %xmm12, %xmm12
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[6,13]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
; AVX512DQ-BW-NEXT: vpor %xmm14, %xmm15, %xmm14
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512DQ-BW-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5}
-; AVX512DQ-BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm14 {%k2}
+; AVX512DQ-BW-NEXT: vmovdqu8 %ymm14, %ymm12 {%k5}
+; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4,5,6,7],ymm13[8],ymm12[9,10,11,12,13,14,15]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQ-BW-NEXT: vpblendmw %ymm4, %ymm3, %ymm13 {%k3}
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm13, %xmm14
+; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5,6,7,8],ymm14[9],ymm13[10,11],ymm14[12],ymm13[13,14,15]
+; AVX512DQ-BW-NEXT: vpblendmw %ymm2, %ymm1, %ymm14 {%k2}
; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm14, %xmm15
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[5,12],zero,zero,xmm14[1,8,15],zero,zero,xmm14[u,u,u,u,u,u,u]
; AVX512DQ-BW-NEXT: vpor %xmm15, %xmm14, %xmm14
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm5[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm5 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm5[4,5,6,7]
-; AVX512DQ-BW-NEXT: vpblendmw %ymm6, %ymm7, %ymm13 {%k3}
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,3,10],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm13[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpblendmw %ymm5, %ymm6, %ymm13 {%k1}
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm13[u,u,3,10],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[u,u,u,u,u]
; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm13, %xmm13
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[1,8,15],zero,zero,xmm13[4,11,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpor %xmm14, %xmm13, %xmm13
+; AVX512DQ-BW-NEXT: vpor %xmm15, %xmm13, %xmm13
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[0,7,14]
-; AVX512DQ-BW-NEXT: vpor %xmm14, %xmm15, %xmm14
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512DQ-BW-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5}
-; AVX512DQ-BW-NEXT: vpblendmw %ymm3, %ymm2, %ymm14 {%k1}
-; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm14, %xmm15
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[5,12],zero,zero,xmm14[1,8,15],zero,zero,xmm14[u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpor %xmm15, %xmm14, %xmm14
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm4[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-BW-NEXT: vmovdqu16 %ymm7, %ymm6 {%k2}
-; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm6, %xmm7
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm6, %xmm6
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[1,8,15]
-; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm8, %xmm7
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512DQ-BW-NEXT: vmovdqu8 %ymm7, %ymm6 {%k5}
-; AVX512DQ-BW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k3}
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm2
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm2 {%k4} = ymm0[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm6[1,2,3,4,5,6,7],ymm2[8],ymm6[9,10,11,12,13,14,15]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-BW-NEXT: vmovdqa %ymm1, (%rsi)
-; AVX512DQ-BW-NEXT: vmovdqa %ymm10, (%rdx)
-; AVX512DQ-BW-NEXT: vmovdqa %ymm12, (%rcx)
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm16 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm8[0,7,14]
+; AVX512DQ-BW-NEXT: vporq %xmm15, %xmm16, %xmm15
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512DQ-BW-NEXT: vmovdqu8 %ymm15, %ymm13 {%k5}
+; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7]
+; AVX512DQ-BW-NEXT: vmovdqu16 %ymm4, %ymm3 {%k2}
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15]
+; AVX512DQ-BW-NEXT: vmovdqu16 %ymm2, %ymm1 {%k1}
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpor %xmm2, %xmm1, %xmm1
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm1 {%k4} = ymm3[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vmovdqu16 %ymm6, %ymm5 {%k3}
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm5, %xmm2
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[u,u,4,11],zero,zero,xmm5[0,7,14],zero,zero,xmm5[u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm8[1,8,15]
+; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512DQ-BW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k5}
+; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rsi)
+; AVX512DQ-BW-NEXT: vmovdqa %ymm9, (%rdx)
+; AVX512DQ-BW-NEXT: vmovdqa %ymm10, (%rcx)
; AVX512DQ-BW-NEXT: vmovdqa %ymm11, (%r8)
-; AVX512DQ-BW-NEXT: vmovdqa %ymm5, (%r9)
-; AVX512DQ-BW-NEXT: vmovdqa %ymm4, (%r10)
-; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rax)
+; AVX512DQ-BW-NEXT: vmovdqa %ymm12, (%r9)
+; AVX512DQ-BW-NEXT: vmovdqa %ymm13, (%r10)
+; AVX512DQ-BW-NEXT: vmovdqa %ymm1, (%rax)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
@@ -7000,182 +7044,182 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm4
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm5
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm9
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm10
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm8
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm6
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm3
; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2
; AVX512DQ-BW-FCP-NEXT: movw $-28382, %r11w # imm = 0x9122
; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k5
; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm1 {%k5}
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm7
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm1, %xmm1
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm4, %xmm1, %xmm1
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,17,10,3,20,13,6,0,24,0,0,27,0,0,0,31]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm4, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: movw $992, %r11w # imm = 0x3E0
; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm6, %ymm1 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm7
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm6
+; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm4, %ymm1 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm5
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm4
; AVX512DQ-BW-FCP-NEXT: movw $8772, %r11w # imm = 0x2244
; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k1
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm7, %ymm6, %ymm11 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[6,13,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,5,12],zero,zero,xmm11[1,8,15],zero,zero,xmm11[u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,0,0,0,1,2,4,6]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm13
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm13, %ymm12, %ymm12
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7]
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm4, %ymm6 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u],zero,zero,xmm7[3,10],zero,zero,zero,xmm7[6,13,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,5,12],zero,zero,xmm6[1,8,15],zero,zero,xmm6[u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,1,2,4,6]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm7
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm7, %ymm8, %ymm8
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7]
; AVX512DQ-BW-FCP-NEXT: movl $-524288, %r11d # imm = 0xFFF80000
; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k4
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm11, %ymm1 {%k4}
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm6, %ymm1 {%k4}
; AVX512DQ-BW-FCP-NEXT: movw $4644, %r11w # imm = 0x1224
; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k2
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm11 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm12[6,13],zero,zero,xmm12[2,9,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[1,8,15],zero,zero,xmm11[4,11],zero,zero,xmm11[u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm6 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm8
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[6,13],zero,zero,xmm8[2,9,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[1,8,15],zero,zero,xmm6[4,11],zero,zero,xmm6[u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm8, %xmm8
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,17,10,3,0,21,14,7,24,0,0,0,28,0,0,31]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm6, %zmm6
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: movl $511, %r11d # imm = 0x1FF
; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k3
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm11, %ymm8 {%k3}
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm8, %ymm6 {%k3}
; AVX512DQ-BW-FCP-NEXT: movw $9288, %r11w # imm = 0x2448
; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k3
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm7, %ymm6, %ymm11 {%k3}
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[u,u,u,6,13],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm11
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u],zero,zero,xmm11[4,11],zero,zero,xmm11[0,7,14,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,0,0,0,1,3,4,6]
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm13, %ymm12, %ymm12
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm12[7]
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm11, %ymm8 {%k4}
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm11 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[2,9],zero,zero,zero,xmm11[5,12],zero,zero,xmm11[u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm11
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[0,7,14],zero,zero,xmm11[3,10,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm4, %ymm8 {%k3}
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[u,u,u,6,13],zero,zero,xmm8[2,9],zero,zero,zero,xmm8[u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u],zero,zero,xmm8[4,11],zero,zero,xmm8[0,7,14,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm9, %xmm8, %xmm8
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,1,3,4,6]
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm7, %ymm9, %ymm9
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7]
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm8, %ymm6 {%k4}
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm8 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[2,9],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm9, %xmm8, %xmm8
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,0,18,11,4,21,14,7,0,25,0,0,28,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm9, %zmm9
; AVX512DQ-BW-FCP-NEXT: movl $261632, %r11d # imm = 0x3FE00
; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k4
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm11 {%k4} = ymm10[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm7, %ymm10 {%k5}
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm10, %xmm12
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[1,8,15,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,0,7,14],zero,zero,xmm10[3,10],zero,zero,zero,xmm10[u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm12, %xmm10, %xmm10
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,0,0,0,1,3,5,6]
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm13, %ymm12, %ymm12
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm12[7]
-; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1,2,3,4,5,6,7],ymm11[8],ymm10[9,10,11,12,13,14,15]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm7, %ymm11 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[2,9,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,1,8,15],zero,zero,xmm11[4,11],zero,zero,xmm11[u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm13
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 208(%rdi), %xmm11
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[5,12]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %xmm12
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm14, %xmm15, %xmm14
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 {%k4} = ymm9[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm4, %ymm5, %ymm9 {%k5}
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,zero,xmm10[5,12],zero,zero,xmm10[1,8,15,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,0,7,14],zero,zero,xmm9[3,10],zero,zero,zero,xmm9[u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm10, %xmm9, %xmm9
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,1,3,5,6]
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm7, %ymm10, %ymm7
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5,6],ymm7[7]
+; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm4, %ymm5, %ymm8 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u],zero,zero,zero,xmm9[6,13],zero,zero,xmm9[2,9,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,1,8,15],zero,zero,xmm8[4,11],zero,zero,xmm8[u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm9, %xmm8, %xmm8
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm9
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 208(%rdi), %xmm8
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[5,12]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %xmm10
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm10[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm11, %xmm12, %xmm11
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
; AVX512DQ-BW-FCP-NEXT: movl $-134217728, %edi # imm = 0xF8000000
; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k5
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5}
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm14 {%k3}
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[3,10],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm9[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm7, %ymm13 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm13
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[0,7,14],zero,zero,xmm13[3,10,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm14, %xmm13, %xmm13
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm11, %ymm9 {%k5}
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm3, %ymm11 {%k3}
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[3,10],zero,zero,zero,xmm11[6,13],zero,zero,xmm11[u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm11
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[1,8,15],zero,zero,xmm11[4,11,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [8,1,18,11,4,0,22,15,0,25,0,0,0,29,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm12, %zmm12
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm11 {%k4} = ymm12[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0],ymm9[1,2,3,4,5,6,7],ymm11[8],ymm9[9,10,11,12,13,14,15]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm4, %ymm5, %ymm11 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[u,u,2,9],zero,zero,zero,xmm11[5,12],zero,zero,xmm11[u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm11
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,xmm11[0,7,14],zero,zero,xmm11[3,10,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm12, %xmm11, %xmm11
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[6,13]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm10[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm12, %xmm13, %xmm12
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm12, %ymm11 {%k5}
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm12 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[2,9],zero,zero,zero,xmm13[5,12,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[4,11],zero,zero,xmm12[0,7,14],zero,zero,xmm12[u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm13, %xmm12, %xmm12
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [8,1,0,19,12,5,22,15,0,0,26,0,0,29,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm13, %zmm13
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm12 {%k4} = ymm13[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1,2,3,4,5,6,7],ymm12[8],ymm11[9,10,11,12,13,14,15]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm4, %ymm5, %ymm12 {%k3}
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm12[u,u,3,10],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm12, %xmm12
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u],zero,zero,xmm12[1,8,15],zero,zero,xmm12[4,11,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm13, %xmm12, %xmm12
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm10[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm8[0,7,14]
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm13, %xmm14, %xmm13
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[6,13]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm12[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm14, %xmm15, %xmm14
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5}
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm14 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm5[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm5[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm7, %ymm13 {%k3}
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[u,u,3,10],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm13
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,xmm13[1,8,15],zero,zero,xmm13[4,11,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm13, %ymm12 {%k5}
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm13 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm14
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[3,10],zero,zero,zero,xmm14[6,13,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[5,12],zero,zero,xmm13[1,8,15],zero,zero,xmm13[u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpor %xmm14, %xmm13, %xmm13
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm12[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm11[0,7,14]
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm14, %xmm15, %xmm14
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm14, %ymm13 {%k5}
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm2, %ymm14 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[5,12],zero,zero,xmm14[1,8,15],zero,zero,xmm14[u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k4} = ymm4[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0],ymm13[1,2,3,4,5,6,7],ymm14[8],ymm13[9,10,11,12,13,14,15]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3],ymm4[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm7, %ymm6 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm12[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm11[1,8,15]
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm7, %xmm11, %xmm7
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm7, %ymm6 {%k5}
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [16,9,2,19,12,5,0,23,0,0,26,0,0,0,30,0]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm14, %zmm14
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm13 {%k4} = ymm14[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4,5,6,7],ymm13[8],ymm12[9,10,11,12,13,14,15]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm5, %ymm4 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,xmm5[2,9],zero,zero,zero,xmm5[5,12,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,4,11],zero,zero,xmm4[0,7,14],zero,zero,xmm4[u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm10[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm8[1,8,15]
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm8, %xmm5
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm5, %ymm4 {%k5}
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm3, %ymm2 {%k3}
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [16,9,2,0,20,13,6,23,0,0,0,27,0,0,30,0]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm3, %zmm0
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 {%k4} = ymm0[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm6[1,2,3,4,5,6,7],ymm2[8],ymm6[9,10,11,12,13,14,15]
+; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm4[1,2,3,4,5,6,7],ymm2[8],ymm4[9,10,11,12,13,14,15]
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm1, (%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm8, (%rdx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm10, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm6, (%rdx)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm7, (%rcx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm9, (%r8)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm5, (%r9)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm4, (%r10)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm11, (%r9)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm12, (%r10)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
@@ -11698,1635 +11742,1628 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;
; AVX512-LABEL: load_i8_stride7_vf64:
; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
-; AVX512-NEXT: vmovdqa (%rdi), %ymm12
-; AVX512-NEXT: vmovdqa 32(%rdi), %ymm13
-; AVX512-NEXT: vmovdqa64 64(%rdi), %ymm31
-; AVX512-NEXT: vmovdqa %ymm0, %ymm1
-; AVX512-NEXT: vmovdqa64 %ymm0, %ymm24
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm13 ^ (ymm1 & (ymm12 ^ ymm13))
+; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm24 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
+; AVX512-NEXT: vmovdqa64 (%rdi), %ymm18
+; AVX512-NEXT: vmovdqa 32(%rdi), %ymm12
+; AVX512-NEXT: vmovdqa 64(%rdi), %ymm13
+; AVX512-NEXT: vmovdqa64 %ymm24, %ymm1
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm12 ^ (ymm1 & (ymm18 ^ ymm12))
; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[5,12],zero,zero,xmm2[1,8,15,u,u,u,u,u,u]
; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u]
; AVX512-NEXT: vpor %xmm2, %xmm1, %xmm1
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512-NEXT: vmovdqa64 96(%rdi), %ymm19
-; AVX512-NEXT: vmovdqa %ymm9, %ymm2
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm31 ^ (ymm2 & (ymm19 ^ ymm31))
-; AVX512-NEXT: vmovdqa 80(%rdi), %xmm11
-; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm11[2],ymm2[3,4],ymm11[5],ymm2[6,7,8,9],ymm11[10],ymm2[11,12],ymm11[13],ymm2[14,15]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm1 & mem)
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm14 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535]
-; AVX512-NEXT: vmovdqa64 128(%rdi), %ymm21
-; AVX512-NEXT: vmovdqa64 160(%rdi), %ymm29
-; AVX512-NEXT: vmovdqa %ymm14, %ymm1
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm21 ^ (ymm1 & (ymm29 ^ ymm21))
-; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm3
-; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u,u,u]
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512-NEXT: vmovdqa 96(%rdi), %ymm10
+; AVX512-NEXT: vmovdqa %ymm8, %ymm2
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm13 ^ (ymm2 & (ymm10 ^ ymm13))
+; AVX512-NEXT: vmovdqa 80(%rdi), %xmm7
+; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7,8,9],ymm7[10],ymm2[11,12],ymm7[13],ymm2[14,15]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 | (ymm1 & mem)
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535]
+; AVX512-NEXT: vmovdqa64 128(%rdi), %ymm30
+; AVX512-NEXT: vmovdqa64 160(%rdi), %ymm31
+; AVX512-NEXT: vmovdqa %ymm9, %ymm1
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm30 ^ (ymm1 & (ymm31 ^ ymm30))
+; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[6,13,u,u,u,u]
; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u]
-; AVX512-NEXT: vpor %xmm3, %xmm1, %xmm1
+; AVX512-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512-NEXT: vmovdqa 192(%rdi), %xmm0
-; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11]
-; AVX512-NEXT: vpshufb %xmm4, %xmm0, %xmm3
-; AVX512-NEXT: vmovdqa64 %xmm4, %xmm27
-; AVX512-NEXT: vmovdqa64 %xmm0, %xmm20
-; AVX512-NEXT: vmovdqa 208(%rdi), %xmm10
-; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm10[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vmovdqa64 %xmm0, %xmm25
+; AVX512-NEXT: vmovdqa 208(%rdi), %xmm4
+; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
+; AVX512-NEXT: vmovdqa 240(%rdi), %xmm0
+; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm0[5,12,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vmovdqa 224(%rdi), %xmm2
+; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[0,7,14],zero,zero,xmm2[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vmovdqa64 %xmm2, %xmm22
+; AVX512-NEXT: vpor %xmm11, %xmm14, %xmm11
+; AVX512-NEXT: vinserti32x4 $2, %xmm11, %zmm1, %zmm11
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm26 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm26 & (zmm11 ^ zmm5))
+; AVX512-NEXT: vmovdqa64 288(%rdi), %ymm19
+; AVX512-NEXT: vmovdqa64 256(%rdi), %ymm17
+; AVX512-NEXT: vmovdqa %ymm8, %ymm5
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm19 ^ (ymm5 & (ymm17 ^ ymm19))
+; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[u,u,u,u,u,3,10],zero,zero,zero,xmm5[6,13],zero,zero,xmm5[u,u]
+; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm5
+; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u],zero,zero,xmm5[1,8,15],zero,zero,xmm5[4,11,u,u]
+; AVX512-NEXT: vpor %xmm5, %xmm14, %xmm14
+; AVX512-NEXT: vmovdqa 352(%rdi), %ymm6
+; AVX512-NEXT: vmovdqa64 320(%rdi), %ymm16
+; AVX512-NEXT: vmovdqa %ymm9, %ymm15
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm15 = ymm6 ^ (ymm15 & (ymm16 ^ ymm6))
+; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm15[2,3,0,1]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm15[0,1],ymm2[2],ymm15[3,4,5],ymm2[6],ymm15[7,8,9],ymm2[10],ymm15[11,12,13],ymm2[14],ymm15[15]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm27 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535]
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm14 & ymm27)
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm15 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
+; AVX512-NEXT: vmovdqa64 416(%rdi), %ymm20
+; AVX512-NEXT: vmovdqa64 384(%rdi), %ymm21
+; AVX512-NEXT: vmovdqa %ymm15, %ymm14
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm14 = ymm20 ^ (ymm14 & (ymm21 ^ ymm20))
+; AVX512-NEXT: vextracti128 $1, %ymm14, %xmm3
+; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm14[4,11],zero,zero
+; AVX512-NEXT: vpor %xmm3, %xmm14, %xmm3
; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7]
-; AVX512-NEXT: vmovdqa 240(%rdi), %xmm3
-; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm3[5,12,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vmovdqa 224(%rdi), %xmm6
-; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpor %xmm5, %xmm7, %xmm5
-; AVX512-NEXT: vinserti32x4 $2, %xmm5, %zmm1, %zmm22
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm4 & (zmm22 ^ zmm2))
-; AVX512-NEXT: vmovdqa64 288(%rdi), %ymm18
-; AVX512-NEXT: vmovdqa64 256(%rdi), %ymm16
-; AVX512-NEXT: vmovdqa %ymm9, %ymm2
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm18 ^ (ymm2 & (ymm16 ^ ymm18))
-; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u,u,u,u,u,3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u]
-; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2
-; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u]
-; AVX512-NEXT: vpor %xmm5, %xmm2, %xmm2
-; AVX512-NEXT: vmovdqa64 352(%rdi), %ymm17
-; AVX512-NEXT: vmovdqa64 320(%rdi), %ymm28
-; AVX512-NEXT: vmovdqa %ymm14, %ymm7
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = ymm17 ^ (ymm7 & (ymm28 ^ ymm17))
-; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm8[2],ymm7[3,4,5],ymm8[6],ymm7[7,8,9],ymm8[10],ymm7[11,12,13],ymm8[14],ymm7[15]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm23 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535]
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm2 & ymm23)
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
-; AVX512-NEXT: vmovdqa %ymm7, %ymm2
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm13 ^ (ymm2 & (ymm12 ^ ymm13))
-; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm15
-; AVX512-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u,u,u]
+; AVX512-NEXT: vmovdqa64 {{.*#+}} ymm23 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0]
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm23 & (ymm3 ^ ymm2))
+; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm2
+; AVX512-NEXT: vpmovsxwd {{.*#+}} zmm29 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0,0,0,0,0]
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm29 & (zmm2 ^ zmm11))
+; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa %ymm15, %ymm2
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm12 ^ (ymm2 & (ymm18 ^ ymm12))
+; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9,u,u,u,u,u,u,u]
; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u,u,u,u,u]
-; AVX512-NEXT: vpor %xmm2, %xmm15, %xmm2
-; AVX512-NEXT: vmovdqa64 %ymm24, %ymm15
-; AVX512-NEXT: vmovdqa64 %ymm24, %ymm5
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm15 = ymm19 ^ (ymm15 & (ymm31 ^ ymm19))
-; AVX512-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0,1],ymm11[2],ymm15[3,4,5],ymm11[6],ymm15[7,8,9],ymm11[10],ymm15[11,12,13],ymm11[14],ymm15[15]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm2 & ~mem)
-; AVX512-NEXT: vmovdqa %ymm9, %ymm2
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm21 ^ (ymm2 & (ymm29 ^ ymm21))
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u]
+; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512-NEXT: vmovdqa64 %ymm24, %ymm3
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm10 ^ (ymm3 & (ymm13 ^ ymm10))
+; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm7[2],ymm3[3,4,5],ymm7[6],ymm3[7,8,9],ymm7[10],ymm3[11,12,13],ymm7[14],ymm3[15]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm2 & ~mem)
+; AVX512-NEXT: vmovdqa %ymm8, %ymm2
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm30 ^ (ymm2 & (ymm31 ^ ymm30))
+; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm2[u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u]
; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2
; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u]
-; AVX512-NEXT: vpor %xmm0, %xmm2, %xmm0
-; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm3[6,13,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vmovdqa64 %xmm3, %xmm25
-; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm6[1,8,15],zero,zero,xmm6[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vmovdqa64 %xmm6, %xmm26
-; AVX512-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vmovdqa64 %xmm20, %xmm1
-; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
-; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7]
-; AVX512-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm24
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm24 = zmm24 ^ (zmm4 & (zmm24 ^ zmm15))
-; AVX512-NEXT: vmovdqa %ymm14, %ymm0
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm13 ^ (ymm0 & (ymm12 ^ ymm13))
-; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,9],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[u,u,u,u,u,u,u]
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[0,7,14],zero,zero,xmm0[3,10,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpor %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa %ymm7, %ymm2
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm19 ^ (ymm2 & (ymm31 ^ ymm19))
-; AVX512-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm11[3],ymm2[4,5],ymm11[6],ymm2[7,8,9,10],ymm11[11],ymm2[12,13],ymm11[14],ymm2[15]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vpmovsxdq {{.*#+}} ymm15 = [18446744073709551615,255,18446744073709486080,18446744073709551615]
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm0 & ymm15)
-; AVX512-NEXT: vmovdqa %ymm5, %ymm0
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm29 ^ (ymm0 & (ymm21 ^ ymm29))
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u,u,u]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u]
-; AVX512-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vmovdqa64 %xmm27, %xmm3
-; AVX512-NEXT: vpshufb %xmm3, %xmm10, %xmm3
-; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vmovdqa64 %xmm20, %xmm5
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
+; AVX512-NEXT: vpor %xmm2, %xmm11, %xmm2
+; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vmovdqa64 %xmm25, %xmm1
+; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm1[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3]
+; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm11[7]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm0[6,13,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vmovdqa64 %xmm22, %xmm5
+; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm5[1,8,15],zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpor %xmm11, %xmm14, %xmm11
+; AVX512-NEXT: vinserti32x4 $2, %xmm11, %zmm2, %zmm2
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm26 & (zmm2 ^ zmm3))
+; AVX512-NEXT: vmovdqa %ymm15, %ymm3
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm17 ^ (ymm3 & (ymm19 ^ ymm17))
+; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm11
+; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero,xmm3[u,u]
+; AVX512-NEXT: vpor %xmm3, %xmm11, %xmm3
+; AVX512-NEXT: vmovdqa %ymm8, %ymm11
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = ymm6 ^ (ymm11 & (ymm16 ^ ymm6))
+; AVX512-NEXT: vpermq {{.*#+}} ymm14 = ymm11[2,3,0,1]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm14[3],ymm11[4,5],ymm14[6],ymm11[7,8,9,10],ymm14[11],ymm11[12,13],ymm14[14],ymm11[15]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm3 & ymm27)
+; AVX512-NEXT: vmovdqa %ymm9, %ymm3
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm20 ^ (ymm3 & (ymm21 ^ ymm20))
+; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm3[5,12],zero,zero
+; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm3
+; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10]
+; AVX512-NEXT: vpor %xmm3, %xmm14, %xmm3
; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7]
-; AVX512-NEXT: vmovdqa64 %xmm26, %xmm1
-; AVX512-NEXT: vmovdqa64 %xmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vmovdqa64 %xmm25, %xmm4
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm23 & (ymm3 ^ ymm11))
+; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm26
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm26 = zmm26 ^ (zmm29 & (zmm26 ^ zmm2))
+; AVX512-NEXT: vmovdqa %ymm9, %ymm2
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm12 ^ (ymm2 & (ymm18 ^ ymm12))
+; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u,u,u]
+; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2
+; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512-NEXT: vmovdqa %ymm15, %ymm3
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm10 ^ (ymm3 & (ymm13 ^ ymm10))
+; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm7[3],ymm3[4,5],ymm7[6],ymm3[7,8,9,10],ymm7[11],ymm3[12,13],ymm7[14],ymm3[15]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vpmovsxdq {{.*#+}} ymm28 = [18446744073709551615,255,18446744073709486080,18446744073709551615]
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm2 & ymm28)
+; AVX512-NEXT: vmovdqa64 %ymm24, %ymm2
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm31 ^ (ymm2 & (ymm30 ^ ymm31))
+; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm11
+; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,zero,xmm11[5,12],zero,zero,xmm11[1,8,15,u,u,u,u]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u]
+; AVX512-NEXT: vpor %xmm2, %xmm11, %xmm2
+; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm1[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vmovdqa64 %xmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpor %xmm3, %xmm6, %xmm3
-; AVX512-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm25
-; AVX512-NEXT: vpmovsxdq {{.*#+}} zmm20 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615]
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm25 = zmm2 ^ (zmm20 & (zmm25 ^ zmm2))
-; AVX512-NEXT: vmovdqa %ymm9, %ymm0
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm13 ^ (ymm0 & (ymm12 ^ ymm13))
-; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u,u,u]
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpor %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa %ymm14, %ymm2
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm19 ^ (ymm2 & (ymm31 ^ ymm19))
-; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm11[0],ymm2[1,2],ymm11[3],ymm2[4,5,6],ymm11[7,8],ymm2[9,10],ymm11[11],ymm2[12,13,14],ymm11[15]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm0 & ymm15)
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3]
+; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm11[7]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm5[2,9],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vmovdqa64 %xmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm0[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpor %xmm11, %xmm14, %xmm11
+; AVX512-NEXT: vinserti32x4 $2, %xmm11, %zmm2, %zmm2
+; AVX512-NEXT: vpmovsxdq {{.*#+}} zmm25 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615]
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm3 ^ (zmm25 & (zmm2 ^ zmm3))
+; AVX512-NEXT: vmovdqa %ymm9, %ymm3
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm17 ^ (ymm3 & (ymm19 ^ ymm17))
+; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm11
+; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u],zero,zero,xmm11[3,10],zero,zero,zero,xmm11[6,13,u,u]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u]
+; AVX512-NEXT: vpor %xmm3, %xmm11, %xmm3
; AVX512-NEXT: vmovdqa %ymm15, %ymm11
-; AVX512-NEXT: vmovdqa %ymm7, %ymm0
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm29 ^ (ymm0 & (ymm21 ^ ymm29))
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[2,9,u,u,u,u,u]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u,u,u]
-; AVX512-NEXT: vpor %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpshufb %xmm2, %xmm10, %xmm6
-; AVX512-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
-; AVX512-NEXT: vmovdqa %xmm5, %xmm10
-; AVX512-NEXT: vpor %xmm6, %xmm15, %xmm6
-; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (mem & (ymm6 ^ ymm0))
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm4[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpor %xmm0, %xmm15, %xmm0
-; AVX512-NEXT: vmovdqa64 416(%rdi), %ymm26
-; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm6, %zmm30
-; AVX512-NEXT: vmovdqa64 384(%rdi), %ymm27
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm30 = zmm3 ^ (zmm20 & (zmm30 ^ zmm3))
-; AVX512-NEXT: vmovdqa %ymm7, %ymm0
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm26 ^ (ymm0 & (ymm27 ^ ymm26))
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero
-; AVX512-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0]
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm1 & (ymm0 ^ ymm8))
-; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm20
-; AVX512-NEXT: vpmovsxwd {{.*#+}} zmm8 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0,0,0,0,0]
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm8 & (zmm20 ^ zmm22))
-; AVX512-NEXT: vmovdqa %ymm7, %ymm0
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm16 ^ (ymm0 & (ymm18 ^ ymm16))
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u]
-; AVX512-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = ymm16 ^ (ymm11 & (ymm6 ^ ymm16))
+; AVX512-NEXT: vpermq {{.*#+}} ymm14 = ymm11[2,3,0,1]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1,2],ymm14[3],ymm11[4,5,6],ymm14[7,8],ymm11[9,10],ymm14[11],ymm11[12,13,14],ymm14[15]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm3 & ymm27)
+; AVX512-NEXT: vmovdqa %ymm8, %ymm3
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm20 ^ (ymm3 & (ymm21 ^ ymm20))
+; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero
+; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm3
+; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11]
+; AVX512-NEXT: vpor %xmm3, %xmm14, %xmm3
+; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm23 & (ymm3 ^ ymm11))
+; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm22
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm29 & (zmm22 ^ zmm2))
+; AVX512-NEXT: vmovdqa %ymm8, %ymm2
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm12 ^ (ymm2 & (ymm18 ^ ymm12))
+; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u,u,u,u,u,u]
+; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2
+; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2
; AVX512-NEXT: vmovdqa %ymm9, %ymm3
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm17 ^ (ymm3 & (ymm28 ^ ymm17))
-; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3],ymm3[4,5],ymm6[6],ymm3[7,8,9,10],ymm6[11],ymm3[12,13],ymm6[14],ymm3[15]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm0 & ymm23)
-; AVX512-NEXT: vmovdqa %ymm14, %ymm0
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm26 ^ (ymm0 & (ymm27 ^ ymm26))
-; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm0[5,12],zero,zero
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[0,7,14],zero,zero,xmm0[3,10]
-; AVX512-NEXT: vpor %xmm6, %xmm0, %xmm0
-; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm1 & (ymm0 ^ ymm3))
-; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm22
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm8 & (zmm22 ^ zmm24))
-; AVX512-NEXT: vmovdqa %ymm14, %ymm0
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm16 ^ (ymm0 & (ymm18 ^ ymm16))
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u]
-; AVX512-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa %ymm7, %ymm3
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm28 ^ (ymm3 & (ymm17 ^ ymm28))
-; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0],ymm3[1,2],ymm6[3],ymm3[4,5,6],ymm6[7,8],ymm3[9,10],ymm6[11],ymm3[12,13,14],ymm6[15]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm0 & ymm23)
-; AVX512-NEXT: vmovdqa %ymm9, %ymm0
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm26 ^ (ymm0 & (ymm27 ^ ymm26))
-; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm0[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm0[6,13],zero,zero
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11]
-; AVX512-NEXT: vpor %xmm6, %xmm0, %xmm0
-; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm1 & (ymm0 ^ ymm3))
-; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm24
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm24 = zmm24 ^ (zmm8 & (zmm24 ^ zmm25))
-; AVX512-NEXT: vmovdqa %ymm14, %ymm0
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm28 ^ (ymm0 & (ymm17 ^ ymm28))
-; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3],ymm3[4],ymm0[5,6],ymm3[7,8],ymm0[9,10,11],ymm3[12],ymm0[13,14],ymm3[15]
-; AVX512-NEXT: vpshufb %ymm2, %ymm0, %ymm0
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm10 ^ (ymm3 & (ymm13 ^ ymm10))
+; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm3[1,2],ymm7[3],ymm3[4,5,6],ymm7[7,8],ymm3[9,10],ymm7[11],ymm3[12,13,14],ymm7[15]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm2 & ymm28)
+; AVX512-NEXT: vmovdqa %ymm15, %ymm2
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm31 ^ (ymm2 & (ymm30 ^ ymm31))
+; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm7
+; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u,u,u]
+; AVX512-NEXT: vpor %xmm7, %xmm2, %xmm2
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,5,12,19,26,17,24,31,22,29,u,128,128,128,128,128,128,128,128]
+; AVX512-NEXT: vpshufb %xmm7, %xmm4, %xmm11
+; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm1[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
+; AVX512-NEXT: vpor %xmm11, %xmm14, %xmm11
+; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 ^ (mem & (ymm11 ^ ymm2))
+; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[3,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm0[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpor %xmm2, %xmm14, %xmm2
+; AVX512-NEXT: vinserti32x4 $2, %xmm2, %zmm11, %zmm2
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm3 ^ (zmm25 & (zmm2 ^ zmm3))
+; AVX512-NEXT: vmovdqa %ymm9, %ymm3
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm16 ^ (ymm3 & (ymm6 ^ ymm16))
+; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm3[2,3,0,1]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm11[0],ymm3[1,2,3],ymm11[4],ymm3[5,6],ymm11[7,8],ymm3[9,10,11],ymm11[12],ymm3[13,14],ymm11[15]
+; AVX512-NEXT: vpshufb %ymm7, %ymm3, %ymm3
+; AVX512-NEXT: vmovdqa %ymm8, %ymm7
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = ymm17 ^ (ymm7 & (ymm19 ^ ymm17))
+; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[u,u,u,u,u,6,13],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[u,u]
+; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm7
+; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u],zero,zero,xmm7[4,11],zero,zero,xmm7[0,7,14,u,u]
+; AVX512-NEXT: vpor %xmm7, %xmm11, %xmm7
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = (ymm7 & ymm27) | ymm3
+; AVX512-NEXT: vmovdqa %ymm15, %ymm3
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm21 ^ (ymm3 & (ymm20 ^ ymm21))
+; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm11
+; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero
+; AVX512-NEXT: vpor %xmm3, %xmm11, %xmm3
+; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm23 & (ymm3 ^ ymm7))
+; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm25
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm25 = zmm25 ^ (zmm29 & (zmm25 ^ zmm2))
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,13,20,27,18,25,16,23,30,u,128,128,128,128,128,128,128,128]
+; AVX512-NEXT: vmovdqa %ymm8, %ymm2
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm16 ^ (ymm2 & (ymm6 ^ ymm16))
+; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6,7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14,15]
+; AVX512-NEXT: vpshufb %ymm1, %ymm2, %ymm2
+; AVX512-NEXT: vmovdqa64 %ymm24, %ymm3
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm19 ^ (ymm3 & (ymm17 ^ ymm19))
+; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm7
+; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u]
+; AVX512-NEXT: vpor %xmm7, %xmm3, %xmm3
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm27) | ymm2
; AVX512-NEXT: vmovdqa %ymm9, %ymm2
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm16 ^ (ymm2 & (ymm18 ^ ymm16))
-; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u]
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm21 ^ (ymm2 & (ymm20 ^ ymm21))
+; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm7
+; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[3,10],zero,zero,zero,xmm7[6,13]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero
+; AVX512-NEXT: vpor %xmm7, %xmm2, %xmm2
+; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm7
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (ymm23 & (ymm7 ^ ymm3))
+; AVX512-NEXT: vmovdqa64 %ymm24, %ymm2
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm6 ^ (ymm2 & (ymm16 ^ ymm6))
+; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7,8],ymm3[9],ymm2[10,11,12],ymm3[13],ymm2[14,15]
+; AVX512-NEXT: vmovdqa %ymm15, %ymm3
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm19 ^ (ymm3 & (ymm17 ^ ymm19))
+; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm11
+; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u],zero,zero,zero,xmm11[6,13],zero,zero,xmm11[2,9,u,u,u]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u]
+; AVX512-NEXT: vpor %xmm3, %xmm11, %xmm3
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,128,128,128,128,128,128,128,128,128,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpshufb %ymm5, %ymm2, %ymm2
+; AVX512-NEXT: vmovdqa64 %ymm5, %ymm29
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ~mem) | ymm2
+; AVX512-NEXT: vmovdqa %ymm8, %ymm2
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm21 ^ (ymm2 & (ymm20 ^ ymm21))
+; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm2[u,u,u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero
; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm2
-; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u]
-; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm23) | ymm0
-; AVX512-NEXT: vmovdqa %ymm7, %ymm0
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm27 ^ (ymm0 & (ymm26 ^ ymm27))
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,4,11],zero,zero,xmm0[0,7,14],zero,zero
-; AVX512-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm1 & (ymm0 ^ ymm2))
-; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm25
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm25 = zmm25 ^ (zmm8 & (zmm25 ^ zmm30))
-; AVX512-NEXT: vmovdqa %ymm9, %ymm0
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm28 ^ (ymm0 & (ymm17 ^ ymm28))
-; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6,7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13,14,15]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
-; AVX512-NEXT: vmovdqa %ymm4, %ymm2
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm18 ^ (ymm2 & (ymm16 ^ ymm18))
+; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14]
+; AVX512-NEXT: vpor %xmm2, %xmm11, %xmm2
+; AVX512-NEXT: vinserti32x4 $1, %xmm2, %ymm0, %ymm27
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm27 = ymm27 ^ (ymm23 & (ymm27 ^ ymm3))
+; AVX512-NEXT: vmovdqa %ymm15, %ymm2
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm18 ^ (ymm2 & (ymm12 ^ ymm18))
; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u]
-; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm23) | ymm0
-; AVX512-NEXT: vmovdqa %ymm14, %ymm0
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm27 ^ (ymm0 & (ymm26 ^ ymm27))
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero
-; AVX512-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX512-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm30
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm30 = ymm30 ^ (ymm1 & (ymm30 ^ ymm2))
-; AVX512-NEXT: vmovdqa %ymm4, %ymm0
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm17 ^ (ymm0 & (ymm28 ^ ymm17))
-; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15]
-; AVX512-NEXT: vmovdqa %ymm7, %ymm2
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm18 ^ (ymm2 & (ymm16 ^ ymm18))
-; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9,u,u,u]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u]
-; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,128,128,128,128,128,128,128,128,128,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpshufb %ymm5, %ymm0, %ymm0
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ~mem) | ymm0
-; AVX512-NEXT: vmovdqa %ymm9, %ymm0
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm27 ^ (ymm0 & (ymm26 ^ ymm27))
-; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,u,u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14]
-; AVX512-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX512-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm23
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm23 = ymm23 ^ (ymm1 & (ymm23 ^ ymm2))
-; AVX512-NEXT: vmovdqa %ymm7, %ymm0
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm13 ^ ymm12))
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u,u,u,u,u,u]
-; AVX512-NEXT: vpor %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = ymm26 ^ (ymm4 & (ymm27 ^ ymm26))
-; AVX512-NEXT: vmovdqa %ymm14, %ymm2
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm12 ^ (ymm2 & (ymm13 ^ ymm12))
+; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,zero,xmm2[u,u,u,u,u,u,u]
+; AVX512-NEXT: vporq %xmm3, %xmm2, %xmm23
+; AVX512-NEXT: vmovdqa %ymm8, %ymm11
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = ymm10 ^ (ymm11 & (ymm13 ^ ymm10))
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm24 = ymm20 ^ (ymm24 & (ymm21 ^ ymm20))
+; AVX512-NEXT: vmovdqa %ymm9, %ymm2
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm18 ^ (ymm2 & (ymm12 ^ ymm18))
; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u,u,u,u,u,u]
; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,zero,xmm2[u,u,u,u,u,u,u]
-; AVX512-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX512-NEXT: vmovdqa %ymm9, %ymm3
-; AVX512-NEXT: vmovdqa %ymm9, %ymm15
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = ymm12 ^ (ymm9 & (ymm13 ^ ymm12))
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm19 ^ (ymm3 & (ymm31 ^ ymm19))
-; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm9[6,13],zero,zero,xmm9[2,9],zero,zero,zero,xmm9[u,u,u,u,u,u,u]
-; AVX512-NEXT: vextracti128 $1, %ymm9, %xmm9
-; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[4,11],zero,zero,xmm9[0,7,14,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpor %xmm6, %xmm9, %xmm6
-; AVX512-NEXT: vmovdqa %ymm14, %ymm12
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm16 = ymm18 ^ (ymm14 & (ymm16 ^ ymm18))
-; AVX512-NEXT: vmovdqa %ymm7, %ymm9
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = ymm31 ^ (ymm9 & (ymm19 ^ ymm31))
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm14 = ymm31 ^ (ymm14 & (ymm19 ^ ymm31))
-; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0],ymm3[1,2,3],ymm8[4],ymm3[5,6],ymm8[7,8],ymm3[9,10,11],ymm8[12],ymm3[13,14],ymm8[15]
+; AVX512-NEXT: vporq %xmm3, %xmm2, %xmm20
+; AVX512-NEXT: vmovdqa %ymm9, %ymm14
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm17 = ymm19 ^ (ymm9 & (ymm17 ^ ymm19))
+; AVX512-NEXT: vmovdqa %ymm15, %ymm2
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm13 ^ (ymm2 & (ymm10 ^ ymm13))
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = ymm13 ^ (ymm9 & (ymm10 ^ ymm13))
+; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm11[2,3,0,1]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm11[1,2,3],ymm3[4],ymm11[5,6],ymm3[7,8],ymm11[9,10,11],ymm3[12],ymm11[13,14],ymm3[15]
+; AVX512-NEXT: vmovdqa %ymm8, %ymm13
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm18 ^ (ymm8 & (ymm12 ^ ymm18))
; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vmovdqa %ymm11, %ymm1
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm0 & ymm11)
-; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6,7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13,14,15]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm2 & ymm1)
-; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0],ymm8[1],ymm14[2,3,4],ymm8[5],ymm14[6,7,8],ymm8[9],ymm14[10,11,12],ymm8[13],ymm14[14,15]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm6 & ymm1)
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = ymm29 ^ (ymm12 & (ymm21 ^ ymm29))
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm12[u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u,u,u]
-; AVX512-NEXT: vextracti128 $1, %ymm12, %xmm2
-; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u]
-; AVX512-NEXT: vpor %xmm0, %xmm2, %xmm0
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpshufb %xmm1, %xmm12, %xmm2
-; AVX512-NEXT: vmovdqa %xmm10, %xmm13
-; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm10[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
-; AVX512-NEXT: vpor %xmm2, %xmm6, %xmm2
-; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm23 = (ymm23 & ymm28) | ymm3
+; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6,7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14,15]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm20 = (ymm20 & ymm28) | ymm2
+; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm9[2,3,0,1]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0],ymm2[1],ymm9[2,3,4],ymm2[5],ymm9[6,7,8],ymm2[9],ymm9[10,11,12],ymm2[13],ymm9[14,15]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm8[6,13],zero,zero,xmm8[2,9],zero,zero,zero,xmm8[u,u,u,u,u,u,u]
+; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm8
+; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[4,11],zero,zero,xmm8[0,7,14,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpor %xmm3, %xmm8, %xmm10
+; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm28) | ymm2
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm14 = ymm31 ^ (ymm14 & (ymm30 ^ ymm31))
+; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm14[u,u,2,9],zero,zero,zero,xmm14[5,12],zero,zero,xmm14[u,u,u,u,u]
+; AVX512-NEXT: vextracti128 $1, %ymm14, %xmm3
+; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10,u,u,u,u,u]
+; AVX512-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX512-NEXT: vpshufb %xmm1, %xmm4, %xmm3
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
+; AVX512-NEXT: vpor %xmm3, %xmm8, %xmm3
; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-NEXT: vpmovsxdq {{.*#+}} ymm1 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215]
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (ymm1 & (ymm2 ^ ymm0))
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm28 = ymm17 ^ (ymm7 & (ymm28 ^ ymm17))
-; AVX512-NEXT: vmovd {{.*#+}} xmm10 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm15 = ymm29 ^ (ymm15 & (ymm21 ^ ymm29))
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = ymm21 ^ (ymm7 & (ymm29 ^ ymm21))
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX512-NEXT: vpshufb %xmm10, %xmm8, %xmm0
+; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512-NEXT: vpmovsxdq {{.*#+}} ymm5 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215]
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm5 & (ymm3 ^ ymm2))
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm16 = ymm6 ^ (ymm15 & (ymm16 ^ ymm6))
+; AVX512-NEXT: vmovd {{.*#+}} xmm9 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm13 = ymm31 ^ (ymm13 & (ymm30 ^ ymm31))
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm15 = ymm30 ^ (ymm15 & (ymm31 ^ ymm30))
; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm14[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
-; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm2
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm0 & (zmm2 ^ zmm3))
-; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[u,u,3,10],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[u,u,u,u,u]
-; AVX512-NEXT: vextracti128 $1, %ymm15, %xmm6
-; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,11,u,u,u,u,u]
-; AVX512-NEXT: vpor %xmm3, %xmm6, %xmm3
-; AVX512-NEXT: vpshufb %xmm5, %xmm12, %xmm6
-; AVX512-NEXT: vmovdqa %xmm12, %xmm15
-; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
-; AVX512-NEXT: vpor %xmm6, %xmm12, %xmm6
+; AVX512-NEXT: vpshufb %xmm9, %xmm14, %xmm2
+; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3]
+; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm8
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm2 & (zmm8 ^ zmm23))
+; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[u,u,3,10],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[u,u,u,u,u]
+; AVX512-NEXT: vextracti128 $1, %ymm13, %xmm11
+; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,xmm11[1,8,15],zero,zero,xmm11[4,11,u,u,u,u,u]
+; AVX512-NEXT: vpor %xmm3, %xmm11, %xmm3
+; AVX512-NEXT: vmovdqa64 %ymm29, %ymm6
+; AVX512-NEXT: vpshufb %xmm6, %xmm4, %xmm11
+; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm1[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
+; AVX512-NEXT: vpor %xmm12, %xmm11, %xmm11
; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (ymm1 & (ymm6 ^ ymm3))
-; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm3
+; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 ^ (ymm5 & (ymm11 ^ ymm3))
+; AVX512-NEXT: vextracti128 $1, %ymm15, %xmm3
; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u,u,u,u]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,4,11],zero,zero,xmm7[0,7,14],zero,zero,xmm7[u,u,u,u,u]
-; AVX512-NEXT: vpor %xmm3, %xmm7, %xmm3
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,8,15,22,29,20,27,18,25,128,128,128,128,128,128,128,128,128,128]
-; AVX512-NEXT: vpshufb %xmm7, %xmm15, %xmm12
-; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
-; AVX512-NEXT: vpor %xmm13, %xmm12, %xmm12
+; AVX512-NEXT: vpshufb {{.*#+}} xmm12 = xmm15[u,u,4,11],zero,zero,xmm15[0,7,14],zero,zero,xmm15[u,u,u,u,u]
+; AVX512-NEXT: vpor %xmm3, %xmm12, %xmm3
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,8,15,22,29,20,27,18,25,128,128,128,128,128,128,128,128,128,128]
+; AVX512-NEXT: vpshufb %xmm12, %xmm4, %xmm6
+; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
+; AVX512-NEXT: vpor %xmm4, %xmm6, %xmm4
; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm12 = ymm12 ^ (ymm1 & (ymm12 ^ ymm3))
-; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm14[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = xmm8[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3]
-; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm6, %zmm3
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm0 & (zmm3 ^ zmm11))
-; AVX512-NEXT: vpshufb %xmm10, %xmm14, %xmm6
-; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3]
-; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm12, %zmm6
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm0 & (zmm6 ^ zmm9))
-; AVX512-NEXT: vinserti64x4 $1, %ymm30, %zmm0, %zmm0
+; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 ^ (ymm5 & (ymm4 ^ ymm3))
+; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm14[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
+; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm11, %zmm3
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm2 & (zmm3 ^ zmm20))
+; AVX512-NEXT: vpshufb %xmm9, %xmm0, %xmm6
+; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = xmm14[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
+; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm2 & (zmm4 ^ zmm10))
+; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm2
; AVX512-NEXT: movw $-512, %ax # imm = 0xFE00
; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1}
-; AVX512-NEXT: vinserti64x4 $1, %ymm23, %zmm0, %zmm0
-; AVX512-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1}
-; AVX512-NEXT: vmovdqa64 %ymm28, %ymm1
-; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm28[2,3,0,1]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
-; AVX512-NEXT: vpshufb %ymm7, %ymm0, %ymm0
-; AVX512-NEXT: vmovdqa64 %ymm16, %ymm1
-; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[u,u,u]
-; AVX512-NEXT: vextracti32x4 $1, %ymm16, %xmm1
+; AVX512-NEXT: vmovdqa32 %zmm2, %zmm8 {%k1}
+; AVX512-NEXT: vinserti64x4 $1, %ymm27, %zmm0, %zmm2
+; AVX512-NEXT: vmovdqa32 %zmm2, %zmm3 {%k1}
+; AVX512-NEXT: vmovdqa64 %ymm16, %ymm0
+; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm16[2,3,0,1]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7,8,9],ymm2[10],ymm0[11,12],ymm2[13],ymm0[14,15]
+; AVX512-NEXT: vpshufb %ymm12, %ymm2, %ymm2
+; AVX512-NEXT: vmovdqa64 %ymm17, %ymm0
+; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,2,9],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[u,u,u]
+; AVX512-NEXT: vextracti32x4 $1, %ymm17, %xmm1
; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10,u,u,u]
; AVX512-NEXT: vpor %xmm5, %xmm1, %xmm1
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & mem) | ymm0
-; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm0
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,0,7,14],zero,zero,xmm4[3,10],zero,zero,zero
-; AVX512-NEXT: vpor %xmm0, %xmm4, %xmm0
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & mem) | ymm2
+; AVX512-NEXT: vmovdqa64 %ymm24, %ymm0
+; AVX512-NEXT: vextracti32x4 $1, %ymm24, %xmm2
+; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[1,8,15]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,0,7,14],zero,zero,xmm0[3,10],zero,zero,zero
+; AVX512-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512-NEXT: vmovdqa32 %zmm0, %zmm6 {%k1}
-; AVX512-NEXT: vmovdqa64 %zmm20, (%rsi)
-; AVX512-NEXT: vmovdqa64 %zmm22, (%rdx)
-; AVX512-NEXT: vmovdqa64 %zmm24, (%rcx)
+; AVX512-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1}
+; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovaps %zmm0, (%rsi)
+; AVX512-NEXT: vmovdqa64 %zmm26, (%rdx)
+; AVX512-NEXT: vmovdqa64 %zmm22, (%rcx)
; AVX512-NEXT: vmovdqa64 %zmm25, (%r8)
-; AVX512-NEXT: vmovdqa64 %zmm2, (%r9)
+; AVX512-NEXT: vmovdqa64 %zmm8, (%r9)
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: vmovdqa64 %zmm3, (%rax)
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: vmovdqa64 %zmm6, (%rax)
+; AVX512-NEXT: vmovdqa64 %zmm4, (%rax)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: load_i8_stride7_vf64:
; AVX512-FCP: # %bb.0:
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm19 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
-; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm20
-; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm12
-; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm27
-; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm0
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm20 ^ ymm12))
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
+; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %ymm31
+; AVX512-FCP-NEXT: vmovdqa64 32(%rdi), %ymm29
+; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm10
+; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm0
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm29 ^ (ymm0 & (ymm31 ^ ymm29))
; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512-FCP-NEXT: vmovdqa64 96(%rdi), %ymm31
-; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm1
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm27 ^ (ymm1 & (ymm31 ^ ymm27))
-; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm6
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm6[2],ymm1[3,4],ymm6[5],ymm1[6,7,8,9],ymm6[10],ymm1[11,12],ymm6[13],ymm1[14,15]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm0 & mem)
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535]
-; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %ymm28
-; AVX512-FCP-NEXT: vmovdqa64 160(%rdi), %ymm30
-; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm0
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm28 ^ (ymm0 & (ymm30 ^ ymm28))
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[6,13,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm2, %xmm0, %xmm0
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,1,2,4,6]
-; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm3
-; AVX512-FCP-NEXT: vpermd %ymm3, %ymm2, %ymm2
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
+; AVX512-FCP-NEXT: vpor %xmm1, %xmm0, %xmm1
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm12
+; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm2
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm10 ^ (ymm2 & (ymm12 ^ ymm10))
+; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm3
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm1 & mem)
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535]
+; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %ymm26
+; AVX512-FCP-NEXT: vmovdqa64 160(%rdi), %ymm27
+; AVX512-FCP-NEXT: vmovdqa %ymm8, %ymm1
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm26 ^ (ymm1 & (ymm27 ^ ymm26))
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm4, %xmm1, %xmm1
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,1,2,4,6]
+; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm14
+; AVX512-FCP-NEXT: vpermd %ymm14, %ymm4, %ymm4
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7]
; AVX512-FCP-NEXT: vmovdqa 240(%rdi), %xmm4
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm4[5,12,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %xmm5
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[0,7,14],zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm2, %xmm7, %xmm2
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm8
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm21 & (zmm8 ^ zmm1))
-; AVX512-FCP-NEXT: vmovdqa64 288(%rdi), %ymm16
-; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %ymm11
-; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm0
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm16 ^ (ymm0 & (ymm11 ^ ymm16))
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[u,u,u,u,u,3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u]
-; AVX512-FCP-NEXT: vpor %xmm2, %xmm0, %xmm0
-; AVX512-FCP-NEXT: vmovdqa 352(%rdi), %ymm14
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm4[5,12,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %xmm6
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm5, %xmm9, %xmm5
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm5, %zmm1, %zmm5
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm19 & (zmm5 ^ zmm2))
+; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %ymm15
+; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %ymm1
+; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm2
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm15 ^ (ymm2 & (ymm1 ^ ymm15))
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm2[u,u,u,u,u,3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u]
+; AVX512-FCP-NEXT: vpor %xmm2, %xmm9, %xmm9
+; AVX512-FCP-NEXT: vmovdqa64 352(%rdi), %ymm30
; AVX512-FCP-NEXT: vmovdqa 320(%rdi), %ymm2
-; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm7
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm14 ^ (ymm7 & (ymm2 ^ ymm14))
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm7[2,3,0,1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm13[2],ymm7[3,4,5],ymm13[6],ymm7[7,8,9],ymm13[10],ymm7[11,12,13],ymm13[14],ymm7[15]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm26 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 | (ymm0 & ymm26)
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
-; AVX512-FCP-NEXT: vmovdqa64 416(%rdi), %ymm17
+; AVX512-FCP-NEXT: vmovdqa %ymm8, %ymm11
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm30 ^ (ymm11 & (ymm2 ^ ymm30))
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm11[2,3,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm13[2],ymm11[3,4,5],ymm13[6],ymm11[7,8,9],ymm13[10],ymm11[11,12,13],ymm13[14],ymm11[15]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm25 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm9 & ymm25)
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
+; AVX512-FCP-NEXT: vmovdqa64 416(%rdi), %ymm16
; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %ymm18
-; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm7
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm17 ^ (ymm7 & (ymm18 ^ ymm17))
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm15
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm7[4,11],zero,zero
-; AVX512-FCP-NEXT: vpor %xmm7, %xmm15, %xmm7
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm15
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm23 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm15 ^ (ymm23 & (ymm15 ^ ymm13))
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm0
-; AVX512-FCP-NEXT: vpmovsxwd {{.*#+}} zmm29 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0,0,0,0,0]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm29 & (zmm0 ^ zmm8))
-; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm8
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm12 ^ (ymm8 & (ymm20 ^ ymm12))
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm13
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm13[6,13],zero,zero,xmm13[2,9,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[1,8,15],zero,zero,xmm8[4,11],zero,zero,xmm8[u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm13, %xmm8, %xmm8
-; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm13
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm31 ^ (ymm13 & (ymm27 ^ ymm31))
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1],ymm6[2],ymm13[3,4,5],ymm6[6],ymm13[7,8,9],ymm6[10],ymm13[11,12,13],ymm6[14],ymm13[15]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 | (ymm8 & ~mem)
-; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm8
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm28 ^ (ymm8 & (ymm30 ^ ymm28))
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,6,13],zero,zero,xmm8[2,9],zero,zero,zero,xmm8[u,u,u,u]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u],zero,zero,xmm8[4,11],zero,zero,xmm8[0,7,14,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm15, %xmm8, %xmm8
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,0,0,0,1,3,4,6]
-; AVX512-FCP-NEXT: vpermd %ymm3, %ymm15, %ymm15
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm15[7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm4[6,13,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[1,8,15],zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm7, %xmm15, %xmm7
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm8, %zmm7
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm21 & (zmm7 ^ zmm13))
-; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm8
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm11 ^ (ymm8 & (ymm16 ^ ymm11))
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm13
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u],zero,zero,xmm13[2,9],zero,zero,zero,xmm13[5,12,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u]
-; AVX512-FCP-NEXT: vpor %xmm13, %xmm8, %xmm8
-; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm13
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm14 ^ (ymm13 & (ymm2 ^ ymm14))
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm13[2,3,0,1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7,8,9,10],ymm15[11],ymm13[12,13],ymm15[14],ymm13[15]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 | (ymm8 & ymm26)
-; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm8
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm17 ^ (ymm8 & (ymm18 ^ ymm17))
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm8[5,12],zero,zero
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u],zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10]
-; AVX512-FCP-NEXT: vpor %xmm15, %xmm8, %xmm8
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 ^ (ymm23 & (ymm8 ^ ymm13))
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm24
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm24 = zmm24 ^ (zmm29 & (zmm24 ^ zmm7))
-; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm7
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm12 ^ (ymm7 & (ymm20 ^ ymm12))
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[2,9],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vmovdqa %ymm13, %ymm9
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm16 ^ (ymm9 & (ymm18 ^ ymm16))
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm7
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm9[4,11],zero,zero
+; AVX512-FCP-NEXT: vpor %xmm7, %xmm9, %xmm7
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (ymm20 & (ymm7 ^ ymm11))
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7
+; AVX512-FCP-NEXT: vpmovsxwd {{.*#+}} zmm28 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0,0,0,0,0]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm28 & (zmm7 ^ zmm5))
+; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa %ymm13, %ymm5
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm29 ^ (ymm5 & (ymm31 ^ ymm29))
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm7
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,8,15],zero,zero,xmm5[4,11],zero,zero,xmm5[u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm7, %xmm5, %xmm5
+; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm7
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm12 ^ (ymm7 & (ymm10 ^ ymm12))
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm3[2],ymm7[3,4,5],ymm3[6],ymm7[7,8,9],ymm3[10],ymm7[11,12,13],ymm3[14],ymm7[15]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm5 & ~mem)
+; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm5
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm26 ^ (ymm5 & (ymm27 ^ ymm26))
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm5[u,u,u,6,13],zero,zero,xmm5[2,9],zero,zero,zero,xmm5[u,u,u,u]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u],zero,zero,xmm5[4,11],zero,zero,xmm5[0,7,14,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm5, %xmm9, %xmm5
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,1,3,4,6]
+; AVX512-FCP-NEXT: vpermd %ymm14, %ymm9, %ymm9
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm9[7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm4[6,13,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm6[1,8,15],zero,zero,xmm6[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm9, %xmm11, %xmm9
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm9, %zmm5, %zmm5
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm19 & (zmm5 ^ zmm7))
+; AVX512-FCP-NEXT: vmovdqa %ymm13, %ymm7
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm1 ^ (ymm7 & (ymm15 ^ ymm1))
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm9
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u],zero,zero,xmm9[2,9],zero,zero,zero,xmm9[5,12,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,4,11],zero,zero,xmm7[0,7,14],zero,zero,xmm7[u,u]
+; AVX512-FCP-NEXT: vpor %xmm7, %xmm9, %xmm7
+; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm9
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm30 ^ (ymm9 & (ymm2 ^ ymm30))
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm9[2,3,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1,2],ymm11[3],ymm9[4,5],ymm11[6],ymm9[7,8,9,10],ymm11[11],ymm9[12,13],ymm11[14],ymm9[15]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm7 & ymm25)
+; AVX512-FCP-NEXT: vmovdqa %ymm8, %ymm7
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm16 ^ (ymm7 & (ymm18 ^ ymm16))
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm7[5,12],zero,zero
; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7
-; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm8
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm31 ^ (ymm8 & (ymm27 ^ ymm31))
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2],ymm6[3],ymm8[4,5],ymm6[6],ymm8[7,8,9,10],ymm6[11],ymm8[12,13],ymm6[14],ymm8[15]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm25 = [18446744073709551615,255,18446744073709486080,18446744073709551615]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm7 & ymm25)
-; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm7
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm30 ^ (ymm7 & (ymm28 ^ ymm30))
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm13
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[1,8,15,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,0,7,14],zero,zero,xmm7[3,10],zero,zero,zero,xmm7[u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm7, %xmm13, %xmm7
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10]
+; AVX512-FCP-NEXT: vpor %xmm7, %xmm11, %xmm7
; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,0,0,0,1,3,5,6]
-; AVX512-FCP-NEXT: vpermd %ymm3, %ymm13, %ymm3
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5,6],ymm3[7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[2,9],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm7, %xmm13, %xmm7
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm3, %zmm3
-; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} zmm22 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm8 ^ (zmm22 & (zmm3 ^ zmm8))
-; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm7
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm11 ^ (ymm7 & (ymm16 ^ ymm11))
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u],zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13,u,u]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (ymm20 & (ymm7 ^ ymm9))
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm23
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm23 = zmm23 ^ (zmm28 & (zmm23 ^ zmm5))
+; AVX512-FCP-NEXT: vmovdqa %ymm8, %ymm5
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm29 ^ (ymm5 & (ymm31 ^ ymm29))
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[2,9],zero,zero,zero,xmm5[5,12],zero,zero,xmm5[u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,10,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm7, %xmm5, %xmm5
+; AVX512-FCP-NEXT: vmovdqa %ymm13, %ymm7
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm12 ^ (ymm7 & (ymm10 ^ ymm12))
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2],ymm3[3],ymm7[4,5],ymm3[6],ymm7[7,8,9,10],ymm3[11],ymm7[12,13],ymm3[14],ymm7[15]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm22 = [18446744073709551615,255,18446744073709486080,18446744073709551615]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm5 & ymm22)
+; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm5
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm27 ^ (ymm5 & (ymm26 ^ ymm27))
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm9
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u],zero,zero,zero,xmm9[5,12],zero,zero,xmm9[1,8,15,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,0,7,14],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm5, %xmm9, %xmm5
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,1,3,5,6]
+; AVX512-FCP-NEXT: vpermd %ymm14, %ymm9, %ymm9
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm9[7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[2,9],zero,zero,zero,xmm6[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm9, %xmm11, %xmm9
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm9, %zmm5, %zmm5
+; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} zmm21 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm7 ^ (zmm21 & (zmm5 ^ zmm7))
+; AVX512-FCP-NEXT: vmovdqa %ymm8, %ymm7
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm1 ^ (ymm7 & (ymm15 ^ ymm1))
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm9
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u],zero,zero,xmm9[3,10],zero,zero,zero,xmm9[6,13,u,u]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,5,12],zero,zero,xmm7[1,8,15],zero,zero,xmm7[u,u]
-; AVX512-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7
-; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm8
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm2 ^ (ymm8 & (ymm14 ^ ymm2))
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm8[2,3,0,1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm13[0],ymm8[1,2],ymm13[3],ymm8[4,5,6],ymm13[7,8],ymm8[9,10],ymm13[11],ymm8[12,13,14],ymm13[15]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm7 & ymm26)
-; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm7
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm17 ^ (ymm7 & (ymm18 ^ ymm17))
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm7[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm7[6,13],zero,zero
+; AVX512-FCP-NEXT: vpor %xmm7, %xmm9, %xmm7
+; AVX512-FCP-NEXT: vmovdqa %ymm13, %ymm9
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm2 ^ (ymm9 & (ymm30 ^ ymm2))
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm9[2,3,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0],ymm9[1,2],ymm11[3],ymm9[4,5,6],ymm11[7,8],ymm9[9,10],ymm11[11],ymm9[12,13,14],ymm11[15]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm7 & ymm25)
+; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm7
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm16 ^ (ymm7 & (ymm18 ^ ymm16))
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm7[6,13],zero,zero
; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,11]
-; AVX512-FCP-NEXT: vpor %xmm7, %xmm13, %xmm7
+; AVX512-FCP-NEXT: vpor %xmm7, %xmm11, %xmm7
; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (ymm23 & (ymm7 ^ ymm8))
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm21
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm29 & (zmm21 ^ zmm3))
-; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm3
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm12 ^ (ymm3 & (ymm20 ^ ymm12))
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[3,10],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3
-; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm7
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm31 ^ (ymm7 & (ymm27 ^ ymm31))
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0],ymm7[1,2],ymm6[3],ymm7[4,5,6],ymm6[7,8],ymm7[9,10],ymm6[11],ymm7[12,13,14],ymm6[15]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm3 & ymm25)
-; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm3
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm30 ^ (ymm3 & (ymm28 ^ ymm30))
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm8
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[2,9,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (ymm20 & (ymm7 ^ ymm9))
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm19
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm28 & (zmm19 ^ zmm5))
+; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm5
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm29 ^ (ymm5 & (ymm31 ^ ymm29))
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,zero,xmm5[u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[1,8,15],zero,zero,xmm5[4,11,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm7, %xmm5, %xmm5
+; AVX512-FCP-NEXT: vmovdqa %ymm8, %ymm7
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm12 ^ (ymm7 & (ymm10 ^ ymm12))
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm7[1,2],ymm3[3],ymm7[4,5,6],ymm3[7,8],ymm7[9,10],ymm3[11],ymm7[12,13,14],ymm3[15]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm5 & ymm22)
+; AVX512-FCP-NEXT: vmovdqa %ymm13, %ymm3
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm27 ^ (ymm3 & (ymm26 ^ ymm27))
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u,u,u]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm3, %xmm8, %xmm8
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vmovdqa 208(%rdi), %xmm0
-; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm0
-; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %xmm15
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
-; AVX512-FCP-NEXT: vpor %xmm0, %xmm3, %xmm0
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm3
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm3))
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[3,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm5, %xmm3, %xmm5
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vmovdqa 208(%rdi), %xmm3
+; AVX512-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm3, %xmm11
+; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %xmm14
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm14[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
+; AVX512-FCP-NEXT: vpor %xmm3, %xmm11, %xmm3
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm24 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm24 & (ymm3 ^ ymm5))
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[3,10],zero,zero,zero,xmm6[u,u,u,u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm0
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm7 ^ (zmm22 & (zmm0 ^ zmm7))
-; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm3
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm2 ^ (ymm3 & (ymm14 ^ ymm2))
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3],ymm4[4],ymm3[5,6],ymm4[7,8],ymm3[9,10,11],ymm4[12],ymm3[13,14],ymm4[15]
-; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm3, %ymm3
-; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm4
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm11 ^ (ymm4 & (ymm16 ^ ymm11))
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u,6,13],zero,zero,xmm4[2,9],zero,zero,zero,xmm4[u,u]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u],zero,zero,xmm4[4,11],zero,zero,xmm4[0,7,14,u,u]
; AVX512-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = (ymm4 & ymm26) | ymm3
-; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm3
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm18 ^ (ymm3 & (ymm17 ^ ymm18))
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u],zero,zero,xmm5[2,9],zero,zero,zero,xmm5[5,12]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero
-; AVX512-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm23 & (ymm3 ^ ymm4))
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm22
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm29 & (zmm22 ^ zmm0))
-; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm0
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm0 & (ymm14 ^ ymm2))
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6,7,8],ymm3[9],ymm0[10,11],ymm3[12],ymm0[13,14,15]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm3
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm16 ^ (ymm3 & (ymm11 ^ ymm16))
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm4, %zmm3, %zmm3
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm7 ^ (zmm21 & (zmm3 ^ zmm7))
+; AVX512-FCP-NEXT: vmovdqa %ymm8, %ymm4
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm2 ^ (ymm4 & (ymm30 ^ ymm2))
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3],ymm5[4],ymm4[5,6],ymm5[7,8],ymm4[9,10,11],ymm5[12],ymm4[13,14],ymm5[15]
+; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm4, %ymm4
+; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm5
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm1 ^ (ymm5 & (ymm15 ^ ymm1))
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[u,u,u,u,u,6,13],zero,zero,xmm5[2,9],zero,zero,zero,xmm5[u,u]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u],zero,zero,xmm5[4,11],zero,zero,xmm5[0,7,14,u,u]
+; AVX512-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = (ymm5 & ymm25) | ymm4
+; AVX512-FCP-NEXT: vmovdqa %ymm13, %ymm4
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm18 ^ (ymm4 & (ymm16 ^ ymm18))
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm6
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u],zero,zero,xmm6[2,9],zero,zero,zero,xmm6[5,12]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,4,11],zero,zero,xmm4[0,7,14],zero,zero
+; AVX512-FCP-NEXT: vpor %xmm6, %xmm4, %xmm4
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 ^ (ymm20 & (ymm4 ^ ymm5))
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm21
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm28 & (zmm21 ^ zmm3))
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm3
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm2 ^ (ymm3 & (ymm30 ^ ymm2))
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6,7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14,15]
+; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm3
+; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm28
+; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm4
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm15 ^ (ymm4 & (ymm1 ^ ymm15))
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u],zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,0,7,14],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[u,u]
+; AVX512-FCP-NEXT: vpor %xmm5, %xmm4, %xmm5
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = (ymm5 & ymm25) | ymm3
+; AVX512-FCP-NEXT: vmovdqa %ymm8, %ymm3
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm18 ^ (ymm3 & (ymm16 ^ ymm18))
; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u]
-; AVX512-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm26) | ymm0
-; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm0
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm18 ^ (ymm0 & (ymm17 ^ ymm18))
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero
-; AVX512-FCP-NEXT: vpor %xmm4, %xmm0, %xmm0
-; AVX512-FCP-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm29
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm29 = ymm29 ^ (ymm23 & (ymm29 ^ ymm3))
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,128,128,128,128,128,128,128,128,128,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm0
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm14 ^ (ymm0 & (ymm2 ^ ymm14))
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7,8],ymm3[9],ymm0[10,11,12],ymm3[13],ymm0[14,15]
-; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm3
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm16 ^ (ymm3 & (ymm11 ^ ymm16))
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm7
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero
+; AVX512-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm4
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 ^ (ymm20 & (ymm4 ^ ymm5))
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,128,128,128,128,128,128,128,128,128,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm3
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm30 ^ (ymm3 & (ymm2 ^ ymm30))
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm3[2,3,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3,4],ymm5[5],ymm3[6,7,8],ymm5[9],ymm3[10,11,12],ymm5[13],ymm3[14,15]
+; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm3
+; AVX512-FCP-NEXT: vmovdqa %ymm13, %ymm5
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm15 ^ (ymm5 & (ymm1 ^ ymm15))
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm7
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,1,8,15],zero,zero,xmm5[4,11],zero,zero,xmm5[u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm7, %xmm5, %xmm5
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = (ymm5 & ~mem) | ymm3
+; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm3
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm18 ^ (ymm3 & (ymm16 ^ ymm18))
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,u,u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14]
; AVX512-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ~mem) | ymm0
-; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm0
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm18 ^ (ymm0 & (ymm17 ^ ymm18))
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero
+; AVX512-FCP-NEXT: vinserti32x4 $1, %xmm3, %ymm0, %ymm25
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm25 = ymm25 ^ (ymm20 & (ymm25 ^ ymm5))
+; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm5
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm12 ^ (ymm5 & (ymm10 ^ ymm12))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm17 = ymm16 ^ (ymm17 & (ymm18 ^ ymm16))
+; AVX512-FCP-NEXT: vmovdqa %ymm8, %ymm9
+; AVX512-FCP-NEXT: vmovdqa %ymm13, %ymm11
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm10 ^ (ymm11 & (ymm12 ^ ymm10))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm15 ^ (ymm8 & (ymm1 ^ ymm15))
+; AVX512-FCP-NEXT: vmovdqa %ymm8, %ymm3
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm10 ^ (ymm8 & (ymm12 ^ ymm10))
+; AVX512-FCP-NEXT: vmovdqa %ymm13, %ymm7
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm31 ^ (ymm7 & (ymm29 ^ ymm31))
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm10
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[2,9],zero,zero,zero,xmm10[5,12,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[4,11],zero,zero,xmm7[0,7,14],zero,zero,xmm7[u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm7, %xmm10, %xmm10
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm31 ^ (ymm3 & (ymm29 ^ ymm31))
+; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm12
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm31 ^ (ymm0 & (ymm29 ^ ymm31))
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm7
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[3,10],zero,zero,zero,xmm7[6,13,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm7, %xmm3, %xmm7
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm5[2,3,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1,2,3],ymm3[4],ymm5[5,6],ymm3[7,8],ymm5[9,10,11],ymm3[12],ymm5[13,14],ymm3[15]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm22) | ymm3
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm11[2,3,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm11[0],ymm3[1],ymm11[2,3],ymm3[4],ymm11[5,6,7,8],ymm3[9],ymm11[10,11],ymm3[12],ymm11[13,14,15]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = (ymm7 & ymm22) | ymm3
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm8[2,3,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0],ymm3[1],ymm8[2,3,4],ymm3[5],ymm8[6,7,8],ymm3[9],ymm8[10,11,12],ymm3[13],ymm8[14,15]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14]
-; AVX512-FCP-NEXT: vpor %xmm7, %xmm0, %xmm0
-; AVX512-FCP-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm26
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm26 = ymm26 ^ (ymm23 & (ymm26 ^ ymm3))
-; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm13
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm20 ^ (ymm13 & (ymm12 ^ ymm20))
-; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm7
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm19 = ymm17 ^ (ymm19 & (ymm18 ^ ymm17))
-; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm0
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm20 ^ (ymm0 & (ymm12 ^ ymm20))
-; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm8
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm20 ^ (ymm10 & (ymm12 ^ ymm20))
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm3
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[4,11],zero,zero,xmm13[0,7,14],zero,zero,xmm13[u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm3, %xmm12, %xmm3
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm31 ^ (ymm7 & (ymm27 ^ ymm31))
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm12
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[3,10],zero,zero,zero,xmm12[6,13,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm0, %xmm12, %xmm0
-; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm12
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm16 ^ (ymm9 & (ymm11 ^ ymm16))
-; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm13
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm27 ^ (ymm13 & (ymm31 ^ ymm27))
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm27 ^ (ymm9 & (ymm31 ^ ymm27))
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm10[6,13],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm10
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[4,11],zero,zero,xmm10[0,7,14,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm4, %xmm10, %xmm4
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm6[0],ymm7[1,2,3],ymm6[4],ymm7[5,6],ymm6[7,8],ymm7[9,10,11],ymm6[12],ymm7[13,14],ymm6[15]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm10 | (ymm3 & ymm25)
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm13[0],ymm6[1],ymm13[2,3],ymm6[4],ymm13[5,6,7,8],ymm6[9],ymm13[10,11],ymm6[12],ymm13[13,14,15]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm0 & ymm25)
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm6[1],ymm9[2,3,4],ymm6[5],ymm9[6,7,8],ymm6[9],ymm9[10,11,12],ymm6[13],ymm9[14,15]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 | (ymm4 & ymm25)
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm14 ^ (ymm1 & (ymm2 ^ ymm14))
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm30 ^ (ymm12 & (ymm28 ^ ymm30))
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm30 ^ (ymm8 & (ymm28 ^ ymm30))
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm28 ^ (ymm1 & (ymm30 ^ ymm28))
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm12[u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u,u,u]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm3
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm5, %xmm0, %xmm5
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = (ymm5 & ymm22) | ymm0
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm30 ^ (ymm13 & (ymm2 ^ ymm30))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm27 ^ (ymm9 & (ymm26 ^ ymm27))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm27 ^ (ymm12 & (ymm26 ^ ymm27))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm26 ^ (ymm13 & (ymm27 ^ ymm26))
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm9[u,u,2,9],zero,zero,zero,xmm9[5,12],zero,zero,xmm9[u,u,u,u,u]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm3
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10,u,u,u,u,u]
; AVX512-FCP-NEXT: vpor %xmm0, %xmm3, %xmm0
-; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm12, %xmm3
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm15[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
-; AVX512-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm3
+; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm3
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm14[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
+; AVX512-FCP-NEXT: vpor %xmm3, %xmm8, %xmm3
; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm9 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm9 & (ymm3 ^ ymm0))
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm8[u,u,3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm4
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,xmm4[1,8,15],zero,zero,xmm4[4,11,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm0, %xmm4, %xmm0
-; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm12, %xmm4
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm15[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
-; AVX512-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm24 & (ymm3 ^ ymm0))
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm12[u,u,3,10],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[u,u,u,u,u]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm8
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm0, %xmm8, %xmm0
+; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm9, %xmm6
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm14[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
+; AVX512-FCP-NEXT: vpor %xmm6, %xmm8, %xmm6
; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 ^ (ymm9 & (ymm4 ^ ymm0))
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (ymm24 & (ymm6 ^ ymm0))
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm0
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[5,12,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,4,11],zero,zero,xmm1[0,7,14],zero,zero,xmm1[u,u,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm0, %xmm1, %xmm1
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm13[u,u,4,11],zero,zero,xmm13[0,7,14],zero,zero,xmm13[u,u,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm0, %xmm8, %xmm8
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,8,15,22,29,20,27,18,25,128,128,128,128,128,128,128,128,128,128]
-; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm12, %xmm5
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm15[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
-; AVX512-FCP-NEXT: vpor %xmm5, %xmm8, %xmm5
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (ymm9 & (ymm5 ^ ymm1))
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,2,4,6,0,0,0,0]
-; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %ymm8
-; AVX512-FCP-NEXT: vpermd %ymm8, %ymm1, %ymm1
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,10,13,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm1
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm3 & (zmm1 ^ zmm10))
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [1,3,4,6,0,0,0,0]
-; AVX512-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm9
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[1,4,11,14,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm9, %zmm4, %zmm4
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm3 & (zmm4 ^ zmm7))
+; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm9, %xmm9
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm14[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
+; AVX512-FCP-NEXT: vpor %xmm11, %xmm9, %xmm9
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 ^ (ymm24 & (ymm9 ^ ymm8))
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,2,4,6,0,0,0,0]
+; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %ymm11
+; AVX512-FCP-NEXT: vpermd %ymm11, %ymm8, %ymm8
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,7,10,13,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm3, %zmm3
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm8 & (zmm3 ^ zmm10))
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [1,3,4,6,0,0,0,0]
+; AVX512-FCP-NEXT: vpermd %ymm11, %ymm10, %ymm10
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[1,4,11,14,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm6, %zmm6
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm8 & (zmm6 ^ zmm7))
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,3,5,6,0,0,0,0]
-; AVX512-FCP-NEXT: vpermd %ymm8, %ymm7, %ymm7
+; AVX512-FCP-NEXT: vpermd %ymm11, %ymm7, %ymm7
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[2,5,8,15,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm5, %zmm5
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm3 & (zmm5 ^ zmm6))
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm29, %zmm0, %zmm3
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm9, %zmm7
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm8 & (zmm7 ^ zmm5))
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4
; AVX512-FCP-NEXT: movw $-512, %ax # imm = 0xFE00
; AVX512-FCP-NEXT: kmovw %eax, %k1
-; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm1 {%k1}
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm26, %zmm0, %zmm3
-; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm4 {%k1}
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15]
+; AVX512-FCP-NEXT: vmovdqa32 %zmm4, %zmm3 {%k1}
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm25, %zmm0, %zmm4
+; AVX512-FCP-NEXT: vmovdqa32 %zmm4, %zmm6 {%k1}
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15]
; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm0
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm11[u,u,u,u,2,9],zero,zero,zero,xmm11[5,12],zero,zero,xmm11[u,u,u]
-; AVX512-FCP-NEXT: vextracti128 $1, %ymm11, %xmm3
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10,u,u,u]
-; AVX512-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & mem) | ymm0
-; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm3
-; AVX512-FCP-NEXT: vextracti32x4 $1, %ymm19, %xmm0
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[u,u,u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[u,u,u]
+; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10,u,u,u]
+; AVX512-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & mem) | ymm0
+; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm2
+; AVX512-FCP-NEXT: vextracti32x4 $1, %ymm17, %xmm0
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,0,7,14],zero,zero,xmm3[3,10],zero,zero,zero
-; AVX512-FCP-NEXT: vpor %xmm0, %xmm3, %xmm0
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero
+; AVX512-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm5 {%k1}
+; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm7 {%k1}
; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vmovaps %zmm0, (%rsi)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm24, (%rdx)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm21, (%rcx)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm22, (%r8)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%r9)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm23, (%rdx)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm19, (%rcx)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm21, (%r8)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm3, (%r9)
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rax)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm6, (%rax)
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rax)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm7, (%rax)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: load_i8_stride7_vf64:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: subq $24, %rsp
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
-; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm12
-; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm13
-; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %ymm31
-; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm1
-; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm23
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm13 ^ (ymm1 & (ymm12 ^ ymm13))
+; AVX512DQ-NEXT: subq $56, %rsp
+; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm27 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
+; AVX512DQ-NEXT: vmovdqa64 (%rdi), %ymm25
+; AVX512DQ-NEXT: vmovdqa64 32(%rdi), %ymm18
+; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm13
+; AVX512DQ-NEXT: vmovdqa64 %ymm27, %ymm1
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm18 ^ (ymm1 & (ymm25 ^ ymm18))
; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[5,12],zero,zero,xmm2[1,8,15,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u]
; AVX512DQ-NEXT: vpor %xmm2, %xmm1, %xmm1
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512DQ-NEXT: vmovdqa64 96(%rdi), %ymm28
-; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm2
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm31 ^ (ymm2 & (ymm28 ^ ymm31))
-; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm11
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm11[2],ymm2[3,4],ymm11[5],ymm2[6,7,8,9],ymm11[10],ymm2[11,12],ymm11[13],ymm2[14,15]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm1 & mem)
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm14 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535]
-; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %ymm25
-; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm4
-; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm1
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm25 ^ (ymm1 & (ymm4 ^ ymm25))
-; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm3
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u,u,u]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512DQ-NEXT: vmovdqa 96(%rdi), %ymm10
+; AVX512DQ-NEXT: vmovdqa %ymm8, %ymm2
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm13 ^ (ymm2 & (ymm10 ^ ymm13))
+; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm7
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7,8,9],ymm7[10],ymm2[11,12],ymm7[13],ymm2[14,15]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 | (ymm1 & mem)
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535]
+; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %ymm30
+; AVX512DQ-NEXT: vmovdqa64 160(%rdi), %ymm31
+; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm1
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm30 ^ (ymm1 & (ymm31 ^ ymm30))
+; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[6,13,u,u,u,u]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u]
-; AVX512DQ-NEXT: vpor %xmm3, %xmm1, %xmm1
+; AVX512DQ-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-NEXT: vmovdqa 192(%rdi), %xmm0
-; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11]
-; AVX512DQ-NEXT: vpshufb %xmm5, %xmm0, %xmm3
-; AVX512DQ-NEXT: vmovdqa64 %xmm5, %xmm30
-; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm20
-; AVX512DQ-NEXT: vmovdqa 208(%rdi), %xmm10
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm10[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7]
+; AVX512DQ-NEXT: vmovdqa 192(%rdi), %xmm3
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm23
+; AVX512DQ-NEXT: vmovdqa 208(%rdi), %xmm0
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm26
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
; AVX512DQ-NEXT: vmovdqa 240(%rdi), %xmm3
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm3[5,12,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vmovdqa 224(%rdi), %xmm5
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[0,7,14],zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpor %xmm6, %xmm7, %xmm6
-; AVX512DQ-NEXT: vinserti32x4 $2, %xmm6, %zmm1, %zmm22
-; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm19 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm19 & (zmm22 ^ zmm2))
-; AVX512DQ-NEXT: vmovdqa64 288(%rdi), %ymm18
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm3[5,12,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vmovdqa 224(%rdi), %xmm2
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[0,7,14],zero,zero,xmm2[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm22
+; AVX512DQ-NEXT: vpor %xmm11, %xmm14, %xmm11
+; AVX512DQ-NEXT: vinserti32x4 $2, %xmm11, %zmm1, %zmm14
+; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm4 & (zmm14 ^ zmm6))
+; AVX512DQ-NEXT: vmovdqa64 288(%rdi), %ymm19
; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %ymm17
-; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm2
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm18 ^ (ymm2 & (ymm17 ^ ymm18))
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm2[u,u,u,u,u,3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm2
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u]
-; AVX512DQ-NEXT: vpor %xmm6, %xmm2, %xmm2
-; AVX512DQ-NEXT: vmovdqa64 352(%rdi), %ymm21
+; AVX512DQ-NEXT: vmovdqa %ymm8, %ymm6
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm19 ^ (ymm6 & (ymm17 ^ ymm19))
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm15 = xmm6[u,u,u,u,u,3,10],zero,zero,zero,xmm6[6,13],zero,zero,xmm6[u,u]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm6, %xmm6
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u],zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,11,u,u]
+; AVX512DQ-NEXT: vpor %xmm6, %xmm15, %xmm15
+; AVX512DQ-NEXT: vmovdqa 352(%rdi), %ymm12
; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %ymm16
-; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm7
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm7 = ymm21 ^ (ymm7 & (ymm16 ^ ymm21))
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm8[2],ymm7[3,4,5],ymm8[6],ymm7[7,8,9],ymm8[10],ymm7[11,12,13],ymm8[14],ymm7[15]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm9
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm12 ^ (ymm9 & (ymm16 ^ ymm12))
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm9[2,3,0,1]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0,1],ymm2[2],ymm9[3,4,5],ymm2[6],ymm9[7,8,9],ymm2[10],ymm9[11,12,13],ymm2[14],ymm9[15]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm24 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm2 & ymm24)
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
-; AVX512DQ-NEXT: vmovdqa %ymm7, %ymm2
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm13 ^ (ymm2 & (ymm12 ^ ymm13))
-; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm15
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm15 & ymm24)
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm15 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
+; AVX512DQ-NEXT: vmovdqa64 416(%rdi), %ymm20
+; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %ymm21
+; AVX512DQ-NEXT: vmovdqa %ymm15, %ymm9
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm20 ^ (ymm9 & (ymm21 ^ ymm20))
+; AVX512DQ-NEXT: vextracti128 $1, %ymm9, %xmm11
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u],zero,zero,zero,xmm11[6,13],zero,zero,xmm11[2,9]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm9[4,11],zero,zero
+; AVX512DQ-NEXT: vpor %xmm11, %xmm9, %xmm9
+; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 ^ (ymm6 & (ymm9 ^ ymm2))
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpmovsxwd {{.*#+}} zmm29 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0,0,0,0,0]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm29 & (zmm0 ^ zmm14))
+; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vmovdqa %ymm15, %ymm2
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm18 ^ (ymm2 & (ymm25 ^ ymm18))
+; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm9
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[6,13],zero,zero,xmm9[2,9,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpor %xmm2, %xmm15, %xmm2
-; AVX512DQ-NEXT: vmovdqa64 %ymm23, %ymm15
-; AVX512DQ-NEXT: vmovdqa64 %ymm23, %ymm29
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm15 = ymm28 ^ (ymm15 & (ymm31 ^ ymm28))
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0,1],ymm11[2],ymm15[3,4,5],ymm11[6],ymm15[7,8,9],ymm11[10],ymm15[11,12,13],ymm11[14],ymm15[15]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm15 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm15[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm15 = ymm15 | (ymm2 & ~mem)
-; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm2
-; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm6
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm4 ^ ymm25))
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u]
+; AVX512DQ-NEXT: vpor %xmm2, %xmm9, %xmm2
+; AVX512DQ-NEXT: vmovdqa64 %ymm27, %ymm9
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm10 ^ (ymm9 & (ymm13 ^ ymm10))
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm7[2],ymm9[3,4,5],ymm7[6],ymm9[7,8,9],ymm7[10],ymm9[11,12,13],ymm7[14],ymm9[15]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm2 & ~mem)
+; AVX512DQ-NEXT: vmovdqa %ymm8, %ymm2
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm30 ^ (ymm2 & (ymm31 ^ ymm30))
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = xmm2[u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u]
; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm2
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u]
-; AVX512DQ-NEXT: vpor %xmm0, %xmm2, %xmm0
-; AVX512DQ-NEXT: vmovdqa %xmm3, %xmm4
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm3[6,13,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[1,8,15],zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vmovdqa64 %xmm5, %xmm26
-; AVX512DQ-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm10[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm1
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7]
-; AVX512DQ-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm23
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm23 = zmm23 ^ (zmm19 & (zmm23 ^ zmm15))
-; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm0
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm13 ^ (ymm0 & (ymm12 ^ ymm13))
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[2,9],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[0,7,14],zero,zero,xmm0[3,10,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpor %xmm2, %xmm0, %xmm0
-; AVX512DQ-NEXT: vmovdqa %ymm7, %ymm2
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm28 ^ (ymm2 & (ymm31 ^ ymm28))
-; AVX512DQ-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm11[3],ymm2[4,5],ymm11[6],ymm2[7,8,9,10],ymm11[11],ymm2[12,13],ymm11[14],ymm2[15]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm27 = [18446744073709551615,255,18446744073709486080,18446744073709551615]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm0 & ymm27)
-; AVX512DQ-NEXT: vmovdqa64 %ymm29, %ymm0
-; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm15
-; AVX512DQ-NEXT: vmovdqu64 %ymm25, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm6 ^ (ymm0 & (ymm25 ^ ymm6))
-; AVX512DQ-NEXT: vmovdqa64 %ymm6, %ymm19
-; AVX512DQ-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u,u,u]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u]
-; AVX512DQ-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT: vmovdqa64 %xmm30, %xmm3
-; AVX512DQ-NEXT: vpshufb %xmm3, %xmm10, %xmm3
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm6
-; AVX512DQ-NEXT: vmovdqa64 %xmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7]
+; AVX512DQ-NEXT: vpor %xmm2, %xmm11, %xmm2
+; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm1
-; AVX512DQ-NEXT: vmovdqa64 %xmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT: vpor %xmm3, %xmm5, %xmm3
-; AVX512DQ-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm25
-; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} zmm20 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm25 = zmm2 ^ (zmm20 & (zmm25 ^ zmm2))
-; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm0
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm13 ^ (ymm0 & (ymm12 ^ ymm13))
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpor %xmm2, %xmm0, %xmm0
-; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm2
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm28 ^ (ymm2 & (ymm31 ^ ymm28))
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm11[0],ymm2[1,2],ymm11[3],ymm2[4,5,6],ymm11[7,8],ymm2[9,10],ymm11[11],ymm2[12,13,14],ymm11[15]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm0 & ymm27)
-; AVX512DQ-NEXT: vmovdqa64 %ymm27, %ymm11
-; AVX512DQ-NEXT: vmovdqa %ymm7, %ymm0
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm19 ^ (ymm0 & (ymm15 ^ ymm19))
-; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[2,9,u,u,u,u,u]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u,u,u]
-; AVX512DQ-NEXT: vpor %xmm2, %xmm0, %xmm0
-; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpshufb %xmm2, %xmm10, %xmm5
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm15 = xmm6[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
-; AVX512DQ-NEXT: vpor %xmm5, %xmm15, %xmm5
-; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (mem & (ymm5 ^ ymm0))
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm4[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpor %xmm0, %xmm15, %xmm0
-; AVX512DQ-NEXT: vmovdqa64 416(%rdi), %ymm26
-; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm5, %zmm30
-; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %ymm27
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm30 = zmm3 ^ (zmm20 & (zmm30 ^ zmm3))
-; AVX512DQ-NEXT: vmovdqa %ymm7, %ymm0
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm26 ^ (ymm0 & (ymm27 ^ ymm26))
-; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero
-; AVX512DQ-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm29 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm29 & (ymm0 ^ ymm8))
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm20
-; AVX512DQ-NEXT: vpmovsxwd {{.*#+}} zmm8 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0,0,0,0,0]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm8 & (zmm20 ^ zmm22))
-; AVX512DQ-NEXT: vmovdqa %ymm7, %ymm0
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm17 ^ (ymm0 & (ymm18 ^ ymm17))
-; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u]
-; AVX512DQ-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm3
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm21 ^ (ymm3 & (ymm16 ^ ymm21))
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm3[2,3,0,1]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5],ymm5[6],ymm3[7,8,9,10],ymm5[11],ymm3[12,13],ymm5[14],ymm3[15]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm0 & ymm24)
-; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm0
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm26 ^ (ymm0 & (ymm27 ^ ymm26))
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm0[5,12],zero,zero
-; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[0,7,14],zero,zero,xmm0[3,10]
-; AVX512DQ-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm29 & (ymm0 ^ ymm3))
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm22
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm8 & (zmm22 ^ zmm23))
-; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm0
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm17 ^ (ymm0 & (ymm18 ^ ymm17))
-; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u]
-; AVX512DQ-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX512DQ-NEXT: vmovdqa %ymm7, %ymm3
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm16 ^ (ymm3 & (ymm21 ^ ymm16))
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm3[2,3,0,1]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1,2],ymm5[3],ymm3[4,5,6],ymm5[7,8],ymm3[9,10],ymm5[11],ymm3[12,13,14],ymm5[15]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = xmm1[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vmovdqa64 %xmm23, %xmm0
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm0[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm11[7]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm3[6,13,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm28
+; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm3
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[1,8,15],zero,zero,xmm3[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpor %xmm11, %xmm14, %xmm11
+; AVX512DQ-NEXT: vinserti32x4 $2, %xmm11, %zmm2, %zmm2
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm4 & (zmm2 ^ zmm9))
+; AVX512DQ-NEXT: vmovdqa %ymm15, %ymm9
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm17 ^ (ymm9 & (ymm19 ^ ymm17))
+; AVX512DQ-NEXT: vextracti128 $1, %ymm9, %xmm11
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,4,11],zero,zero,xmm9[0,7,14],zero,zero,xmm9[u,u]
+; AVX512DQ-NEXT: vpor %xmm11, %xmm9, %xmm9
+; AVX512DQ-NEXT: vmovdqa %ymm8, %ymm11
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = ymm12 ^ (ymm11 & (ymm16 ^ ymm12))
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm14 = ymm11[2,3,0,1]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm14[3],ymm11[4,5],ymm14[6],ymm11[7,8,9,10],ymm14[11],ymm11[12,13],ymm14[14],ymm11[15]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm9 & ymm24)
+; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm9
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm20 ^ (ymm9 & (ymm21 ^ ymm20))
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm9[5,12],zero,zero
+; AVX512DQ-NEXT: vextracti128 $1, %ymm9, %xmm9
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u],zero,zero,xmm9[0,7,14],zero,zero,xmm9[3,10]
+; AVX512DQ-NEXT: vpor %xmm14, %xmm9, %xmm9
+; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 ^ (ymm6 & (ymm9 ^ ymm11))
+; AVX512DQ-NEXT: vmovdqa64 %ymm6, %ymm26
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm4
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm29 & (zmm4 ^ zmm2))
+; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm2
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm18 ^ (ymm2 & (ymm25 ^ ymm18))
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm2
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpor %xmm2, %xmm9, %xmm2
+; AVX512DQ-NEXT: vmovdqa %ymm15, %ymm9
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm10 ^ (ymm9 & (ymm13 ^ ymm10))
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1,2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7,8,9,10],ymm7[11],ymm9[12,13],ymm7[14],ymm9[15]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm22 = [18446744073709551615,255,18446744073709486080,18446744073709551615]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm2 & ymm22)
+; AVX512DQ-NEXT: vmovdqa64 %ymm27, %ymm2
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm31 ^ (ymm2 & (ymm30 ^ ymm31))
+; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm11
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,zero,xmm11[5,12],zero,zero,xmm11[1,8,15,u,u,u,u]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u]
+; AVX512DQ-NEXT: vpor %xmm2, %xmm11, %xmm2
+; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = xmm1[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vmovdqa %xmm1, %xmm4
+; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm0[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vmovdqa64 %xmm23, %xmm1
+; AVX512DQ-NEXT: vmovdqa64 %xmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm11[7]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vmovdqa %xmm3, %xmm6
+; AVX512DQ-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT: vmovdqa64 %xmm28, %xmm0
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm0[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpor %xmm11, %xmm14, %xmm11
+; AVX512DQ-NEXT: vinserti32x4 $2, %xmm11, %zmm2, %zmm2
+; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} zmm11 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm9 ^ (zmm11 & (zmm2 ^ zmm9))
+; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm9
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm17 ^ (ymm9 & (ymm19 ^ ymm17))
+; AVX512DQ-NEXT: vextracti128 $1, %ymm9, %xmm14
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[6,13,u,u]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,5,12],zero,zero,xmm9[1,8,15],zero,zero,xmm9[u,u]
+; AVX512DQ-NEXT: vpor %xmm14, %xmm9, %xmm9
+; AVX512DQ-NEXT: vmovdqa %ymm15, %ymm14
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm14 = ymm16 ^ (ymm14 & (ymm12 ^ ymm16))
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm14[2,3,0,1]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm14[1,2],ymm3[3],ymm14[4,5,6],ymm3[7,8],ymm14[9,10],ymm3[11],ymm14[12,13,14],ymm3[15]
; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm0 & ymm24)
-; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm0
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm26 ^ (ymm0 & (ymm27 ^ ymm26))
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm0[6,13],zero,zero
-; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11]
-; AVX512DQ-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm29 & (ymm0 ^ ymm3))
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm23
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm23 = zmm23 ^ (zmm8 & (zmm23 ^ zmm25))
-; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm0
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm16 ^ (ymm0 & (ymm21 ^ ymm16))
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3],ymm3[4],ymm0[5,6],ymm3[7,8],ymm0[9,10,11],ymm3[12],ymm0[13,14],ymm3[15]
-; AVX512DQ-NEXT: vpshufb %ymm2, %ymm0, %ymm0
-; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm2
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm17 ^ (ymm2 & (ymm18 ^ ymm17))
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm9 & ymm24)
+; AVX512DQ-NEXT: vmovdqa %ymm8, %ymm9
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm20 ^ (ymm9 & (ymm21 ^ ymm20))
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm9[6,13],zero,zero
+; AVX512DQ-NEXT: vextracti128 $1, %ymm9, %xmm9
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u],zero,zero,xmm9[1,8,15],zero,zero,xmm9[4,11]
+; AVX512DQ-NEXT: vpor %xmm14, %xmm9, %xmm9
+; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 ^ (ymm26 & (ymm9 ^ ymm3))
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm23
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm23 = zmm23 ^ (zmm29 & (zmm23 ^ zmm2))
+; AVX512DQ-NEXT: vmovdqa %ymm8, %ymm2
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm18 ^ (ymm2 & (ymm25 ^ ymm18))
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm2
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm24) | ymm0
-; AVX512DQ-NEXT: vmovdqa %ymm7, %ymm0
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm27 ^ (ymm0 & (ymm26 ^ ymm27))
-; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,4,11],zero,zero,xmm0[0,7,14],zero,zero
-; AVX512DQ-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm29 & (ymm0 ^ ymm2))
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm25
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm25 = zmm25 ^ (zmm8 & (zmm25 ^ zmm30))
-; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm0
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm16 ^ (ymm0 & (ymm21 ^ ymm16))
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6,7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13,14,15]
-; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} ymm19 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
-; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm2
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm18 ^ (ymm2 & (ymm17 ^ ymm18))
+; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm3
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm10 ^ (ymm3 & (ymm13 ^ ymm10))
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm3[1,2],ymm7[3],ymm3[4,5,6],ymm7[7,8],ymm3[9,10],ymm7[11],ymm3[12,13,14],ymm7[15]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm2 & ymm22)
+; AVX512DQ-NEXT: vmovdqa %ymm15, %ymm2
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm31 ^ (ymm2 & (ymm30 ^ ymm31))
+; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm7
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u,u,u]
+; AVX512DQ-NEXT: vpor %xmm7, %xmm2, %xmm2
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,5,12,19,26,17,24,31,22,29,u,128,128,128,128,128,128,128,128]
+; AVX512DQ-NEXT: vpshufb %xmm7, %xmm4, %xmm9
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm1[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
+; AVX512DQ-NEXT: vpor %xmm9, %xmm14, %xmm9
+; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm28 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 ^ (ymm28 & (ymm9 ^ ymm2))
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[3,10],zero,zero,zero,xmm6[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm0[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vmovdqa %xmm0, %xmm6
+; AVX512DQ-NEXT: vpor %xmm2, %xmm14, %xmm2
+; AVX512DQ-NEXT: vinserti32x4 $2, %xmm2, %zmm9, %zmm2
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm3 ^ (zmm11 & (zmm2 ^ zmm3))
+; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm3
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm16 ^ (ymm3 & (ymm12 ^ ymm16))
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm3[2,3,0,1]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0],ymm3[1,2,3],ymm9[4],ymm3[5,6],ymm9[7,8],ymm3[9,10,11],ymm9[12],ymm3[13,14],ymm9[15]
+; AVX512DQ-NEXT: vpshufb %ymm7, %ymm3, %ymm3
+; AVX512DQ-NEXT: vmovdqa %ymm8, %ymm7
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm7 = ymm17 ^ (ymm7 & (ymm19 ^ ymm17))
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = xmm7[u,u,u,u,u,6,13],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[u,u]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm7
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u],zero,zero,xmm7[4,11],zero,zero,xmm7[0,7,14,u,u]
+; AVX512DQ-NEXT: vpor %xmm7, %xmm9, %xmm7
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm7 = (ymm7 & ymm24) | ymm3
+; AVX512DQ-NEXT: vmovdqa %ymm15, %ymm3
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm21 ^ (ymm3 & (ymm20 ^ ymm21))
+; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm9
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u],zero,zero,xmm9[2,9],zero,zero,zero,xmm9[5,12]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero
+; AVX512DQ-NEXT: vpor %xmm3, %xmm9, %xmm3
+; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm26 & (ymm3 ^ ymm7))
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm22
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm29 & (zmm22 ^ zmm2))
+; AVX512DQ-NEXT: vmovdqa %ymm8, %ymm2
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm16 ^ (ymm2 & (ymm12 ^ ymm16))
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6,7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14,15]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,13,20,27,18,25,16,23,30,u,128,128,128,128,128,128,128,128]
+; AVX512DQ-NEXT: vpshufb %ymm9, %ymm2, %ymm2
+; AVX512DQ-NEXT: vmovdqa64 %ymm27, %ymm3
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm19 ^ (ymm3 & (ymm17 ^ ymm19))
+; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm7
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u]
+; AVX512DQ-NEXT: vpor %xmm7, %xmm3, %xmm3
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ymm24) | ymm2
+; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm2
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm21 ^ (ymm2 & (ymm20 ^ ymm21))
+; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm7
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[3,10],zero,zero,zero,xmm7[6,13]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero
+; AVX512DQ-NEXT: vpor %xmm7, %xmm2, %xmm2
+; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm4
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 ^ (ymm26 & (ymm4 ^ ymm3))
+; AVX512DQ-NEXT: vmovdqa64 %ymm26, %ymm0
+; AVX512DQ-NEXT: vmovdqa64 %ymm27, %ymm2
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm12 ^ (ymm2 & (ymm16 ^ ymm12))
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7,8],ymm3[9],ymm2[10,11,12],ymm3[13],ymm2[14,15]
+; AVX512DQ-NEXT: vmovdqa %ymm15, %ymm3
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm19 ^ (ymm3 & (ymm17 ^ ymm19))
+; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm7
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u]
+; AVX512DQ-NEXT: vpor %xmm7, %xmm3, %xmm3
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,128,128,128,128,128,128,128,128,128,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm2
+; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm26
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = (ymm3 & ~mem) | ymm2
+; AVX512DQ-NEXT: vmovdqa %ymm8, %ymm2
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm21 ^ (ymm2 & (ymm20 ^ ymm21))
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero
+; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm2
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14]
+; AVX512DQ-NEXT: vpor %xmm7, %xmm2, %xmm2
+; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm7
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (ymm0 & (ymm7 ^ ymm3))
+; AVX512DQ-NEXT: vmovdqa %ymm15, %ymm2
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm18 ^ ymm25))
; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u]
-; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ymm24) | ymm0
-; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm0
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm27 ^ (ymm0 & (ymm26 ^ ymm27))
-; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero
-; AVX512DQ-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm24
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm24 = ymm24 ^ (ymm29 & (ymm24 ^ ymm2))
-; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm0
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm21 ^ (ymm0 & (ymm16 ^ ymm21))
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7,8],ymm2[9],ymm0[10,11,12],ymm2[13],ymm0[14,15]
-; AVX512DQ-NEXT: vmovdqa %ymm7, %ymm2
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm18 ^ (ymm2 & (ymm17 ^ ymm18))
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,zero,xmm2[u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vporq %xmm3, %xmm2, %xmm24
+; AVX512DQ-NEXT: vmovdqa %ymm8, %ymm11
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = ymm10 ^ (ymm11 & (ymm13 ^ ymm10))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm27 = ymm20 ^ (ymm27 & (ymm21 ^ ymm20))
+; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm2
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm25 ^ (ymm2 & (ymm18 ^ ymm25))
; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9,u,u,u]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u]
-; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,128,128,128,128,128,128,128,128,128,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpshufb %ymm8, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ~mem) | ymm0
-; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm0
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm27 ^ (ymm0 & (ymm26 ^ ymm27))
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,u,u,u,u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero
-; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14]
-; AVX512DQ-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX512DQ-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm30
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm30 = ymm30 ^ (ymm29 & (ymm30 ^ ymm2))
-; AVX512DQ-NEXT: vmovdqa %ymm7, %ymm0
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm13 ^ ymm12))
-; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,11],zero,zero,xmm0[0,7,14],zero,zero,xmm0[u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vporq %xmm2, %xmm0, %xmm29
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm19 = ymm26 ^ (ymm19 & (ymm27 ^ ymm26))
-; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm0
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm13 ^ ymm12))
-; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[3,10],zero,zero,zero,xmm2[6,13,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpor %xmm2, %xmm0, %xmm0
-; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm2
-; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm15
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm12 ^ (ymm9 & (ymm13 ^ ymm12))
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm28 ^ (ymm2 & (ymm31 ^ ymm28))
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm9[6,13],zero,zero,xmm9[2,9],zero,zero,zero,xmm9[u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm9, %xmm5
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[4,11],zero,zero,xmm5[0,7,14,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpor %xmm3, %xmm5, %xmm3
-; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm5
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm17 = ymm18 ^ (ymm14 & (ymm17 ^ ymm18))
-; AVX512DQ-NEXT: vmovdqa %ymm7, %ymm9
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm31 ^ (ymm9 & (ymm28 ^ ymm31))
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm14 = ymm31 ^ (ymm14 & (ymm28 ^ ymm31))
-; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1,2,3],ymm4[4],ymm2[5,6],ymm4[7,8],ymm2[9,10,11],ymm4[12],ymm2[13,14],ymm4[15]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT: vmovdqa %ymm11, %ymm1
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm12 = ymm12 | (ymm29 & ymm11)
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0],ymm4[1],ymm9[2,3],ymm4[4],ymm9[5,6,7,8],ymm4[9],ymm9[10,11],ymm4[12],ymm9[13,14,15]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm0 & ymm1)
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0],ymm4[1],ymm14[2,3,4],ymm4[5],ymm14[6,7,8],ymm4[9],ymm14[10,11,12],ymm4[13],ymm14[14,15]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm3 & ymm1)
-; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm6 ^ (ymm5 & (ymm13 ^ ymm6))
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm5[u,u,2,9],zero,zero,zero,xmm5[5,12],zero,zero,xmm5[u,u,u,u,u]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm2
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[5,12],zero,zero,xmm2[1,8,15],zero,zero,xmm2[u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vporq %xmm3, %xmm2, %xmm20
+; AVX512DQ-NEXT: vmovdqa %ymm5, %ymm14
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm17 = ymm19 ^ (ymm5 & (ymm17 ^ ymm19))
+; AVX512DQ-NEXT: vmovdqa %ymm15, %ymm2
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm13 ^ (ymm2 & (ymm10 ^ ymm13))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm13 ^ (ymm5 & (ymm10 ^ ymm13))
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm11[2,3,0,1]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm11[1,2,3],ymm3[4],ymm11[5,6],ymm3[7,8],ymm11[9,10,11],ymm3[12],ymm11[13,14],ymm3[15]
+; AVX512DQ-NEXT: vmovdqa %ymm8, %ymm11
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm25 ^ (ymm8 & (ymm18 ^ ymm25))
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm1 = [18446744073709551615,255,18446744073709486080,18446744073709551615]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm24 = (ymm24 & ymm1) | ymm3
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6,7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14,15]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm20 = (ymm20 & ymm1) | ymm2
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm5[2,3,0,1]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm2[1],ymm5[2,3,4],ymm2[5],ymm5[6,7,8],ymm2[9],ymm5[10,11,12],ymm2[13],ymm5[14,15]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[6,13],zero,zero,xmm8[2,9],zero,zero,zero,xmm8[u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm3
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpor %xmm2, %xmm3, %xmm10
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm1) | ymm0
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm14 = ymm31 ^ (ymm14 & (ymm30 ^ ymm31))
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm14[u,u,2,9],zero,zero,zero,xmm14[5,12],zero,zero,xmm14[u,u,u,u,u]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm14, %xmm2
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u]
; AVX512DQ-NEXT: vpor %xmm0, %xmm2, %xmm0
-; AVX512DQ-NEXT: vmovdqa %xmm10, %xmm1
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpshufb %xmm2, %xmm10, %xmm2
-; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
+; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512DQ-NEXT: vpshufb %xmm9, %xmm1, %xmm2
+; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm9[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
; AVX512DQ-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-NEXT: vpmovsxdq {{.*#+}} ymm18 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (ymm18 & (ymm2 ^ ymm0))
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm16 = ymm21 ^ (ymm7 & (ymm16 ^ ymm21))
-; AVX512DQ-NEXT: vmovd {{.*#+}} xmm10 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm15 = ymm6 ^ (ymm15 & (ymm13 ^ ymm6))
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm7 = ymm13 ^ (ymm7 & (ymm6 ^ ymm13))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (ymm28 & (ymm2 ^ ymm0))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm16 = ymm12 ^ (ymm15 & (ymm16 ^ ymm12))
+; AVX512DQ-NEXT: vmovd {{.*#+}} xmm12 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm11 = ymm31 ^ (ymm11 & (ymm30 ^ ymm31))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm15 = ymm30 ^ (ymm15 & (ymm31 ^ ymm30))
; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; AVX512DQ-NEXT: vpshufb %xmm10, %xmm14, %xmm0
-; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpshufb %xmm12, %xmm14, %xmm0
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm6[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm2
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm8
; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm0 & (zmm2 ^ zmm12))
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm15[u,u,3,10],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[u,u,u,u,u]
-; AVX512DQ-NEXT: vextracti128 $1, %ymm15, %xmm5
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,xmm5[1,8,15],zero,zero,xmm5[4,11,u,u,u,u,u]
-; AVX512DQ-NEXT: vpor %xmm3, %xmm5, %xmm3
-; AVX512DQ-NEXT: vpshufb %xmm8, %xmm1, %xmm5
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm4[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
-; AVX512DQ-NEXT: vpor %xmm5, %xmm8, %xmm5
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm0 & (zmm8 ^ zmm24))
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm11[u,u,3,10],zero,zero,zero,xmm11[6,13],zero,zero,xmm11[u,u,u,u,u]
+; AVX512DQ-NEXT: vextracti128 $1, %ymm11, %xmm3
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11,u,u,u,u,u]
+; AVX512DQ-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX512DQ-NEXT: vmovdqa64 %ymm26, %ymm3
+; AVX512DQ-NEXT: vpshufb %xmm3, %xmm1, %xmm3
+; AVX512DQ-NEXT: vmovdqa %xmm9, %xmm11
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
+; AVX512DQ-NEXT: vpor %xmm3, %xmm9, %xmm3
+; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm28 & (ymm3 ^ ymm2))
+; AVX512DQ-NEXT: vextracti128 $1, %ymm15, %xmm2
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = xmm15[u,u,4,11],zero,zero,xmm15[0,7,14],zero,zero,xmm15[u,u,u,u,u]
+; AVX512DQ-NEXT: vpor %xmm2, %xmm9, %xmm2
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,8,15,22,29,20,27,18,25,128,128,128,128,128,128,128,128,128,128]
+; AVX512DQ-NEXT: vpshufb %xmm9, %xmm1, %xmm5
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
+; AVX512DQ-NEXT: vpor %xmm5, %xmm11, %xmm5
+; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (ymm18 & (ymm5 ^ ymm3))
-; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm3
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u,u,u,u]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,4,11],zero,zero,xmm7[0,7,14],zero,zero,xmm7[u,u,u,u,u]
-; AVX512DQ-NEXT: vpor %xmm3, %xmm7, %xmm3
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,8,15,22,29,20,27,18,25,128,128,128,128,128,128,128,128,128,128]
-; AVX512DQ-NEXT: vpshufb %xmm7, %xmm1, %xmm8
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
-; AVX512DQ-NEXT: vpor %xmm12, %xmm8, %xmm8
-; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 ^ (ymm18 & (ymm8 ^ ymm3))
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm12 = xmm14[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm12[0],xmm3[0],xmm12[1],xmm3[1],xmm12[2],xmm3[2],xmm12[3],xmm3[3]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 ^ (ymm28 & (ymm5 ^ ymm2))
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = xmm14[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm0 & (zmm2 ^ zmm20))
+; AVX512DQ-NEXT: vpshufb %xmm12, %xmm6, %xmm3
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = xmm14[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3]
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm5, %zmm3
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm0 & (zmm3 ^ zmm11))
-; AVX512DQ-NEXT: vpshufb %xmm10, %xmm13, %xmm5
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm14[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm8, %zmm5
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm0 & (zmm5 ^ zmm9))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm0 & (zmm3 ^ zmm10))
; AVX512DQ-NEXT: movw $-512, %ax # imm = 0xFE00
; AVX512DQ-NEXT: kmovw %eax, %k1
-; AVX512DQ-NEXT: vinserti32x8 $1, %ymm24, %zmm0, %zmm2 {%k1}
-; AVX512DQ-NEXT: vinserti32x8 $1, %ymm30, %zmm0, %zmm3 {%k1}
+; AVX512DQ-NEXT: vinserti32x8 $1, %ymm4, %zmm0, %zmm8 {%k1}
+; AVX512DQ-NEXT: vinserti32x8 $1, %ymm7, %zmm0, %zmm2 {%k1}
; AVX512DQ-NEXT: vmovdqa64 %ymm16, %ymm1
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm16[2,3,0,1]
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
-; AVX512DQ-NEXT: vpshufb %ymm7, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpshufb %ymm9, %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa64 %ymm17, %ymm1
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[u,u,u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[u,u,u]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[u,u,u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[u,u,u]
; AVX512DQ-NEXT: vextracti32x4 $1, %ymm17, %xmm1
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10,u,u,u]
-; AVX512DQ-NEXT: vpor %xmm6, %xmm1, %xmm1
+; AVX512DQ-NEXT: vpor %xmm5, %xmm1, %xmm1
; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & mem) | ymm0
-; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm4
-; AVX512DQ-NEXT: vextracti32x4 $1, %ymm19, %xmm0
+; AVX512DQ-NEXT: vmovdqa64 %ymm27, %ymm4
+; AVX512DQ-NEXT: vextracti32x4 $1, %ymm27, %xmm0
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,0,7,14],zero,zero,xmm4[3,10],zero,zero,zero
; AVX512DQ-NEXT: vpor %xmm0, %xmm4, %xmm0
; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm5 {%k1}
-; AVX512DQ-NEXT: vmovdqa64 %zmm20, (%rsi)
-; AVX512DQ-NEXT: vmovdqa64 %zmm22, (%rdx)
+; AVX512DQ-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm3 {%k1}
+; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-NEXT: vmovaps %zmm0, (%rsi)
+; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-NEXT: vmovaps %zmm0, (%rdx)
; AVX512DQ-NEXT: vmovdqa64 %zmm23, (%rcx)
-; AVX512DQ-NEXT: vmovdqa64 %zmm25, (%r8)
-; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%r9)
+; AVX512DQ-NEXT: vmovdqa64 %zmm22, (%r8)
+; AVX512DQ-NEXT: vmovdqa64 %zmm8, (%r9)
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-NEXT: vmovdqa64 %zmm3, (%rax)
+; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rax)
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-NEXT: vmovdqa64 %zmm5, (%rax)
-; AVX512DQ-NEXT: addq $24, %rsp
+; AVX512DQ-NEXT: vmovdqa64 %zmm3, (%rax)
+; AVX512DQ-NEXT: addq $56, %rsp
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: load_i8_stride7_vf64:
; AVX512DQ-FCP: # %bb.0:
-; AVX512DQ-FCP-NEXT: pushq %rax
-; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm19 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm11
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm12
-; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm27
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm0
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm12 ^ (ymm0 & (ymm11 ^ ymm12))
+; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm18 = [65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0]
+; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %ymm29
+; AVX512DQ-FCP-NEXT: vmovdqa64 32(%rdi), %ymm31
+; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm10
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm0
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm31 ^ (ymm0 & (ymm29 ^ ymm31))
; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm0, %xmm1
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512DQ-FCP-NEXT: vmovdqa64 96(%rdi), %ymm31
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm2
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm27 ^ (ymm2 & (ymm31 ^ ymm27))
+; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm12
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, %ymm1
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm10 ^ (ymm1 & (ymm12 ^ ymm10))
; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm3
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm20
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm1 & mem)
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535]
-; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %ymm28
-; AVX512DQ-FCP-NEXT: vmovdqa64 160(%rdi), %ymm30
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm1
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm28 ^ (ymm1 & (ymm30 ^ ymm28))
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm1, %xmm1
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7,8,9],ymm3[10],ymm1[11,12],ymm3[13],ymm1[14,15]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[6,13,4,11,2,9,16,23,30,u],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm0 & mem)
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535]
+; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %ymm26
+; AVX512DQ-FCP-NEXT: vmovdqa64 160(%rdi), %ymm27
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm2
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm26 ^ (ymm2 & (ymm27 ^ ymm26))
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero,xmm2[u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,1,2,4,6]
-; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm22
-; AVX512DQ-FCP-NEXT: vpermd %ymm22, %ymm4, %ymm4
+; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm14
+; AVX512DQ-FCP-NEXT: vpermd %ymm14, %ymm4, %ymm4
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7]
; AVX512DQ-FCP-NEXT: vmovdqa 240(%rdi), %xmm4
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm4[5,12,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %xmm5
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[0,7,14],zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm1, %zmm7
-; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm0 & (zmm7 ^ zmm2))
-; AVX512DQ-FCP-NEXT: vmovdqa64 288(%rdi), %ymm16
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm4[5,12,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %xmm6
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm9, %xmm5
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm5, %zmm2, %zmm9
+; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (zmm19 & (zmm9 ^ zmm1))
+; AVX512DQ-FCP-NEXT: vmovdqa 288(%rdi), %ymm15
; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm2
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm16 ^ (ymm2 & (ymm1 ^ ymm16))
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[u,u,u,u,u,3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u]
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, %ymm2
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm15 ^ (ymm2 & (ymm1 ^ ymm15))
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm2[u,u,u,u,u,3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u]
; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm10, %xmm10
-; AVX512DQ-FCP-NEXT: vmovdqa 352(%rdi), %ymm14
+; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm11, %xmm11
+; AVX512DQ-FCP-NEXT: vmovdqa64 352(%rdi), %ymm30
; AVX512DQ-FCP-NEXT: vmovdqa 320(%rdi), %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm13
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm14 ^ (ymm13 & (ymm2 ^ ymm14))
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm13[2,3,0,1]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm13[0,1],ymm8[2],ymm13[3,4,5],ymm8[6],ymm13[7,8,9],ymm8[10],ymm13[11,12,13],ymm8[14],ymm13[15]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm26 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm10 & ymm26)
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
-; AVX512DQ-FCP-NEXT: vmovdqa64 416(%rdi), %ymm17
-; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %ymm18
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm13
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm17 ^ (ymm13 & (ymm18 ^ ymm17))
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm15
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm13[4,11],zero,zero
-; AVX512DQ-FCP-NEXT: vpor %xmm15, %xmm13, %xmm13
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm21 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 ^ (ymm21 & (ymm13 ^ ymm8))
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm3
-; AVX512DQ-FCP-NEXT: vpmovsxwd {{.*#+}} zmm29 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0,0,0,0,0]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm29 & (zmm3 ^ zmm7))
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm7
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm12 ^ (ymm7 & (ymm11 ^ ymm12))
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm13
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm30 ^ (ymm13 & (ymm2 ^ ymm30))
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm13[2,3,0,1]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm13[0,1],ymm5[2],ymm13[3,4,5],ymm5[6],ymm13[7,8,9],ymm5[10],ymm13[11,12,13],ymm5[14],ymm13[15]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm25 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 | (ymm11 & ymm25)
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [65535,65535,0,65535,65535,0,65535,65535,65535,0,65535,65535,0,65535,65535,65535]
+; AVX512DQ-FCP-NEXT: vmovdqa64 416(%rdi), %ymm16
+; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %ymm17
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm13, %ymm11
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm16 ^ (ymm11 & (ymm17 ^ ymm16))
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm8
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[2,9]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm11[4,11],zero,zero
+; AVX512DQ-FCP-NEXT: vpor %xmm8, %xmm11, %xmm8
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 ^ (ymm20 & (ymm8 ^ ymm5))
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm5
+; AVX512DQ-FCP-NEXT: vpmovsxwd {{.*#+}} zmm28 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,255,0,0,0,0,0,0]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm28 & (zmm5 ^ zmm9))
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm13, %ymm5
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm31 ^ (ymm5 & (ymm29 ^ ymm31))
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm8
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,xmm8[6,13],zero,zero,xmm8[2,9,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[1,8,15],zero,zero,xmm7[4,11],zero,zero,xmm7[u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm8
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm31 ^ (ymm8 & (ymm27 ^ ymm31))
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm3
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[1,8,15],zero,zero,xmm5[4,11],zero,zero,xmm5[u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm8, %xmm5
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm8
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm12 ^ (ymm8 & (ymm10 ^ ymm12))
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm3[2],ymm8[3,4,5],ymm3[6],ymm8[7,8,9],ymm3[10],ymm8[11,12,13],ymm3[14],ymm8[15]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm7 & ~mem)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm7
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm28 ^ (ymm7 & (ymm30 ^ ymm28))
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm7[u,u,u,6,13],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[u,u,u,u]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u],zero,zero,xmm7[4,11],zero,zero,xmm7[0,7,14,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm13, %xmm7
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,0,0,0,1,3,4,6]
-; AVX512DQ-FCP-NEXT: vpermd %ymm22, %ymm13, %ymm13
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm13[7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm4[6,13,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[1,8,15],zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm13, %xmm15, %xmm13
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm13, %zmm7, %zmm7
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm0 & (zmm7 ^ zmm8))
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm8
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm1 ^ (ymm8 & (ymm16 ^ ymm1))
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm13
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u],zero,zero,xmm13[2,9],zero,zero,zero,xmm13[5,12,u,u]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm5 & ~mem)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, %ymm5
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm26 ^ (ymm5 & (ymm27 ^ ymm26))
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm5[u,u,u,6,13],zero,zero,xmm5[2,9],zero,zero,zero,xmm5[u,u,u,u]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u],zero,zero,xmm5[4,11],zero,zero,xmm5[0,7,14,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm9, %xmm5
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,1,3,4,6]
+; AVX512DQ-FCP-NEXT: vpermd %ymm14, %ymm9, %ymm9
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm9[7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm4[6,13,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm6[1,8,15],zero,zero,xmm6[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm9, %xmm11, %xmm9
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm9, %zmm5, %zmm5
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm19 & (zmm5 ^ zmm8))
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm13, %ymm8
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm1 ^ (ymm8 & (ymm15 ^ ymm1))
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u],zero,zero,xmm9[2,9],zero,zero,zero,xmm9[5,12,u,u]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm13, %xmm8, %xmm8
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm13
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm14 ^ (ymm13 & (ymm2 ^ ymm14))
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm13[2,3,0,1]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7,8,9,10],ymm15[11],ymm13[12,13],ymm15[14],ymm13[15]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 | (ymm8 & ymm26)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm8
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm17 ^ (ymm8 & (ymm18 ^ ymm17))
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm8[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm8[5,12],zero,zero
+; AVX512DQ-FCP-NEXT: vpor %xmm9, %xmm8, %xmm8
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, %ymm9
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm30 ^ (ymm9 & (ymm2 ^ ymm30))
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm9[2,3,0,1]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1,2],ymm11[3],ymm9[4,5],ymm11[6],ymm9[7,8,9,10],ymm11[11],ymm9[12,13],ymm11[14],ymm9[15]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm9[3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm8 & ymm25)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm8
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm16 ^ (ymm8 & (ymm17 ^ ymm16))
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm8[5,12],zero,zero
; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u],zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10]
-; AVX512DQ-FCP-NEXT: vpor %xmm15, %xmm8, %xmm8
+; AVX512DQ-FCP-NEXT: vpor %xmm11, %xmm8, %xmm8
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 ^ (ymm21 & (ymm8 ^ ymm13))
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm24
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm24 = zmm24 ^ (zmm29 & (zmm24 ^ zmm7))
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm7
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm12 ^ (ymm7 & (ymm11 ^ ymm12))
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[2,9],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm8
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm31 ^ (ymm8 & (ymm27 ^ ymm31))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 ^ (ymm20 & (ymm8 ^ ymm9))
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm23
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm23 = zmm23 ^ (zmm28 & (zmm23 ^ zmm5))
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm5
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm31 ^ (ymm5 & (ymm29 ^ ymm31))
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm5[2,9],zero,zero,zero,xmm5[5,12],zero,zero,xmm5[u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,10,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm8, %xmm5
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm13, %ymm8
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm12 ^ (ymm8 & (ymm10 ^ ymm12))
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2],ymm3[3],ymm8[4,5],ymm3[6],ymm8[7,8,9,10],ymm3[11],ymm8[12,13],ymm3[14],ymm8[15]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqu64 %ymm20, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm23 = [18446744073709551615,255,18446744073709486080,18446744073709551615]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 | (ymm7 & ymm23)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm7
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm30 ^ (ymm7 & (ymm28 ^ ymm30))
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[1,8,15,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,0,7,14],zero,zero,xmm7[3,10],zero,zero,zero,xmm7[u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,1,3,5,6]
-; AVX512DQ-FCP-NEXT: vpermd %ymm22, %ymm8, %ymm3
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5,6],ymm3[7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[2,9],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm3, %zmm3
-; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} zmm22 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm13 ^ (zmm22 & (zmm3 ^ zmm13))
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm7
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm1 ^ (ymm7 & (ymm16 ^ ymm1))
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm13
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u],zero,zero,xmm13[3,10],zero,zero,zero,xmm13[6,13,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,5,12],zero,zero,xmm7[1,8,15],zero,zero,xmm7[u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm13, %xmm7
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm13
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm2 ^ (ymm13 & (ymm14 ^ ymm2))
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm13[2,3,0,1]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm15[0],ymm13[1,2],ymm15[3],ymm13[4,5,6],ymm15[7,8],ymm13[9,10],ymm15[11],ymm13[12,13,14],ymm15[15]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 | (ymm7 & ymm26)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm7
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm17 ^ (ymm7 & (ymm18 ^ ymm17))
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm7[6,13],zero,zero
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,11]
-; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm15, %xmm7
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (ymm21 & (ymm7 ^ ymm13))
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm20
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm29 & (zmm20 ^ zmm3))
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm3
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm12 ^ (ymm3 & (ymm11 ^ ymm12))
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[3,10],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm7
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm31 ^ (ymm7 & (ymm27 ^ ymm31))
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0],ymm7[1,2],ymm0[3],ymm7[4,5,6],ymm0[7,8],ymm7[9,10],ymm0[11],ymm7[12,13,14],ymm0[15]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm3 & ymm23)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm3
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm30 ^ (ymm3 & (ymm28 ^ ymm30))
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm13
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[2,9,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[1,8,15,6,13,4,11,18,25],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm22 = [18446744073709551615,255,18446744073709486080,18446744073709551615]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm5 & ymm22)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm5
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm27 ^ (ymm5 & (ymm26 ^ ymm27))
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm9
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u],zero,zero,zero,xmm9[5,12],zero,zero,xmm9[1,8,15,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,0,7,14],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm9, %xmm5
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,1,3,5,6]
+; AVX512DQ-FCP-NEXT: vpermd %ymm14, %ymm9, %ymm9
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm9[7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[2,9],zero,zero,zero,xmm6[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm4[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm9, %xmm11, %xmm9
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm9, %zmm5, %zmm9
+; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} zmm21 = [0,0,18446744073709486080,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615,18446744073709551615]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm8 ^ (zmm21 & (zmm9 ^ zmm8))
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm8
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm1 ^ (ymm8 & (ymm15 ^ ymm1))
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm11
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u],zero,zero,xmm11[3,10],zero,zero,zero,xmm11[6,13,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,5,12],zero,zero,xmm8[1,8,15],zero,zero,xmm8[u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm11, %xmm8, %xmm8
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm13, %ymm11
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm2 ^ (ymm11 & (ymm30 ^ ymm2))
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm11[2,3,0,1]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1,2],ymm14[3],ymm11[4,5,6],ymm14[7,8],ymm11[9,10],ymm14[11],ymm11[12,13,14],ymm14[15]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm8 & ymm25)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, %ymm8
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm16 ^ (ymm8 & (ymm17 ^ ymm16))
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm8[6,13],zero,zero
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u],zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11]
+; AVX512DQ-FCP-NEXT: vpor %xmm14, %xmm8, %xmm8
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 ^ (ymm20 & (ymm8 ^ ymm11))
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm19
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm28 & (zmm19 ^ zmm9))
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, %ymm8
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm31 ^ (ymm8 & (ymm29 ^ ymm31))
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm9, %xmm8, %xmm8
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm9
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm12 ^ (ymm9 & (ymm10 ^ ymm12))
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm9[1,2],ymm3[3],ymm9[4,5,6],ymm3[7,8],ymm9[9,10],ymm3[11],ymm9[12,13,14],ymm3[15]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[2,9,0,7,14,5,12,19,26],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm8 & ymm22)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm13, %ymm3
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm27 ^ (ymm3 & (ymm26 ^ ymm27))
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm8
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[2,9,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm13, %xmm13
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vmovdqa 208(%rdi), %xmm0
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm0, %xmm8
-; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %xmm0
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
-; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm8, %xmm3
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm8
+; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm8, %xmm8
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vmovdqa 208(%rdi), %xmm3
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm3, %xmm5
+; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %xmm14
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm14[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
+; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm5
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm25 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm25 & (ymm3 ^ ymm8))
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm24 = [18446744073709551615,18446744073709551615,18446744073709551615,16777215]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm24 & (ymm3 ^ ymm5))
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[3,10],zero,zero,zero,xmm6[u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4
; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm4, %zmm3, %zmm3
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm7 ^ (zmm22 & (zmm3 ^ zmm7))
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm4
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm2 ^ (ymm4 & (ymm14 ^ ymm2))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm9 ^ (zmm21 & (zmm3 ^ zmm9))
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm4
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm2 ^ (ymm4 & (ymm30 ^ ymm2))
; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1]
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3],ymm5[4],ymm4[5,6],ymm5[7,8],ymm4[9,10,11],ymm5[12],ymm4[13,14],ymm5[15]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm4, %ymm4
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm5
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm1 ^ (ymm5 & (ymm16 ^ ymm1))
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[u,u,u,u,u,6,13],zero,zero,xmm5[2,9],zero,zero,zero,xmm5[u,u]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm4
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, %ymm5
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm1 ^ (ymm5 & (ymm15 ^ ymm1))
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[u,u,u,u,u,6,13],zero,zero,xmm5[2,9],zero,zero,zero,xmm5[u,u]
; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u],zero,zero,xmm5[4,11],zero,zero,xmm5[0,7,14,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm5, %xmm5
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = (ymm5 & ymm26) | ymm4
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm4
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm18 ^ (ymm4 & (ymm17 ^ ymm18))
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm7
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12]
+; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = (ymm5 & ymm25) | ymm4
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm13, %ymm4
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm17 ^ (ymm4 & (ymm16 ^ ymm17))
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm6
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,u,u,u],zero,zero,xmm6[2,9],zero,zero,zero,xmm6[5,12]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,4,11],zero,zero,xmm4[0,7,14],zero,zero
-; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm4, %xmm4
+; AVX512DQ-FCP-NEXT: vpor %xmm6, %xmm4, %xmm4
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 ^ (ymm21 & (ymm4 ^ ymm5))
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm22
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm29 & (zmm22 ^ zmm3))
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm3
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm2 ^ (ymm3 & (ymm14 ^ ymm2))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 ^ (ymm20 & (ymm4 ^ ymm5))
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm21
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm28 & (zmm21 ^ zmm3))
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, %ymm3
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm2 ^ (ymm3 & (ymm30 ^ ymm2))
; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6,7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm4
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm16 ^ (ymm4 & (ymm1 ^ ymm16))
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm28
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm4
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm15 ^ (ymm4 & (ymm1 ^ ymm15))
; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u],zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,0,7,14],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[u,u]
; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = (ymm4 & ymm26) | ymm3
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm3
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm18 ^ (ymm3 & (ymm17 ^ ymm18))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = (ymm4 & ymm25) | ymm3
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm3
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm17 ^ (ymm3 & (ymm16 ^ ymm17))
; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,u],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[6,13]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero
; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm3, %ymm0, %ymm26
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm26 = ymm26 ^ (ymm21 & (ymm26 ^ ymm4))
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm3
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm14 ^ (ymm3 & (ymm2 ^ ymm14))
+; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm3, %ymm0, %ymm25
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm25 = ymm25 ^ (ymm20 & (ymm25 ^ ymm4))
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm3
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm30 ^ (ymm3 & (ymm2 ^ ymm30))
; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,128,128,128,128,128,128,128,128,128,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm3
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm4
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm16 ^ (ymm4 & (ymm1 ^ ymm16))
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm7
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,128,128,128,128,128,128,128,128,128,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm3
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm13, %ymm4
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm15 ^ (ymm4 & (ymm1 ^ ymm15))
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u],zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,1,8,15],zero,zero,xmm4[4,11],zero,zero,xmm4[u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm4, %xmm7
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = (ymm7 & ~mem) | ymm3
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm3
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm18 ^ (ymm3 & (ymm17 ^ ymm18))
+; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm4, %xmm5
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = (ymm5 & ~mem) | ymm3
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, %ymm3
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm17 ^ (ymm3 & (ymm16 ^ ymm17))
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero
; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14]
; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT: vinserti32x4 $1, %xmm3, %ymm0, %ymm29
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm29 = ymm29 ^ (ymm21 & (ymm29 ^ ymm7))
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm13
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm11 ^ (ymm13 & (ymm12 ^ ymm11))
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm8
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm19 = ymm17 ^ (ymm19 & (ymm18 ^ ymm17))
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm3
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm11 ^ (ymm3 & (ymm12 ^ ymm11))
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm7
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm11 ^ (ymm6 & (ymm12 ^ ymm11))
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm11
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm13[4,11],zero,zero,xmm13[0,7,14],zero,zero,xmm13[u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm11, %xmm12, %xmm13
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm31 ^ (ymm8 & (ymm27 ^ ymm31))
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm11
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[3,10],zero,zero,zero,xmm11[6,13,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm4
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm4 = ymm4 ^ (ymm20 & (ymm4 ^ ymm5))
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, %ymm5
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm12 ^ (ymm5 & (ymm10 ^ ymm12))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm18 = ymm16 ^ (ymm18 & (ymm17 ^ ymm16))
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm9
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm13, %ymm11
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm10 ^ (ymm11 & (ymm12 ^ ymm10))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm15 ^ (ymm0 & (ymm1 ^ ymm15))
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm3
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm10 ^ (ymm0 & (ymm12 ^ ymm10))
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm13, %ymm8
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm29 ^ (ymm8 & (ymm31 ^ ymm29))
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm10
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[2,9],zero,zero,zero,xmm10[5,12,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,11],zero,zero,xmm8[0,7,14],zero,zero,xmm8[u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm10, %xmm8, %xmm10
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm29 ^ (ymm3 & (ymm31 ^ ymm29))
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, %ymm12
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm29 ^ (ymm7 & (ymm31 ^ ymm29))
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm8
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm11, %xmm3
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm12
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm16 ^ (ymm9 & (ymm1 ^ ymm16))
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm15
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm27 ^ (ymm15 & (ymm31 ^ ymm27))
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm27 ^ (ymm9 & (ymm31 ^ ymm27))
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm6[6,13],zero,zero,xmm6[2,9],zero,zero,zero,xmm6[u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm0
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vporq %xmm11, %xmm0, %xmm16
-; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm8[1,2,3],ymm4[4],ymm8[5,6],ymm4[7,8],ymm8[9,10,11],ymm4[12],ymm8[13,14],ymm4[15]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm11 = ymm11 | (ymm13 & ymm23)
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0],ymm4[1],ymm15[2,3],ymm4[4],ymm15[5,6,7,8],ymm4[9],ymm15[10,11],ymm4[12],ymm15[13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm8 | (ymm3 & ymm23)
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm4[1],ymm9[2,3,4],ymm4[5],ymm9[6,7,8],ymm4[9],ymm9[10,11,12],ymm4[13],ymm9[14,15]
+; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm8, %xmm8
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm5[2,3,0,1]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1,2,3],ymm3[4],ymm5[5,6],ymm3[7,8],ymm5[9,10,11],ymm3[12],ymm5[13,14],ymm3[15]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[3,10,1,8,15,6,13,20,27],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = (ymm10 & ymm22) | ymm3
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm11[2,3,0,1]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm11[0],ymm3[1],ymm11[2,3],ymm3[4],ymm11[5,6,7,8],ymm3[9],ymm11[10,11],ymm3[12],ymm11[13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[4,11,2,9,0,7,14,21,28],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = (ymm8 & ymm22) | ymm3
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7,8],ymm3[9],ymm0[10,11,12],ymm3[13],ymm0[14,15]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[6,13],zero,zero,xmm7[2,9],zero,zero,zero,xmm7[u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm5
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[4,11],zero,zero,xmm5[0,7,14,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm5, %xmm5
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[5,12,3,10,1,8,15,22,29],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm16 & ymm23)
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm14 ^ (ymm10 & (ymm2 ^ ymm14))
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm30 ^ (ymm12 & (ymm28 ^ ymm30))
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm30 ^ (ymm7 & (ymm28 ^ ymm30))
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm28 ^ (ymm10 & (ymm30 ^ ymm28))
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm12[u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm9
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u],zero,zero,xmm9[0,7,14],zero,zero,xmm9[3,10,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm9, %xmm3
-; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm4, %xmm9
-; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm6[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
-; AVX512DQ-FCP-NEXT: vpor %xmm9, %xmm12, %xmm9
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 ^ (ymm25 & (ymm9 ^ ymm3))
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,3,10],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,11,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = (ymm5 & ymm22) | ymm0
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm30 ^ (ymm13 & (ymm2 ^ ymm30))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm27 ^ (ymm9 & (ymm26 ^ ymm27))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm27 ^ (ymm12 & (ymm26 ^ ymm27))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm26 ^ (ymm13 & (ymm27 ^ ymm26))
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm9[u,u,2,9],zero,zero,zero,xmm9[5,12],zero,zero,xmm9[u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm9, %xmm3
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm3, %xmm0
+; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm3
+; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm3
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm14[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm7, %xmm3
-; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm5
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm5, %xmm5
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm7
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (ymm25 & (ymm7 ^ ymm3))
-; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm3
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm10[u,u,4,11],zero,zero,xmm10[0,7,14],zero,zero,xmm10[u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm5, %xmm3
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,8,15,22,29,20,27,18,25,128,128,128,128,128,128,128,128,128,128]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm10
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm6[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpor %xmm12, %xmm10, %xmm10
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm10 ^ (ymm25 & (ymm10 ^ ymm3))
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,2,4,6,0,0,0,0]
-; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %ymm12
-; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm3, %ymm3
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,7,10,13,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm3, %zmm9, %zmm3
-; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm9 & (zmm3 ^ zmm11))
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [1,3,4,6,0,0,0,0]
-; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm11, %ymm11
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[1,4,11,14,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm11, %zmm7, %zmm7
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm9 & (zmm7 ^ zmm8))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm24 & (ymm3 ^ ymm0))
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm12[u,u,3,10],zero,zero,zero,xmm12[6,13],zero,zero,xmm12[u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm7
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,11,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm7, %xmm0
+; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm9, %xmm6
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm14[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
+; AVX512DQ-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (ymm24 & (ymm6 ^ ymm0))
+; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm0
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[5,12,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm13[u,u,4,11],zero,zero,xmm13[0,7,14],zero,zero,xmm13[u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm7, %xmm7
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,1,8,15,22,29,20,27,18,25,128,128,128,128,128,128,128,128,128,128]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm9, %xmm9
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm14[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
+; AVX512DQ-FCP-NEXT: vpor %xmm11, %xmm9, %xmm9
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 ^ (ymm24 & (ymm9 ^ ymm7))
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,2,4,6,0,0,0,0]
+; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %ymm11
+; AVX512DQ-FCP-NEXT: vpermd %ymm11, %ymm7, %ymm7
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[0,7,10,13,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm3, %zmm3
+; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm7 & (zmm3 ^ zmm10))
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [1,3,4,6,0,0,0,0]
+; AVX512DQ-FCP-NEXT: vpermd %ymm11, %ymm10, %ymm10
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[1,4,11,14,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm6, %zmm6
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm7 & (zmm6 ^ zmm8))
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,3,5,6,0,0,0,0]
-; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm8, %ymm8
+; AVX512DQ-FCP-NEXT: vpermd %ymm11, %ymm8, %ymm8
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,5,8,15,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm10, %zmm8
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm9 & (zmm8 ^ zmm0))
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm9, %zmm8
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm7 & (zmm8 ^ zmm5))
; AVX512DQ-FCP-NEXT: movw $-512, %ax # imm = 0xFE00
; AVX512DQ-FCP-NEXT: kmovw %eax, %k1
-; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm26, %zmm0, %zmm3 {%k1}
-; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm29, %zmm0, %zmm7 {%k1}
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm2[2,3,0,1]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm25, %zmm0, %zmm3 {%k1}
+; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm4, %zmm0, %zmm6 {%k1}
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm0
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[u,u,u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[u,u,u]
; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u],zero,zero,xmm1[0,7,14],zero,zero,xmm1[3,10,u,u,u]
; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = (ymm1 & mem) | ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm2
-; AVX512DQ-FCP-NEXT: vextracti32x4 $1, %ymm19, %xmm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm2
+; AVX512DQ-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm0
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero
; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm2, %xmm0
@@ -13336,367 +13373,374 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm8 {%k1}
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm0, (%rsi)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, (%rcx)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, (%r8)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, (%rdx)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, (%rcx)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, (%r8)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%r9)
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, (%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, (%rax)
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, (%rax)
-; AVX512DQ-FCP-NEXT: popq %rax
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
; AVX512BW-LABEL: load_i8_stride7_vf64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm25
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [8,1,18,11,4,5,22,15,0,25,10,0,12,29,14,0]
-; AVX512BW-NEXT: vpermw %zmm25, %zmm0, %zmm18
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,1,18,11,4,21,14,7,8,25,10,0,28,13,0,15]
-; AVX512BW-NEXT: vpermw %zmm25, %zmm0, %zmm24
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,17,10,3,4,21,14,7,24,9,0,11,28,13,0,31]
-; AVX512BW-NEXT: vpermw %zmm25, %zmm0, %zmm9
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [16,17,10,3,20,13,6,23,24,25,0,27,28,0,30,31]
-; AVX512BW-NEXT: vpermw %zmm25, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqa (%rdi), %ymm10
+; AVX512BW-NEXT: vmovdqa (%rdi), %ymm7
; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm11
+; AVX512BW-NEXT: vmovdqa 96(%rdi), %ymm2
; AVX512BW-NEXT: movw $-28382, %ax # imm = 0x9122
; AVX512BW-NEXT: kmovd %eax, %k1
-; AVX512BW-NEXT: vpblendmw %ymm1, %ymm10, %ymm3 {%k1}
+; AVX512BW-NEXT: vpblendmw %ymm1, %ymm7, %ymm0 {%k1}
; AVX512BW-NEXT: kmovq %k1, %k2
-; AVX512BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u,u,u,u,u]
-; AVX512BW-NEXT: vporq %xmm4, %xmm3, %xmm16
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u]
+; AVX512BW-NEXT: vporq %xmm3, %xmm0, %xmm17
+; AVX512BW-NEXT: movw $9288, %ax # imm = 0x2448
+; AVX512BW-NEXT: kmovd %eax, %k6
+; AVX512BW-NEXT: vpblendmw %ymm11, %ymm2, %ymm0 {%k6}
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1]
+; AVX512BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7,8,9],ymm3[10],ymm0[11,12],ymm3[13],ymm0[14,15]
; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: movw $992, %ax # imm = 0x3E0
; AVX512BW-NEXT: kmovd %eax, %k1
-; AVX512BW-NEXT: vmovdqu16 %ymm0, %ymm16 {%k1}
-; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm11
+; AVX512BW-NEXT: vmovdqu16 %ymm0, %ymm17 {%k1}
+; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm12
; AVX512BW-NEXT: vmovdqa 160(%rdi), %ymm6
; AVX512BW-NEXT: movw $8772, %ax # imm = 0x2244
-; AVX512BW-NEXT: kmovd %eax, %k6
-; AVX512BW-NEXT: vpblendmw %ymm11, %ymm6, %ymm0 {%k6}
+; AVX512BW-NEXT: kmovd %eax, %k3
+; AVX512BW-NEXT: vpblendmw %ymm12, %ymm6, %ymm0 {%k3}
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm3
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u,u,u]
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u,u,u]
; AVX512BW-NEXT: vpor %xmm3, %xmm0, %xmm0
; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512BW-NEXT: vmovdqa 192(%rdi), %xmm7
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm21 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
-; AVX512BW-NEXT: vpshufb %xmm21, %xmm7, %xmm3
-; AVX512BW-NEXT: vmovdqa 208(%rdi), %xmm8
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm8[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vmovdqa 192(%rdi), %xmm8
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm22 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
+; AVX512BW-NEXT: vpshufb %xmm22, %xmm8, %xmm3
+; AVX512BW-NEXT: vmovdqa 208(%rdi), %xmm9
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm9[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7]
-; AVX512BW-NEXT: vmovdqa64 240(%rdi), %xmm26
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm26[5,12,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vmovdqa 224(%rdi), %xmm4
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[0,7,14],zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpor %xmm5, %xmm12, %xmm5
+; AVX512BW-NEXT: vmovdqa64 240(%rdi), %xmm24
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm24[5,12,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vmovdqa64 224(%rdi), %xmm25
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm25[0,7,14],zero,zero,xmm25[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpor %xmm5, %xmm10, %xmm5
; AVX512BW-NEXT: vinserti32x4 $2, %xmm5, %zmm0, %zmm0
; AVX512BW-NEXT: movabsq $137438429184, %rax # imm = 0x1FFFF80000
; AVX512BW-NEXT: kmovq %rax, %k5
-; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm16 {%k5}
-; AVX512BW-NEXT: vmovdqa 288(%rdi), %ymm13
-; AVX512BW-NEXT: vmovdqa 256(%rdi), %ymm12
-; AVX512BW-NEXT: movw $9288, %ax # imm = 0x2448
-; AVX512BW-NEXT: kmovd %eax, %k3
-; AVX512BW-NEXT: vpblendmw %ymm13, %ymm12, %ymm0 {%k3}
+; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm17 {%k5}
+; AVX512BW-NEXT: vmovdqa 288(%rdi), %ymm15
+; AVX512BW-NEXT: vmovdqa 256(%rdi), %ymm14
+; AVX512BW-NEXT: vpblendmw %ymm15, %ymm14, %ymm0 {%k6}
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u]
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u]
-; AVX512BW-NEXT: vporq %xmm5, %xmm0, %xmm19
-; AVX512BW-NEXT: vmovdqa64 352(%rdi), %ymm17
+; AVX512BW-NEXT: vporq %xmm5, %xmm0, %xmm20
+; AVX512BW-NEXT: vmovdqa64 352(%rdi), %ymm18
; AVX512BW-NEXT: vmovdqa 320(%rdi), %ymm0
-; AVX512BW-NEXT: vpblendmw %ymm17, %ymm0, %ymm5 {%k6}
-; AVX512BW-NEXT: vpermq {{.*#+}} ymm15 = ymm5[2,3,0,1]
-; AVX512BW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm15[2],ymm5[3,4,5],ymm15[6],ymm5[7,8,9],ymm15[10],ymm5[11,12,13],ymm15[14],ymm5[15]
+; AVX512BW-NEXT: vpblendmw %ymm18, %ymm0, %ymm5 {%k3}
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm10 = ymm5[2,3,0,1]
+; AVX512BW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm10[2],ymm5[3,4,5],ymm10[6],ymm5[7,8,9],ymm10[10],ymm5[11,12,13],ymm10[14],ymm5[15]
; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: movw $3968, %ax # imm = 0xF80
; AVX512BW-NEXT: kmovd %eax, %k7
-; AVX512BW-NEXT: vmovdqu16 %ymm5, %ymm19 {%k7}
-; AVX512BW-NEXT: vmovdqa 416(%rdi), %ymm15
+; AVX512BW-NEXT: vmovdqu16 %ymm5, %ymm20 {%k7}
+; AVX512BW-NEXT: vmovdqa64 416(%rdi), %ymm16
; AVX512BW-NEXT: vmovdqa 384(%rdi), %ymm5
; AVX512BW-NEXT: movw $4644, %ax # imm = 0x1224
; AVX512BW-NEXT: kmovd %eax, %k4
-; AVX512BW-NEXT: vpblendmw %ymm15, %ymm5, %ymm20 {%k4}
-; AVX512BW-NEXT: vextracti32x4 $1, %ymm20, %xmm22
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u,u,u,u,u],zero,zero,zero,xmm22[6,13],zero,zero,xmm22[2,9]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm20[4,11],zero,zero
-; AVX512BW-NEXT: vporq %xmm22, %xmm20, %xmm20
-; AVX512BW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20
+; AVX512BW-NEXT: vpblendmw %ymm16, %ymm5, %ymm10 {%k4}
+; AVX512BW-NEXT: vextracti32x4 $1, %ymm10, %xmm19
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u,u,u],zero,zero,zero,xmm19[6,13],zero,zero,xmm19[2,9]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm10[4,11],zero,zero
+; AVX512BW-NEXT: vporq %xmm19, %xmm10, %xmm10
+; AVX512BW-NEXT: vinserti32x4 $1, %xmm10, %ymm0, %ymm21
; AVX512BW-NEXT: movl $-8388608, %eax # imm = 0xFF800000
-; AVX512BW-NEXT: vpblendmw %ymm1, %ymm10, %ymm22 {%k4}
-; AVX512BW-NEXT: vextracti32x4 $1, %ymm22, %xmm23
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[1,8,15],zero,zero,xmm22[4,11],zero,zero,xmm22[u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vporq %xmm23, %xmm22, %xmm22
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpblendmw %ymm1, %ymm7, %ymm10 {%k4}
+; AVX512BW-NEXT: vextracti32x4 $1, %ymm10, %xmm19
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm19 = zero,zero,zero,xmm19[6,13],zero,zero,xmm19[2,9,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[1,8,15],zero,zero,xmm10[4,11],zero,zero,xmm10[u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vporq %xmm19, %xmm10, %xmm19
+; AVX512BW-NEXT: vpblendmw %ymm2, %ymm11, %ymm10 {%k2}
+; AVX512BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512BW-NEXT: vextracti128 $1, %ymm10, %xmm13
+; AVX512BW-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm13[2],ymm10[3,4,5],ymm13[6],ymm10[7,8,9],ymm13[10],ymm10[11,12,13],ymm13[14],ymm10[15]
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: movl $511, %edi # imm = 0x1FF
; AVX512BW-NEXT: kmovd %edi, %k1
-; AVX512BW-NEXT: vmovdqu8 %ymm22, %ymm9 {%k1}
-; AVX512BW-NEXT: vpblendmw %ymm11, %ymm6, %ymm22 {%k3}
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm23 = xmm22[u,u,u,6,13],zero,zero,xmm22[2,9],zero,zero,zero,xmm22[u,u,u,u]
-; AVX512BW-NEXT: vextracti32x4 $1, %ymm22, %xmm22
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u],zero,zero,xmm22[4,11],zero,zero,xmm22[0,7,14,u,u,u,u]
-; AVX512BW-NEXT: vporq %xmm23, %xmm22, %xmm22
-; AVX512BW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm14
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm22 = xmm8[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm23 = xmm7[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm22 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3]
-; AVX512BW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm2
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5,6],ymm2[7]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm26[6,13,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm22 = xmm4[1,8,15],zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vporq %xmm14, %xmm22, %xmm14
-; AVX512BW-NEXT: vinserti32x4 $2, %xmm14, %zmm2, %zmm2
-; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm9 {%k5}
-; AVX512BW-NEXT: vpblendmw %ymm1, %ymm10, %ymm2 {%k6}
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm2
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpor %xmm2, %xmm14, %xmm2
+; AVX512BW-NEXT: vmovdqu8 %ymm19, %ymm10 {%k1}
+; AVX512BW-NEXT: vpblendmw %ymm12, %ymm6, %ymm13 {%k6}
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm19 = xmm13[u,u,u,6,13],zero,zero,xmm13[2,9],zero,zero,zero,xmm13[u,u,u,u]
+; AVX512BW-NEXT: vextracti128 $1, %ymm13, %xmm13
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u],zero,zero,xmm13[4,11],zero,zero,xmm13[0,7,14,u,u,u,u]
+; AVX512BW-NEXT: vporq %xmm19, %xmm13, %xmm13
+; AVX512BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm19 = xmm9[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm23 = xmm8[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm19 = xmm23[0],xmm19[0],xmm23[1],xmm19[1],xmm23[2],xmm19[2],xmm23[3],xmm19[3]
+; AVX512BW-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm3
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5,6],ymm3[7]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,zero,xmm24[6,13,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm19 = xmm25[1,8,15],zero,zero,xmm25[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vporq %xmm13, %xmm19, %xmm13
+; AVX512BW-NEXT: vinserti32x4 $2, %xmm13, %zmm3, %zmm3
+; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm10 {%k5}
+; AVX512BW-NEXT: vpblendmw %ymm2, %ymm11, %ymm3 {%k4}
+; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm13
+; AVX512BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm13[3],ymm3[4,5],ymm13[6],ymm3[7,8,9,10],ymm13[11],ymm3[12,13],ymm13[14],ymm3[15]
+; AVX512BW-NEXT: vpblendmw %ymm1, %ymm7, %ymm13 {%k3}
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm19 = xmm13[2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vextracti128 $1, %ymm13, %xmm13
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[0,7,14],zero,zero,xmm13[3,10,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vporq %xmm19, %xmm13, %xmm13
; AVX512BW-NEXT: movl $261632, %edi # imm = 0x3FE00
; AVX512BW-NEXT: kmovd %edi, %k5
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm24[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpblendmw %ymm6, %ymm11, %ymm14 {%k2}
-; AVX512BW-NEXT: vextracti32x4 $1, %ymm14, %xmm22
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u],zero,zero,zero,xmm22[5,12],zero,zero,xmm22[1,8,15,u,u,u,u]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u]
-; AVX512BW-NEXT: vporq %xmm22, %xmm14, %xmm14
-; AVX512BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512BW-NEXT: vpshufb %xmm21, %xmm8, %xmm21
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm22 = xmm7[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm21 = xmm22[0],xmm21[0],xmm22[1],xmm21[1],xmm22[2],xmm21[2],xmm22[3],xmm21[3]
-; AVX512BW-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm3
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1,2,3,4,5,6],ymm3[7]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[2,9],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm21 = zero,zero,xmm26[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vporq %xmm14, %xmm21, %xmm14
-; AVX512BW-NEXT: vinserti32x4 $2, %xmm14, %zmm3, %zmm22
-; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm22 {%k1}
-; AVX512BW-NEXT: vpblendmw %ymm1, %ymm10, %ymm2 {%k3}
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm2
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpblendmw %ymm6, %ymm11, %ymm3 {%k4}
-; AVX512BW-NEXT: vextracti32x4 $1, %ymm3, %xmm18
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u],zero,zero,zero,xmm18[6,13],zero,zero,xmm18[2,9,u,u,u,u,u]
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm13 {%k5} = ymm3[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpblendmw %ymm6, %ymm12, %ymm3 {%k2}
+; AVX512BW-NEXT: vextracti32x4 $1, %ymm3, %xmm19
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u],zero,zero,zero,xmm19[5,12],zero,zero,xmm19[1,8,15,u,u,u,u]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u,u,u]
+; AVX512BW-NEXT: vporq %xmm19, %xmm3, %xmm3
+; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512BW-NEXT: vpshufb %xmm22, %xmm9, %xmm19
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm22 = xmm8[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm19 = xmm22[0],xmm19[0],xmm22[1],xmm19[1],xmm22[2],xmm19[2],xmm22[3],xmm19[3]
+; AVX512BW-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm4
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm25[2,9],zero,zero,zero,xmm25[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm19 = zero,zero,xmm24[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vporq %xmm4, %xmm19, %xmm4
+; AVX512BW-NEXT: vinserti32x4 $2, %xmm4, %zmm3, %zmm22
+; AVX512BW-NEXT: vmovdqu16 %zmm13, %zmm22 {%k1}
+; AVX512BW-NEXT: vpblendmw %ymm2, %ymm11, %ymm3 {%k3}
+; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX512BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5,6],ymm4[7,8],ymm3[9,10],ymm4[11],ymm3[12,13,14],ymm4[15]
+; AVX512BW-NEXT: vpblendmw %ymm1, %ymm7, %ymm4 {%k6}
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm4[3,10],zero,zero,zero,xmm4[6,13],zero,zero,xmm4[u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vextracti128 $1, %ymm4, %xmm4
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[1,8,15],zero,zero,xmm4[4,11,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpor %xmm4, %xmm13, %xmm4
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 {%k5} = ymm3[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpblendmw %ymm6, %ymm12, %ymm3 {%k4}
+; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm13
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[2,9,u,u,u,u,u]
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u,u,u]
-; AVX512BW-NEXT: vporq %xmm18, %xmm3, %xmm3
+; AVX512BW-NEXT: vpor %xmm3, %xmm13, %xmm3
; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm18 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[5,12]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm21 = xmm7[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
-; AVX512BW-NEXT: vporq %xmm18, %xmm21, %xmm18
-; AVX512BW-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[5,12]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm19 = xmm8[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
+; AVX512BW-NEXT: vporq %xmm13, %xmm19, %xmm13
+; AVX512BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
; AVX512BW-NEXT: movl $-134217728, %edi # imm = 0xF8000000
; AVX512BW-NEXT: kmovd %edi, %k2
; AVX512BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512BW-NEXT: vmovdqu8 %ymm18, %ymm3 {%k2}
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm18 = xmm4[3,10],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm21 = zero,zero,xmm26[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vporq %xmm18, %xmm21, %xmm18
-; AVX512BW-NEXT: vinserti32x4 $2, %xmm18, %zmm3, %zmm18
-; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm18 {%k1}
+; AVX512BW-NEXT: vmovdqu8 %ymm13, %ymm3 {%k2}
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm25[3,10],zero,zero,zero,xmm25[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm19 = zero,zero,xmm24[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vporq %xmm13, %xmm19, %xmm13
+; AVX512BW-NEXT: vinserti32x4 $2, %xmm13, %zmm3, %zmm19
+; AVX512BW-NEXT: vmovdqu16 %zmm4, %zmm19 {%k1}
; AVX512BW-NEXT: kmovd %eax, %k2
-; AVX512BW-NEXT: vmovdqu8 %ymm20, %ymm19 {%k2}
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm2
+; AVX512BW-NEXT: vmovdqu8 %ymm21, %ymm20 {%k2}
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm20, %zmm0, %zmm3
; AVX512BW-NEXT: movabsq $-137438953472, %rax # imm = 0xFFFFFFE000000000
; AVX512BW-NEXT: kmovq %rax, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm16 {%k1}
-; AVX512BW-NEXT: vpblendmw %ymm12, %ymm13, %ymm2 {%k4}
-; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,4,11],zero,zero,xmm2[0,7,14],zero,zero,xmm2[u,u]
-; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT: vpblendmw %ymm17, %ymm0, %ymm3 {%k3}
-; AVX512BW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1]
-; AVX512BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm14[3],ymm3[4,5],ymm14[6],ymm3[7,8,9,10],ymm14[11],ymm3[12,13],ymm14[14],ymm3[15]
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k7}
-; AVX512BW-NEXT: vpblendmw %ymm15, %ymm5, %ymm3 {%k6}
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm3[5,12],zero,zero
-; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm3
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10]
-; AVX512BW-NEXT: vpor %xmm3, %xmm14, %xmm3
-; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512BW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k2}
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm9 {%k1}
-; AVX512BW-NEXT: vpblendmw %ymm12, %ymm13, %ymm2 {%k6}
-; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero,xmm2[u,u]
-; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT: vpblendmw %ymm0, %ymm17, %ymm3 {%k4}
-; AVX512BW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1]
-; AVX512BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm3[1,2],ymm14[3],ymm3[4,5,6],ymm14[7,8],ymm3[9,10],ymm14[11],ymm3[12,13,14],ymm14[15]
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k7}
-; AVX512BW-NEXT: vpblendmw %ymm15, %ymm5, %ymm3 {%k3}
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero
+; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm17 {%k1}
+; AVX512BW-NEXT: vpblendmw %ymm14, %ymm15, %ymm3 {%k4}
+; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u],zero,zero,xmm4[2,9],zero,zero,zero,xmm4[5,12,u,u]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero,xmm3[u,u]
+; AVX512BW-NEXT: vpor %xmm4, %xmm3, %xmm3
+; AVX512BW-NEXT: vpblendmw %ymm18, %ymm0, %ymm4 {%k6}
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm13 = ymm4[2,3,0,1]
+; AVX512BW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm13[3],ymm4[4,5],ymm13[6],ymm4[7,8,9,10],ymm13[11],ymm4[12,13],ymm13[14],ymm4[15]
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vmovdqu16 %ymm4, %ymm3 {%k7}
+; AVX512BW-NEXT: vpblendmw %ymm16, %ymm5, %ymm4 {%k3}
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm4[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm4[5,12],zero,zero
+; AVX512BW-NEXT: vextracti128 $1, %ymm4, %xmm4
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u],zero,zero,xmm4[0,7,14],zero,zero,xmm4[3,10]
+; AVX512BW-NEXT: vpor %xmm4, %xmm13, %xmm4
+; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512BW-NEXT: vmovdqu8 %ymm4, %ymm3 {%k2}
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3
+; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm10 {%k1}
+; AVX512BW-NEXT: vpblendmw %ymm14, %ymm15, %ymm3 {%k3}
+; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u]
+; AVX512BW-NEXT: vpor %xmm4, %xmm3, %xmm3
+; AVX512BW-NEXT: vpblendmw %ymm0, %ymm18, %ymm4 {%k4}
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm13 = ymm4[2,3,0,1]
+; AVX512BW-NEXT: vpblendw {{.*#+}} ymm4 = ymm13[0],ymm4[1,2],ymm13[3],ymm4[4,5,6],ymm13[7,8],ymm4[9,10],ymm13[11],ymm4[12,13,14],ymm13[15]
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vmovdqu16 %ymm4, %ymm3 {%k7}
+; AVX512BW-NEXT: vpblendmw %ymm16, %ymm5, %ymm4 {%k6}
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm4[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm4[6,13],zero,zero
+; AVX512BW-NEXT: vextracti128 $1, %ymm4, %xmm4
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u],zero,zero,xmm4[1,8,15],zero,zero,xmm4[4,11]
+; AVX512BW-NEXT: vpor %xmm4, %xmm13, %xmm4
+; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512BW-NEXT: vmovdqu8 %ymm4, %ymm3 {%k2}
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3
+; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm22 {%k1}
+; AVX512BW-NEXT: vpblendmw %ymm14, %ymm15, %ymm3 {%k6}
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u]
; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm3
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11]
-; AVX512BW-NEXT: vpor %xmm3, %xmm14, %xmm3
-; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512BW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k2}
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm22 {%k1}
-; AVX512BW-NEXT: vpblendmw %ymm12, %ymm13, %ymm2 {%k3}
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u]
-; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm2
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u]
-; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT: vpblendmw %ymm0, %ymm17, %ymm3 {%k6}
-; AVX512BW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1]
-; AVX512BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm3[1,2,3],ymm14[4],ymm3[5,6],ymm14[7,8],ymm3[9,10,11],ymm14[12],ymm3[13,14],ymm14[15]
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k7}
-; AVX512BW-NEXT: vpblendmw %ymm5, %ymm15, %ymm3 {%k4}
-; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm14
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u],zero,zero,xmm14[2,9],zero,zero,zero,xmm14[5,12]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero
-; AVX512BW-NEXT: vpor %xmm3, %xmm14, %xmm3
-; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512BW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k2}
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm18 {%k1}
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u]
+; AVX512BW-NEXT: vpor %xmm4, %xmm3, %xmm3
+; AVX512BW-NEXT: vpblendmw %ymm0, %ymm18, %ymm4 {%k3}
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm13 = ymm4[2,3,0,1]
+; AVX512BW-NEXT: vpblendw {{.*#+}} ymm4 = ymm13[0],ymm4[1,2,3],ymm13[4],ymm4[5,6],ymm13[7,8],ymm4[9,10,11],ymm13[12],ymm4[13,14],ymm13[15]
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vmovdqu16 %ymm4, %ymm3 {%k7}
+; AVX512BW-NEXT: vpblendmw %ymm5, %ymm16, %ymm4 {%k4}
+; AVX512BW-NEXT: vextracti128 $1, %ymm4, %xmm13
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u],zero,zero,xmm13[2,9],zero,zero,zero,xmm13[5,12]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,4,11],zero,zero,xmm4[0,7,14],zero,zero
+; AVX512BW-NEXT: vpor %xmm4, %xmm13, %xmm4
+; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512BW-NEXT: vmovdqu8 %ymm4, %ymm3 {%k2}
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3
+; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm19 {%k1}
; AVX512BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-NEXT: vpblendmw %ymm13, %ymm12, %ymm2 {%k1}
-; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u]
-; AVX512BW-NEXT: vporq %xmm3, %xmm2, %xmm19
-; AVX512BW-NEXT: vpblendmw %ymm0, %ymm17, %ymm2 {%k3}
-; AVX512BW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
-; AVX512BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6,7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14,15]
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vmovdqu16 %ymm2, %ymm19 {%k7}
-; AVX512BW-NEXT: vpblendmw %ymm13, %ymm12, %ymm2 {%k4}
-; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9,u,u,u]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u]
-; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT: vpblendmw %ymm17, %ymm0, %ymm3 {%k1}
+; AVX512BW-NEXT: vpblendmw %ymm15, %ymm14, %ymm3 {%k1}
+; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15,u,u]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u]
+; AVX512BW-NEXT: vporq %xmm4, %xmm3, %xmm20
+; AVX512BW-NEXT: vpblendmw %ymm0, %ymm18, %ymm3 {%k6}
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
+; AVX512BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6,7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14,15]
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vmovdqu16 %ymm3, %ymm20 {%k7}
+; AVX512BW-NEXT: vpblendmw %ymm15, %ymm14, %ymm3 {%k4}
+; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,9,u,u,u]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u]
+; AVX512BW-NEXT: vpor %xmm4, %xmm3, %xmm3
+; AVX512BW-NEXT: vpblendmw %ymm18, %ymm0, %ymm4 {%k1}
; AVX512BW-NEXT: kmovq %k1, %k7
-; AVX512BW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1]
-; AVX512BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm14[1],ymm3[2,3,4],ymm14[5],ymm3[6,7,8],ymm14[9],ymm3[10,11,12],ymm14[13],ymm3[14,15]
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm20 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm13 = ymm4[2,3,0,1]
+; AVX512BW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm13[1],ymm4[2,3,4],ymm13[5],ymm4[6,7,8],ymm13[9],ymm4[10,11,12],ymm13[13],ymm4[14,15]
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm21 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: movl $8176, %eax # imm = 0x1FF0
; AVX512BW-NEXT: kmovd %eax, %k1
-; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm20 {%k1}
-; AVX512BW-NEXT: vpblendmw %ymm5, %ymm15, %ymm2 {%k6}
-; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero
-; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm19 {%k2}
-; AVX512BW-NEXT: vpblendmw %ymm5, %ymm15, %ymm2 {%k3}
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero
-; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm2
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14]
-; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512BW-NEXT: vmovdqu8 %ymm2, %ymm20 {%k2}
-; AVX512BW-NEXT: vpblendmw %ymm6, %ymm11, %ymm2 {%k6}
-; AVX512BW-NEXT: vpblendmw %ymm10, %ymm1, %ymm21 {%k6}
-; AVX512BW-NEXT: vmovdqu16 %ymm17, %ymm0 {%k4}
+; AVX512BW-NEXT: vmovdqu8 %ymm3, %ymm21 {%k1}
+; AVX512BW-NEXT: vpblendmw %ymm5, %ymm16, %ymm3 {%k3}
+; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero
+; AVX512BW-NEXT: vpor %xmm4, %xmm3, %xmm3
+; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512BW-NEXT: vmovdqu8 %ymm3, %ymm20 {%k2}
+; AVX512BW-NEXT: vpblendmw %ymm5, %ymm16, %ymm3 {%k6}
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero
+; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm3
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14]
+; AVX512BW-NEXT: vpor %xmm4, %xmm3, %xmm3
+; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512BW-NEXT: vmovdqu8 %ymm3, %ymm21 {%k2}
+; AVX512BW-NEXT: vmovdqu16 %ymm18, %ymm0 {%k4}
; AVX512BW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1]
; AVX512BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7,8,9],ymm3[10],ymm0[11,12],ymm3[13],ymm0[14,15]
-; AVX512BW-NEXT: vmovdqu16 %ymm13, %ymm12 {%k6}
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm12[u,u,u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u]
-; AVX512BW-NEXT: vextracti128 $1, %ymm12, %xmm12
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u],zero,zero,xmm12[0,7,14],zero,zero,xmm12[3,10,u,u,u]
-; AVX512BW-NEXT: vpor %xmm0, %xmm12, %xmm0
+; AVX512BW-NEXT: vmovdqu16 %ymm15, %ymm14 {%k3}
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm14[u,u,u,u,2,9],zero,zero,zero,xmm14[5,12],zero,zero,xmm14[u,u,u]
+; AVX512BW-NEXT: vextracti128 $1, %ymm14, %xmm4
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,xmm4[0,7,14],zero,zero,xmm4[3,10,u,u,u]
+; AVX512BW-NEXT: vpor %xmm0, %xmm4, %xmm0
; AVX512BW-NEXT: movl $4186112, %eax # imm = 0x3FE000
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vmovdqu16 %ymm15, %ymm5 {%k7}
-; AVX512BW-NEXT: vpblendmw %ymm10, %ymm1, %ymm12 {%k4}
-; AVX512BW-NEXT: vpblendmw %ymm6, %ymm11, %ymm3 {%k3}
-; AVX512BW-NEXT: vmovdqu16 %ymm10, %ymm1 {%k3}
-; AVX512BW-NEXT: vmovdqu16 %ymm11, %ymm6 {%k4}
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[u,u,2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u]
-; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm2
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u]
-; AVX512BW-NEXT: vpor %xmm2, %xmm10, %xmm2
-; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[6,13]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
-; AVX512BW-NEXT: vpor %xmm10, %xmm11, %xmm10
-; AVX512BW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
-; AVX512BW-NEXT: vmovdqu8 %ymm10, %ymm2 {%k1}
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[u,u,u,u,u]
+; AVX512BW-NEXT: vmovdqu16 %ymm16, %ymm5 {%k7}
+; AVX512BW-NEXT: vpblendmw %ymm2, %ymm11, %ymm13 {%k6}
+; AVX512BW-NEXT: vpblendmw %ymm6, %ymm12, %ymm3 {%k3}
+; AVX512BW-NEXT: vpblendmw %ymm11, %ymm2, %ymm15 {%k4}
+; AVX512BW-NEXT: vpblendmw %ymm7, %ymm1, %ymm14 {%k3}
+; AVX512BW-NEXT: vmovdqu16 %ymm11, %ymm2 {%k3}
+; AVX512BW-NEXT: vpblendmw %ymm7, %ymm1, %ymm4 {%k4}
+; AVX512BW-NEXT: vpblendmw %ymm6, %ymm12, %ymm11 {%k6}
+; AVX512BW-NEXT: vmovdqu16 %ymm7, %ymm1 {%k6}
+; AVX512BW-NEXT: vmovdqu16 %ymm12, %ymm6 {%k4}
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,2,9],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[u,u,u,u,u]
; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm3
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11,u,u,u,u,u]
-; AVX512BW-NEXT: vpor %xmm3, %xmm10, %xmm3
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10,u,u,u,u,u]
+; AVX512BW-NEXT: vpor %xmm7, %xmm3, %xmm3
; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm7[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm8[0,7,14]
-; AVX512BW-NEXT: vpor %xmm10, %xmm11, %xmm10
-; AVX512BW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512BW-NEXT: vmovdqu8 %ymm10, %ymm3 {%k1}
-; AVX512BW-NEXT: vextracti128 $1, %ymm6, %xmm10
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[5,12,u,u,u,u,u]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u]
-; AVX512BW-NEXT: vpor %xmm6, %xmm10, %xmm6
-; AVX512BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm8[1,8,15]
-; AVX512BW-NEXT: vpor %xmm7, %xmm8, %xmm7
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[6,13]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
+; AVX512BW-NEXT: vpor %xmm7, %xmm12, %xmm7
; AVX512BW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512BW-NEXT: vmovdqu8 %ymm7, %ymm6 {%k1}
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = [16,9,2,19,20,13,6,23,24,0,26,27,28,0,30,31]
-; AVX512BW-NEXT: vpermw %zmm25, %zmm7, %zmm7
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [16,9,2,19,12,5,22,23,24,0,26,27,0,29,30,31]
-; AVX512BW-NEXT: vpermw %zmm25, %zmm8, %zmm8
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm10 = [8,1,2,19,12,5,22,15,0,9,26,11,0,29,14,0]
-; AVX512BW-NEXT: vpermw %zmm25, %zmm10, %zmm10
-; AVX512BW-NEXT: vextracti128 $1, %ymm12, %xmm11
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[4,11],zero,zero,xmm12[0,7,14],zero,zero,xmm12[u,u,u,u,u,u,u]
+; AVX512BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
+; AVX512BW-NEXT: vmovdqu8 %ymm7, %ymm3 {%k1}
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[u,u,3,10],zero,zero,zero,xmm11[6,13],zero,zero,xmm11[u,u,u,u,u]
+; AVX512BW-NEXT: vextracti128 $1, %ymm11, %xmm11
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,xmm11[1,8,15],zero,zero,xmm11[4,11,u,u,u,u,u]
+; AVX512BW-NEXT: vpor %xmm7, %xmm11, %xmm7
+; AVX512BW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[0,7,14]
; AVX512BW-NEXT: vpor %xmm11, %xmm12, %xmm11
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm11 {%k5} = ymm10[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
-; AVX512BW-NEXT: vpshufb %xmm10, %xmm4, %xmm12
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm26[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm12, %zmm2, %zmm2
-; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm11 {%k5}
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm2
+; AVX512BW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512BW-NEXT: vmovdqu8 %ymm11, %ymm7 {%k1}
+; AVX512BW-NEXT: vextracti128 $1, %ymm6, %xmm11
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u]
+; AVX512BW-NEXT: vpor %xmm6, %xmm11, %xmm6
+; AVX512BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[1,8,15]
+; AVX512BW-NEXT: vpor %xmm8, %xmm9, %xmm8
+; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX512BW-NEXT: vmovdqu8 %ymm8, %ymm6 {%k1}
+; AVX512BW-NEXT: vextracti128 $1, %ymm13, %xmm8
+; AVX512BW-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm13[1,2,3],ymm8[4],ymm13[5,6],ymm8[7,8],ymm13[9,10,11],ymm8[12],ymm13[13,14],ymm8[15]
+; AVX512BW-NEXT: vextracti128 $1, %ymm4, %xmm9
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[2,9],zero,zero,zero,xmm9[5,12,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[4,11],zero,zero,xmm4[0,7,14],zero,zero,xmm4[u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpor %xmm4, %xmm9, %xmm4
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 {%k5} = ymm8[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
+; AVX512BW-NEXT: vpshufb %xmm8, %xmm25, %xmm9
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm24[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3]
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm9, %zmm3, %zmm3
+; AVX512BW-NEXT: vmovdqu16 %zmm3, %zmm4 {%k5}
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm20, %zmm0, %zmm3
; AVX512BW-NEXT: movw $-512, %ax # imm = 0xFE00
-; AVX512BW-NEXT: vextracti32x4 $1, %ymm21, %xmm12
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[3,10],zero,zero,zero,xmm12[6,13,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm21[5,12],zero,zero,xmm21[1,8,15],zero,zero,xmm21[u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpor %xmm12, %xmm13, %xmm12
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm12 {%k5} = ymm8[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm26[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm4[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3]
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm8, %zmm3, %zmm3
-; AVX512BW-NEXT: vmovdqu16 %zmm3, %zmm12 {%k5}
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vextracti128 $1, %ymm15, %xmm9
+; AVX512BW-NEXT: vpblendw {{.*#+}} ymm9 = ymm15[0],ymm9[1],ymm15[2,3],ymm9[4],ymm15[5,6,7,8],ymm9[9],ymm15[10,11],ymm9[12],ymm15[13,14,15]
+; AVX512BW-NEXT: vextracti128 $1, %ymm14, %xmm11
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[3,10],zero,zero,zero,xmm11[6,13,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm14[5,12],zero,zero,xmm14[1,8,15],zero,zero,xmm14[u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpor %xmm11, %xmm12, %xmm11
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm11 {%k5} = ymm9[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm24[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm25[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3]
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7
+; AVX512BW-NEXT: vmovdqu16 %zmm7, %zmm11 {%k5}
+; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm7
+; AVX512BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm7[1],ymm2[2,3,4],ymm7[5],ymm2[6,7,8],ymm7[9],ymm2[10,11,12],ymm7[13],ymm2[14,15]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u]
; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpor %xmm3, %xmm1, %xmm1
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 {%k5} = ymm7[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpshufb %xmm10, %xmm26, %xmm3
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm6, %zmm3
-; AVX512BW-NEXT: vmovdqu16 %zmm3, %zmm1 {%k5}
+; AVX512BW-NEXT: vpor %xmm7, %xmm1, %xmm1
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 {%k5} = ymm2[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpshufb %xmm8, %xmm24, %xmm2
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm25[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3]
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm6, %zmm2
+; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm1 {%k5}
; AVX512BW-NEXT: kmovd %eax, %k1
+; AVX512BW-NEXT: vmovdqa32 %zmm3, %zmm4 {%k1}
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm21, %zmm0, %zmm2
; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm11 {%k1}
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm20, %zmm0, %zmm2
-; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm12 {%k1}
; AVX512BW-NEXT: vextracti128 $1, %ymm5, %xmm2
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[1,8,15]
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[u,u,u,u,u,u,0,7,14],zero,zero,xmm5[3,10],zero,zero,zero
@@ -13708,123 +13752,111 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rdi
-; AVX512BW-NEXT: vmovdqa64 %zmm16, (%rsi)
-; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rdx)
+; AVX512BW-NEXT: vmovdqa64 %zmm17, (%rsi)
+; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm22, (%rcx)
-; AVX512BW-NEXT: vmovdqa64 %zmm18, (%r8)
-; AVX512BW-NEXT: vmovdqa64 %zmm11, (%r9)
-; AVX512BW-NEXT: vmovdqa64 %zmm12, (%rdi)
+; AVX512BW-NEXT: vmovdqa64 %zmm19, (%r8)
+; AVX512BW-NEXT: vmovdqa64 %zmm4, (%r9)
+; AVX512BW-NEXT: vmovdqa64 %zmm11, (%rdi)
; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BW-FCP-LABEL: load_i8_stride7_vf64:
; AVX512BW-FCP: # %bb.0:
-; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm0
+; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm1
; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2
-; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [24,17,2,19,28,21,6,31,16,9,26,27,20,13,30,23]
-; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm24
-; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7]
-; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm13
-; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [24,17,2,27,20,5,22,31,16,9,26,19,12,29,30,23]
-; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm25
-; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15]
-; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm12
-; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,11,4,5,14,7,8,9,26,19,12,29,22,15]
-; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm16
-; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31]
-; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm8
-; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,10,3,4,5,14,7,8,25,18,11,12,29,22,15]
-; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm4
-; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31]
-; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm5
; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm10
; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm3
; AVX512BW-FCP-NEXT: movw $-28382, %ax # imm = 0x9122
; AVX512BW-FCP-NEXT: kmovd %eax, %k1
-; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm1 {%k1}
+; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm0 {%k1}
; AVX512BW-FCP-NEXT: kmovq %k1, %k2
; AVX512BW-FCP-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm6
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[5,12],zero,zero,xmm6[1,8,15,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm1, %xmm1
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpor %xmm4, %xmm0, %xmm0
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,17,10,3,20,13,6,0,24,0,0,27,0,0,0,31]
+; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm4, %zmm4
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: movw $992, %ax # imm = 0x3E0
; AVX512BW-FCP-NEXT: kmovd %eax, %k1
-; AVX512BW-FCP-NEXT: vmovdqu16 %ymm5, %ymm1 {%k1}
+; AVX512BW-FCP-NEXT: vmovdqu16 %ymm4, %ymm0 {%k1}
; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm11
; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm9
; AVX512BW-FCP-NEXT: movw $8772, %ax # imm = 0x2244
; AVX512BW-FCP-NEXT: kmovd %eax, %k1
-; AVX512BW-FCP-NEXT: vpblendmw %ymm11, %ymm9, %ymm5 {%k1}
+; AVX512BW-FCP-NEXT: vpblendmw %ymm11, %ymm9, %ymm4 {%k1}
; AVX512BW-FCP-NEXT: kmovq %k1, %k3
; AVX512BW-FCP-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,xmm6[3,10],zero,zero,zero,xmm6[6,13,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,5,12],zero,zero,xmm5[1,8,15],zero,zero,xmm5[u,u,u,u]
-; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5
-; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,1,2,4,6]
-; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm17
-; AVX512BW-FCP-NEXT: vpermd %ymm17, %ymm6, %ymm6
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
-; AVX512BW-FCP-NEXT: vmovdqa64 240(%rdi), %xmm19
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm19[5,12,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vmovdqa64 224(%rdi), %xmm20
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm20[0,7,14],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6
-; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm5, %zmm5
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[6,13,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,5,12],zero,zero,xmm4[1,8,15],zero,zero,xmm4[u,u,u,u]
+; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4
+; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,1,2,4,6]
+; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm12
+; AVX512BW-FCP-NEXT: vpermd %ymm12, %ymm5, %ymm5
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7]
+; AVX512BW-FCP-NEXT: vmovdqa 240(%rdi), %xmm13
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm13[5,12,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vmovdqa64 224(%rdi), %xmm16
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm16[0,7,14],zero,zero,xmm16[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5
+; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm5, %zmm4, %zmm4
; AVX512BW-FCP-NEXT: movabsq $137438429184, %rax # imm = 0x1FFFF80000
; AVX512BW-FCP-NEXT: kmovq %rax, %k5
-; AVX512BW-FCP-NEXT: vmovdqu8 %zmm5, %zmm1 {%k5}
+; AVX512BW-FCP-NEXT: vmovdqu8 %zmm4, %zmm0 {%k5}
; AVX512BW-FCP-NEXT: vmovdqa 288(%rdi), %ymm6
; AVX512BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm5
; AVX512BW-FCP-NEXT: movw $9288, %ax # imm = 0x2448
; AVX512BW-FCP-NEXT: kmovd %eax, %k6
-; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm5, %ymm7 {%k6}
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm7[u,u,u,u,u,3,10],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[u,u]
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u],zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,11,u,u]
-; AVX512BW-FCP-NEXT: vporq %xmm18, %xmm7, %xmm21
+; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm5, %ymm4 {%k6}
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[u,u,u,u,u,3,10],zero,zero,zero,xmm4[6,13],zero,zero,xmm4[u,u]
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u],zero,zero,xmm4[1,8,15],zero,zero,xmm4[4,11,u,u]
+; AVX512BW-FCP-NEXT: vporq %xmm7, %xmm4, %xmm17
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,1,0,0,4,0,0,0,8,25,18,11,0,29,22,15]
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: movw $3968, %ax # imm = 0xF80
; AVX512BW-FCP-NEXT: kmovd %eax, %k7
-; AVX512BW-FCP-NEXT: vmovdqu16 %ymm4, %ymm21 {%k7}
+; AVX512BW-FCP-NEXT: vmovdqu16 %ymm4, %ymm17 {%k7}
; AVX512BW-FCP-NEXT: vmovdqa 416(%rdi), %ymm7
; AVX512BW-FCP-NEXT: vmovdqa 384(%rdi), %ymm4
; AVX512BW-FCP-NEXT: movw $4644, %ax # imm = 0x1224
; AVX512BW-FCP-NEXT: kmovd %eax, %k4
-; AVX512BW-FCP-NEXT: vpblendmw %ymm7, %ymm4, %ymm18 {%k4}
-; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm22
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u,u,u,u,u],zero,zero,zero,xmm22[6,13],zero,zero,xmm22[2,9]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm18[4,11],zero,zero
-; AVX512BW-FCP-NEXT: vporq %xmm22, %xmm18, %xmm18
-; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm22
-; AVX512BW-FCP-NEXT: movl $-8388608, %eax # imm = 0xFF800000
-; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm18 {%k4}
-; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm23
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[1,8,15],zero,zero,xmm18[4,11],zero,zero,xmm18[u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vporq %xmm23, %xmm18, %xmm18
+; AVX512BW-FCP-NEXT: vpblendmw %ymm7, %ymm4, %ymm8 {%k4}
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm8, %xmm14
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[2,9]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm8[4,11],zero,zero
+; AVX512BW-FCP-NEXT: vporq %xmm14, %xmm8, %xmm18
+; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm8 {%k4}
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm8, %xmm14
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm14[6,13],zero,zero,xmm14[2,9,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[1,8,15],zero,zero,xmm8[4,11],zero,zero,xmm8[u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpor %xmm14, %xmm8, %xmm14
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,17,10,3,0,21,14,7,24,0,0,0,28,0,0,31]
+; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm8, %zmm8
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: movl $511, %r10d # imm = 0x1FF
-; AVX512BW-FCP-NEXT: kmovd %r10d, %k1
-; AVX512BW-FCP-NEXT: vmovdqu8 %ymm18, %ymm8 {%k1}
-; AVX512BW-FCP-NEXT: vpblendmw %ymm11, %ymm9, %ymm18 {%k6}
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = xmm18[u,u,u,6,13],zero,zero,xmm18[2,9],zero,zero,zero,xmm18[u,u,u,u]
-; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm18
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u],zero,zero,xmm18[4,11],zero,zero,xmm18[0,7,14,u,u,u,u]
-; AVX512BW-FCP-NEXT: vporq %xmm23, %xmm18, %xmm18
-; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm14
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [0,0,0,0,1,3,4,6]
-; AVX512BW-FCP-NEXT: vpermd %ymm17, %ymm18, %ymm18
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
+; AVX512BW-FCP-NEXT: movl $511, %eax # imm = 0x1FF
+; AVX512BW-FCP-NEXT: kmovd %eax, %k1
+; AVX512BW-FCP-NEXT: vmovdqu8 %ymm14, %ymm8 {%k1}
+; AVX512BW-FCP-NEXT: vpblendmw %ymm11, %ymm9, %ymm14 {%k6}
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,6,13],zero,zero,xmm14[2,9],zero,zero,zero,xmm14[u,u,u,u]
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u],zero,zero,xmm14[4,11],zero,zero,xmm14[0,7,14,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14
+; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,0,0,0,1,3,4,6]
+; AVX512BW-FCP-NEXT: vpermd %ymm12, %ymm15, %ymm15
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6],ymm15[7]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm19[6,13,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm20[1,8,15],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm18, %xmm15
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm13[6,13,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm16[1,8,15],zero,zero,xmm16[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm19, %xmm15
; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm14, %zmm14
; AVX512BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm8 {%k5}
; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm14 {%k3}
@@ -13832,144 +13864,156 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[0,7,14],zero,zero,xmm14[3,10,u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14
-; AVX512BW-FCP-NEXT: movl $261632, %r10d # imm = 0x3FE00
-; AVX512BW-FCP-NEXT: kmovd %r10d, %k5
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm12[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm12 {%k2}
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm12, %xmm15
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[5,12],zero,zero,xmm15[1,8,15,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,0,7,14],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[u,u,u,u]
-; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm12, %xmm12
-; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,0,0,0,1,3,5,6]
-; AVX512BW-FCP-NEXT: vpermd %ymm17, %ymm15, %ymm15
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm15[7]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm20[2,9],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,xmm19[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm17, %xmm15
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm15 = [0,0,18,11,4,21,14,7,0,25,0,0,28,0,0,0]
+; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm15, %zmm15
+; AVX512BW-FCP-NEXT: movl $261632, %eax # imm = 0x3FE00
+; AVX512BW-FCP-NEXT: kmovd %eax, %k5
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm15[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm15 {%k2}
+; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm15, %xmm19
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u],zero,zero,zero,xmm19[5,12],zero,zero,xmm19[1,8,15,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,0,7,14],zero,zero,xmm15[3,10],zero,zero,zero,xmm15[u,u,u,u]
+; AVX512BW-FCP-NEXT: vporq %xmm19, %xmm15, %xmm15
+; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [0,0,0,0,1,3,5,6]
+; AVX512BW-FCP-NEXT: vpermd %ymm12, %ymm19, %ymm12
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3,4,5,6],ymm12[7]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm16[2,9],zero,zero,zero,xmm16[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = zero,zero,xmm13[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm19, %xmm15
; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm12, %zmm12
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm14, %zmm12 {%k1}
; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm14 {%k6}
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[3,10],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm13[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm13 {%k4}
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm15
+; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm14, %xmm19
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [8,1,18,11,4,0,22,15,0,25,0,0,0,29,0,0]
+; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm14, %zmm14
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm19 {%k5} = ymm14[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm14 {%k4}
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm13, %xmm13
-; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512BW-FCP-NEXT: vmovdqa64 208(%rdi), %xmm17
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm17[5,12]
-; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %xmm18
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = xmm18[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
-; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm23, %xmm15
-; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,1,8,15],zero,zero,xmm14[4,11],zero,zero,xmm14[u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm14, %xmm20
+; AVX512BW-FCP-NEXT: vmovdqa 208(%rdi), %xmm14
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm14[5,12]
+; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %xmm15
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm15[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
+; AVX512BW-FCP-NEXT: vporq %xmm21, %xmm22, %xmm21
+; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18
+; AVX512BW-FCP-NEXT: movl $-8388608, %eax # imm = 0xFF800000
+; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20
+; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21
; AVX512BW-FCP-NEXT: movl $-134217728, %r10d # imm = 0xF8000000
; AVX512BW-FCP-NEXT: kmovd %r10d, %k2
; AVX512BW-FCP-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm13 {%k2}
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm20[3,10],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = zero,zero,xmm19[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm19, %xmm15
-; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm13, %zmm13
-; AVX512BW-FCP-NEXT: vmovdqu16 %zmm14, %zmm13 {%k1}
+; AVX512BW-FCP-NEXT: vmovdqu8 %ymm21, %ymm20 {%k2}
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[3,10],zero,zero,zero,xmm16[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm13, %xmm13
+; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm13, %zmm20, %zmm13
+; AVX512BW-FCP-NEXT: vmovdqu16 %zmm19, %zmm13 {%k1}
; AVX512BW-FCP-NEXT: kmovd %eax, %k3
-; AVX512BW-FCP-NEXT: vmovdqu8 %ymm22, %ymm21 {%k3}
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm21, %zmm0, %zmm14
+; AVX512BW-FCP-NEXT: vmovdqu8 %ymm18, %ymm17 {%k3}
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm17, %zmm0, %zmm16
; AVX512BW-FCP-NEXT: movabsq $-137438953472, %rax # imm = 0xFFFFFFE000000000
; AVX512BW-FCP-NEXT: kmovq %rax, %k2
-; AVX512BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm1 {%k2}
-; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm14 {%k4}
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u],zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u]
-; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vmovdqu16 %ymm15, %ymm14 {%k7}
+; AVX512BW-FCP-NEXT: vmovdqu8 %zmm16, %zmm0 {%k2}
+; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm16 {%k4}
+; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm17
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u],zero,zero,xmm17[2,9],zero,zero,zero,xmm17[5,12,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,4,11],zero,zero,xmm16[0,7,14],zero,zero,xmm16[u,u]
+; AVX512BW-FCP-NEXT: vporq %xmm17, %xmm16, %xmm16
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm17 = [0,1,0,0,0,5,0,0,8,0,26,19,12,29,22,15]
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm17, %zmm17
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm17 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vmovdqu16 %ymm17, %ymm16 {%k7}
; AVX512BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-FCP-NEXT: vpblendmw %ymm7, %ymm4, %ymm15 {%k1}
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm15[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm15[5,12],zero,zero
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,xmm15[0,7,14],zero,zero,xmm15[3,10]
-; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15
-; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k3}
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm14
-; AVX512BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm8 {%k2}
-; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm14 {%k1}
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u],zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,5,12],zero,zero,xmm14[1,8,15],zero,zero,xmm14[u,u]
-; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm25[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vmovdqu16 %ymm15, %ymm14 {%k7}
-; AVX512BW-FCP-NEXT: vpblendmw %ymm7, %ymm4, %ymm15 {%k6}
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm15[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm15[6,13],zero,zero
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,xmm15[1,8,15],zero,zero,xmm15[4,11]
-; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15
-; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k3}
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm14
-; AVX512BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm12 {%k2}
-; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm14 {%k6}
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u,6,13],zero,zero,xmm14[2,9],zero,zero,zero,xmm14[u,u]
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u],zero,zero,xmm14[4,11],zero,zero,xmm14[0,7,14,u,u]
-; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm24[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vmovdqu16 %ymm15, %ymm14 {%k7}
-; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm7, %ymm15 {%k4}
-; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm15, %xmm16
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u],zero,zero,xmm16[2,9],zero,zero,zero,xmm16[5,12]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u,4,11],zero,zero,xmm15[0,7,14],zero,zero
-; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15
-; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k3}
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm14
-; AVX512BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm13 {%k2}
-; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,9,2,3,4,13,6,7,24,17,10,11,28,21,14,31]
-; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm14, %zmm15
-; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [16,25,18,3,28,21,6,23,24,17,10,27,20,13,30,31]
-; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm14, %zmm16
+; AVX512BW-FCP-NEXT: vpblendmw %ymm7, %ymm4, %ymm17 {%k1}
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm17[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm17[5,12],zero,zero
+; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm17, %xmm17
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u],zero,zero,xmm17[0,7,14],zero,zero,xmm17[3,10]
+; AVX512BW-FCP-NEXT: vporq %xmm18, %xmm17, %xmm17
+; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm17
+; AVX512BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm16 {%k3}
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm16
+; AVX512BW-FCP-NEXT: vmovdqu8 %zmm16, %zmm8 {%k2}
+; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm16 {%k1}
+; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm17
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u],zero,zero,xmm17[3,10],zero,zero,zero,xmm17[6,13,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,5,12],zero,zero,xmm16[1,8,15],zero,zero,xmm16[u,u]
+; AVX512BW-FCP-NEXT: vporq %xmm17, %xmm16, %xmm16
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm17 = [0,0,2,0,0,5,0,0,16,9,26,19,12,0,30,23]
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm17, %zmm17
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm17 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vmovdqu16 %ymm17, %ymm16 {%k7}
+; AVX512BW-FCP-NEXT: vpblendmw %ymm7, %ymm4, %ymm17 {%k6}
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm17[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm17[6,13],zero,zero
+; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm17, %xmm17
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u],zero,zero,xmm17[1,8,15],zero,zero,xmm17[4,11]
+; AVX512BW-FCP-NEXT: vporq %xmm18, %xmm17, %xmm17
+; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm17
+; AVX512BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm16 {%k3}
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm16
+; AVX512BW-FCP-NEXT: vmovdqu8 %zmm16, %zmm12 {%k2}
+; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm16 {%k6}
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm16[u,u,u,u,u,6,13],zero,zero,xmm16[2,9],zero,zero,zero,xmm16[u,u]
+; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm16
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u],zero,zero,xmm16[4,11],zero,zero,xmm16[0,7,14,u,u]
+; AVX512BW-FCP-NEXT: vporq %xmm17, %xmm16, %xmm16
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm17 = [0,0,2,0,0,0,6,0,16,9,0,27,20,13,30,23]
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm17, %zmm17
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm17 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vmovdqu16 %ymm17, %ymm16 {%k7}
+; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm7, %ymm17 {%k4}
+; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm17, %xmm18
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u],zero,zero,xmm18[2,9],zero,zero,zero,xmm18[5,12]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u,4,11],zero,zero,xmm17[0,7,14],zero,zero
+; AVX512BW-FCP-NEXT: vporq %xmm18, %xmm17, %xmm17
+; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm17
+; AVX512BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm16 {%k3}
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm16
+; AVX512BW-FCP-NEXT: vmovdqu8 %zmm16, %zmm13 {%k2}
; AVX512BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
-; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm5, %ymm14 {%k2}
-; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm14, %xmm19
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u],zero,zero,zero,xmm19[5,12],zero,zero,xmm19[1,8,15,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u]
-; AVX512BW-FCP-NEXT: vporq %xmm19, %xmm14, %xmm14
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm16 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vmovdqu16 %ymm16, %ymm14 {%k7}
-; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm5, %ymm16 {%k4}
-; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm19
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u],zero,zero,zero,xmm19[6,13],zero,zero,xmm19[2,9,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,1,8,15],zero,zero,xmm16[4,11],zero,zero,xmm16[u,u,u]
-; AVX512BW-FCP-NEXT: vporq %xmm19, %xmm16, %xmm16
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm5, %ymm16 {%k2}
+; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm17
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u],zero,zero,zero,xmm17[5,12],zero,zero,xmm17[1,8,15,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,0,7,14],zero,zero,xmm16[3,10],zero,zero,zero,xmm16[u,u]
+; AVX512BW-FCP-NEXT: vporq %xmm17, %xmm16, %xmm16
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm17 = [0,0,0,3,0,0,6,0,24,17,10,27,20,13,0,31]
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm17, %zmm17
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm17 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vmovdqu16 %ymm17, %ymm16 {%k7}
+; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm5, %ymm17 {%k4}
+; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm17, %xmm18
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u],zero,zero,zero,xmm18[6,13],zero,zero,xmm18[2,9,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,1,8,15],zero,zero,xmm17[4,11],zero,zero,xmm17[u,u,u]
+; AVX512BW-FCP-NEXT: vporq %xmm18, %xmm17, %xmm18
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm17 = [0,0,0,3,0,0,0,7,24,17,10,0,28,21,14,31]
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm17, %zmm17
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm17 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: movl $8176, %eax # imm = 0x1FF0
; AVX512BW-FCP-NEXT: kmovd %eax, %k1
-; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k1}
+; AVX512BW-FCP-NEXT: vmovdqu8 %ymm18, %ymm17 {%k1}
; AVX512BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm7, %ymm16 {%k1}
-; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm19
+; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm7, %ymm18 {%k1}
+; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm19
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u,u,u],zero,zero,xmm19[3,10],zero,zero,zero,xmm19[6,13]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u,5,12],zero,zero,xmm16[1,8,15],zero,zero
-; AVX512BW-FCP-NEXT: vporq %xmm19, %xmm16, %xmm16
-; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16
-; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm14 {%k3}
-; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm7, %ymm16 {%k6}
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm16[u,u,u,u,u,u,u,6,13],zero,zero,xmm16[2,9],zero,zero,zero
-; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm16
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u],zero,zero,xmm16[4,11],zero,zero,xmm16[0,7,14]
-; AVX512BW-FCP-NEXT: vporq %xmm19, %xmm16, %xmm16
-; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16
-; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k3}
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u,5,12],zero,zero,xmm18[1,8,15],zero,zero
+; AVX512BW-FCP-NEXT: vporq %xmm19, %xmm18, %xmm18
+; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18
+; AVX512BW-FCP-NEXT: vmovdqu8 %ymm18, %ymm16 {%k3}
+; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm7, %ymm18 {%k6}
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm18[u,u,u,u,u,u,u,6,13],zero,zero,xmm18[2,9],zero,zero,zero
+; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm18
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u],zero,zero,xmm18[4,11],zero,zero,xmm18[0,7,14]
+; AVX512BW-FCP-NEXT: vporq %xmm19, %xmm18, %xmm18
+; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18
+; AVX512BW-FCP-NEXT: vmovdqu8 %ymm18, %ymm17 {%k3}
; AVX512BW-FCP-NEXT: vpblendmw %ymm10, %ymm3, %ymm19 {%k4}
-; AVX512BW-FCP-NEXT: vpblendmw %ymm10, %ymm3, %ymm16 {%k1}
+; AVX512BW-FCP-NEXT: vpblendmw %ymm10, %ymm3, %ymm18 {%k1}
; AVX512BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm20 {%k6}
; AVX512BW-FCP-NEXT: vmovdqu16 %ymm10, %ymm3 {%k6}
; AVX512BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm10 {%k1}
@@ -13979,8 +14023,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,xmm10[0,7,14],zero,zero,xmm10[3,10,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vpor %xmm11, %xmm10, %xmm10
; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm17[6,13]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm18[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm14[6,13]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm15[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
; AVX512BW-FCP-NEXT: vporq %xmm11, %xmm21, %xmm11
; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
; AVX512BW-FCP-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload
@@ -13990,8 +14034,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,xmm20[1,8,15],zero,zero,xmm20[4,11,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vporq %xmm11, %xmm20, %xmm11
; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm18[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm17[0,7,14]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm15[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm14[0,7,14]
; AVX512BW-FCP-NEXT: vporq %xmm20, %xmm21, %xmm20
; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20
; AVX512BW-FCP-NEXT: vmovdqu8 %ymm20, %ymm11 {%k3}
@@ -13999,82 +14043,82 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,xmm20[2,9],zero,zero,zero,xmm20[5,12,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,4,11],zero,zero,xmm9[0,7,14],zero,zero,xmm9[u,u,u,u,u]
; AVX512BW-FCP-NEXT: vporq %xmm20, %xmm9, %xmm9
+; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm9, %ymm0, %ymm20
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm15[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm14[1,8,15]
+; AVX512BW-FCP-NEXT: vpor %xmm9, %xmm14, %xmm9
; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm17[1,8,15]
-; AVX512BW-FCP-NEXT: vporq %xmm18, %xmm17, %xmm17
-; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm17
-; AVX512BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm9 {%k3}
-; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm17 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31]
-; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm17, %zmm17
-; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm18 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31]
-; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm18, %zmm18
-; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm20 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7]
-; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm20, %zmm20
-; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm19, %xmm2
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[4,11],zero,zero,xmm19[0,7,14],zero,zero,xmm19[u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vporq %xmm2, %xmm19, %xmm2
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm20[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [1,2,4,6,0,0,0,0]
-; AVX512BW-FCP-NEXT: vmovdqa64 224(%rdi), %ymm20
-; AVX512BW-FCP-NEXT: vpermd %ymm20, %ymm19, %ymm19
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[0,7,10,13,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm19, %zmm10, %zmm10
-; AVX512BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm2 {%k5}
-; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm10
+; AVX512BW-FCP-NEXT: vmovdqu8 %ymm9, %ymm20 {%k3}
+; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm19, %xmm9
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[2,9],zero,zero,zero,xmm9[5,12,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm19[4,11],zero,zero,xmm19[0,7,14],zero,zero,xmm19[u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpor %xmm9, %xmm14, %xmm9
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [8,1,0,19,12,5,22,15,0,0,26,0,0,29,0,0]
+; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm14, %zmm14
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 {%k5} = ymm14[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,2,4,6,0,0,0,0]
+; AVX512BW-FCP-NEXT: vmovdqa 224(%rdi), %ymm15
+; AVX512BW-FCP-NEXT: vpermd %ymm15, %ymm14, %ymm14
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[0,7,10,13,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm14, %zmm10, %zmm10
+; AVX512BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm9 {%k5}
+; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm10
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[3,10],zero,zero,zero,xmm10[6,13,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[5,12],zero,zero,xmm16[1,8,15],zero,zero,xmm16[u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vporq %xmm10, %xmm16, %xmm10
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm10 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [1,3,4,6,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermd %ymm20, %ymm16, %ymm16
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[1,4,11,14,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm16, %zmm11, %zmm11
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm18[5,12],zero,zero,xmm18[1,8,15],zero,zero,xmm18[u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpor %xmm10, %xmm14, %xmm10
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [16,9,2,19,12,5,0,23,0,0,26,0,0,0,30,0]
+; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm14, %zmm14
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm10 {%k5} = ymm14[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,3,4,6,0,0,0,0]
+; AVX512BW-FCP-NEXT: vpermd %ymm15, %ymm14, %ymm14
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[1,4,11,14,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm14, %zmm11, %zmm11
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm11, %zmm10 {%k5}
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm11, %xmm3
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 {%k5} = ymm17[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [1,3,5,6,0,0,0,0]
-; AVX512BW-FCP-NEXT: vpermd %ymm20, %ymm11, %ymm11
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[2,5,8,15,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm11, %zmm9, %zmm9
-; AVX512BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm3 {%k5}
-; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15]
-; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm9, %zmm0
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [16,9,2,0,20,13,6,23,0,0,0,27,0,0,30,0]
+; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm11, %zmm2
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 {%k5} = ymm2[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,3,5,6,0,0,0,0]
+; AVX512BW-FCP-NEXT: vpermd %ymm15, %ymm2, %ymm2
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,5,8,15,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm20, %zmm2
+; AVX512BW-FCP-NEXT: vmovdqu16 %zmm2, %zmm3 {%k5}
; AVX512BW-FCP-NEXT: vmovdqu16 %ymm6, %ymm5 {%k1}
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[u,u,u,u,2,9],zero,zero,zero,xmm5[5,12],zero,zero,xmm5[u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[u,u,u,u,2,9],zero,zero,zero,xmm5[5,12],zero,zero,xmm5[u,u,u]
; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u],zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,10,u,u,u]
-; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5
+; AVX512BW-FCP-NEXT: vpor %xmm2, %xmm5, %xmm2
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,0,0,0,4,0,0,7,0,25,18,11,28,21,14,0]
+; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm1
; AVX512BW-FCP-NEXT: movl $4186112, %eax # imm = 0x3FE000
; AVX512BW-FCP-NEXT: kmovd %eax, %k1
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 {%k1} = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm0
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 {%k1} = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm1
; AVX512BW-FCP-NEXT: movw $-512, %ax # imm = 0xFE00
; AVX512BW-FCP-NEXT: vmovdqu16 %ymm7, %ymm4 {%k2}
; AVX512BW-FCP-NEXT: kmovd %eax, %k1
-; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1}
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm0
-; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm10 {%k1}
-; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm0
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15]
+; AVX512BW-FCP-NEXT: vmovdqa32 %zmm1, %zmm9 {%k1}
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm17, %zmm0, %zmm1
+; AVX512BW-FCP-NEXT: vmovdqa32 %zmm1, %zmm10 {%k1}
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm1
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15]
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,0,7,14],zero,zero,xmm4[3,10],zero,zero,zero
-; AVX512BW-FCP-NEXT: vpor %xmm0, %xmm4, %xmm0
-; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7],ymm5[8,9,10],ymm0[11,12,13,14,15]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0
-; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1}
+; AVX512BW-FCP-NEXT: vpor %xmm1, %xmm4, %xmm1
+; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1
+; AVX512BW-FCP-NEXT: vmovdqa32 %zmm1, %zmm3 {%k1}
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rdi
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, (%rsi)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rsi)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%rdx)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, (%rcx)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, (%r8)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%r9)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, (%r9)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, (%rdi)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax)
; AVX512BW-FCP-NEXT: vzeroupper
@@ -14082,366 +14126,374 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;
; AVX512DQ-BW-LABEL: load_i8_stride7_vf64:
; AVX512DQ-BW: # %bb.0:
-; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm25
-; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [8,1,18,11,4,5,22,15,0,25,10,0,12,29,14,0]
-; AVX512DQ-BW-NEXT: vpermw %zmm25, %zmm0, %zmm18
-; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,1,18,11,4,21,14,7,8,25,10,0,28,13,0,15]
-; AVX512DQ-BW-NEXT: vpermw %zmm25, %zmm0, %zmm24
-; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,17,10,3,4,21,14,7,24,9,0,11,28,13,0,31]
-; AVX512DQ-BW-NEXT: vpermw %zmm25, %zmm0, %zmm10
-; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [16,17,10,3,20,13,6,23,24,25,0,27,28,0,30,31]
-; AVX512DQ-BW-NEXT: vpermw %zmm25, %zmm0, %zmm0
-; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm9
+; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm7
; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %ymm10
+; AVX512DQ-BW-NEXT: vmovdqa 96(%rdi), %ymm2
; AVX512DQ-BW-NEXT: movw $-28382, %ax # imm = 0x9122
; AVX512DQ-BW-NEXT: kmovd %eax, %k1
-; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm9, %ymm3 {%k1}
+; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm7, %ymm0 {%k1}
; AVX512DQ-BW-NEXT: kmovq %k1, %k2
-; AVX512DQ-BW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vporq %xmm4, %xmm3, %xmm16
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vporq %xmm3, %xmm0, %xmm17
+; AVX512DQ-BW-NEXT: movw $9288, %ax # imm = 0x2448
+; AVX512DQ-BW-NEXT: kmovd %eax, %k6
+; AVX512DQ-BW-NEXT: vpblendmw %ymm10, %ymm2, %ymm0 {%k6}
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1]
+; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7,8,9],ymm3[10],ymm0[11,12],ymm3[13],ymm0[14,15]
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-NEXT: movw $992, %ax # imm = 0x3E0
; AVX512DQ-BW-NEXT: kmovd %eax, %k1
-; AVX512DQ-BW-NEXT: vmovdqu16 %ymm0, %ymm16 {%k1}
-; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm11
+; AVX512DQ-BW-NEXT: vmovdqu16 %ymm0, %ymm17 {%k1}
+; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm12
; AVX512DQ-BW-NEXT: vmovdqa 160(%rdi), %ymm6
; AVX512DQ-BW-NEXT: movw $8772, %ax # imm = 0x2244
-; AVX512DQ-BW-NEXT: kmovd %eax, %k6
-; AVX512DQ-BW-NEXT: vpblendmw %ymm11, %ymm6, %ymm0 {%k6}
+; AVX512DQ-BW-NEXT: kmovd %eax, %k3
+; AVX512DQ-BW-NEXT: vpblendmw %ymm12, %ymm6, %ymm0 {%k3}
; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm0, %xmm3
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u,u,u]
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,5,12],zero,zero,xmm0[1,8,15],zero,zero,xmm0[u,u,u,u]
; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm0, %xmm0
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %xmm7
-; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm21 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
-; AVX512DQ-BW-NEXT: vpshufb %xmm21, %xmm7, %xmm3
-; AVX512DQ-BW-NEXT: vmovdqa 208(%rdi), %xmm8
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm8[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %xmm8
+; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm22 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
+; AVX512DQ-BW-NEXT: vpshufb %xmm22, %xmm8, %xmm3
+; AVX512DQ-BW-NEXT: vmovdqa 208(%rdi), %xmm9
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm9[u,u,u,u,u,u,2,9,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7]
-; AVX512DQ-BW-NEXT: vmovdqa64 240(%rdi), %xmm26
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm26[5,12,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vmovdqa 224(%rdi), %xmm4
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[0,7,14],zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpor %xmm5, %xmm12, %xmm5
+; AVX512DQ-BW-NEXT: vmovdqa64 240(%rdi), %xmm24
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm24[5,12,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vmovdqa64 224(%rdi), %xmm25
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm25[0,7,14],zero,zero,xmm25[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpor %xmm5, %xmm11, %xmm5
; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm5, %zmm0, %zmm0
; AVX512DQ-BW-NEXT: movabsq $137438429184, %rax # imm = 0x1FFFF80000
; AVX512DQ-BW-NEXT: kmovq %rax, %k5
-; AVX512DQ-BW-NEXT: vmovdqu8 %zmm0, %zmm16 {%k5}
-; AVX512DQ-BW-NEXT: vmovdqa 288(%rdi), %ymm13
-; AVX512DQ-BW-NEXT: vmovdqa 256(%rdi), %ymm12
-; AVX512DQ-BW-NEXT: movw $9288, %ax # imm = 0x2448
-; AVX512DQ-BW-NEXT: kmovd %eax, %k3
-; AVX512DQ-BW-NEXT: vpblendmw %ymm13, %ymm12, %ymm0 {%k3}
+; AVX512DQ-BW-NEXT: vmovdqu8 %zmm0, %zmm17 {%k5}
+; AVX512DQ-BW-NEXT: vmovdqa 288(%rdi), %ymm14
+; AVX512DQ-BW-NEXT: vmovdqa 256(%rdi), %ymm13
+; AVX512DQ-BW-NEXT: vpblendmw %ymm14, %ymm13, %ymm0 {%k6}
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm5 = xmm0[u,u,u,u,u,3,10],zero,zero,zero,xmm0[6,13],zero,zero,xmm0[u,u]
; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u],zero,zero,xmm0[1,8,15],zero,zero,xmm0[4,11,u,u]
-; AVX512DQ-BW-NEXT: vporq %xmm5, %xmm0, %xmm19
-; AVX512DQ-BW-NEXT: vmovdqa64 352(%rdi), %ymm17
+; AVX512DQ-BW-NEXT: vporq %xmm5, %xmm0, %xmm20
+; AVX512DQ-BW-NEXT: vmovdqa64 352(%rdi), %ymm18
; AVX512DQ-BW-NEXT: vmovdqa 320(%rdi), %ymm0
-; AVX512DQ-BW-NEXT: vpblendmw %ymm17, %ymm0, %ymm5 {%k6}
-; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm15 = ymm5[2,3,0,1]
-; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm15[2],ymm5[3,4,5],ymm15[6],ymm5[7,8,9],ymm15[10],ymm5[11,12,13],ymm15[14],ymm5[15]
+; AVX512DQ-BW-NEXT: vpblendmw %ymm18, %ymm0, %ymm5 {%k3}
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm11 = ymm5[2,3,0,1]
+; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm11[2],ymm5[3,4,5],ymm11[6],ymm5[7,8,9],ymm11[10],ymm5[11,12,13],ymm11[14],ymm5[15]
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-NEXT: movw $3968, %ax # imm = 0xF80
; AVX512DQ-BW-NEXT: kmovd %eax, %k7
-; AVX512DQ-BW-NEXT: vmovdqu16 %ymm5, %ymm19 {%k7}
-; AVX512DQ-BW-NEXT: vmovdqa 416(%rdi), %ymm15
+; AVX512DQ-BW-NEXT: vmovdqu16 %ymm5, %ymm20 {%k7}
+; AVX512DQ-BW-NEXT: vmovdqa64 416(%rdi), %ymm16
; AVX512DQ-BW-NEXT: vmovdqa 384(%rdi), %ymm5
; AVX512DQ-BW-NEXT: movw $4644, %ax # imm = 0x1224
; AVX512DQ-BW-NEXT: kmovd %eax, %k4
-; AVX512DQ-BW-NEXT: vpblendmw %ymm15, %ymm5, %ymm20 {%k4}
-; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm20, %xmm22
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u,u,u,u,u],zero,zero,zero,xmm22[6,13],zero,zero,xmm22[2,9]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm20[4,11],zero,zero
-; AVX512DQ-BW-NEXT: vporq %xmm22, %xmm20, %xmm20
-; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20
+; AVX512DQ-BW-NEXT: vpblendmw %ymm16, %ymm5, %ymm11 {%k4}
+; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm11, %xmm19
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u,u,u],zero,zero,zero,xmm19[6,13],zero,zero,xmm19[2,9]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm11[4,11],zero,zero
+; AVX512DQ-BW-NEXT: vporq %xmm19, %xmm11, %xmm11
+; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm11, %ymm0, %ymm21
; AVX512DQ-BW-NEXT: movl $-8388608, %eax # imm = 0xFF800000
-; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm9, %ymm22 {%k4}
-; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm22, %xmm23
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[1,8,15],zero,zero,xmm22[4,11],zero,zero,xmm22[u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vporq %xmm23, %xmm22, %xmm22
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm7, %ymm11 {%k4}
+; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm11, %xmm19
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm19 = zero,zero,zero,xmm19[6,13],zero,zero,xmm19[2,9,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[1,8,15],zero,zero,xmm11[4,11],zero,zero,xmm11[u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vporq %xmm19, %xmm11, %xmm19
+; AVX512DQ-BW-NEXT: vpblendmw %ymm2, %ymm10, %ymm11 {%k2}
+; AVX512DQ-BW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm11, %xmm15
+; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm15[2],ymm11[3,4,5],ymm15[6],ymm11[7,8,9],ymm15[10],ymm11[11,12,13],ymm15[14],ymm11[15]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-NEXT: movl $511, %edi # imm = 0x1FF
; AVX512DQ-BW-NEXT: kmovd %edi, %k1
-; AVX512DQ-BW-NEXT: vmovdqu8 %ymm22, %ymm10 {%k1}
-; AVX512DQ-BW-NEXT: vpblendmw %ymm11, %ymm6, %ymm22 {%k3}
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm23 = xmm22[u,u,u,6,13],zero,zero,xmm22[2,9],zero,zero,zero,xmm22[u,u,u,u]
-; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm22, %xmm22
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u],zero,zero,xmm22[4,11],zero,zero,xmm22[0,7,14,u,u,u,u]
-; AVX512DQ-BW-NEXT: vporq %xmm23, %xmm22, %xmm22
-; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm14
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm22 = xmm8[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm23 = xmm7[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm22 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3]
-; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm2
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5,6],ymm2[7]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm26[6,13,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm22 = xmm4[1,8,15],zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vporq %xmm14, %xmm22, %xmm14
-; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm14, %zmm2, %zmm2
-; AVX512DQ-BW-NEXT: vmovdqu8 %zmm2, %zmm10 {%k5}
-; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm9, %ymm2 {%k6}
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm2[2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm2
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpor %xmm2, %xmm14, %xmm2
+; AVX512DQ-BW-NEXT: vmovdqu8 %ymm19, %ymm11 {%k1}
+; AVX512DQ-BW-NEXT: vpblendmw %ymm12, %ymm6, %ymm15 {%k6}
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm19 = xmm15[u,u,u,6,13],zero,zero,xmm15[2,9],zero,zero,zero,xmm15[u,u,u,u]
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm15, %xmm15
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u],zero,zero,xmm15[4,11],zero,zero,xmm15[0,7,14,u,u,u,u]
+; AVX512DQ-BW-NEXT: vporq %xmm19, %xmm15, %xmm15
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm19 = xmm9[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm23 = xmm8[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm19 = xmm23[0],xmm19[0],xmm23[1],xmm19[1],xmm23[2],xmm19[2],xmm23[3],xmm19[3]
+; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm3
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3,4,5,6],ymm3[7]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm24[6,13,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm19 = xmm25[1,8,15],zero,zero,xmm25[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vporq %xmm15, %xmm19, %xmm15
+; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm15, %zmm3, %zmm3
+; AVX512DQ-BW-NEXT: vmovdqu8 %zmm3, %zmm11 {%k5}
+; AVX512DQ-BW-NEXT: vpblendmw %ymm2, %ymm10, %ymm3 {%k4}
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm3, %xmm15
+; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm15[3],ymm3[4,5],ymm15[6],ymm3[7,8,9,10],ymm15[11],ymm3[12,13],ymm15[14],ymm3[15]
+; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm7, %ymm15 {%k3}
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm19 = xmm15[2,9],zero,zero,zero,xmm15[5,12],zero,zero,xmm15[u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm15, %xmm15
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[0,7,14],zero,zero,xmm15[3,10,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vporq %xmm19, %xmm15, %xmm15
; AVX512DQ-BW-NEXT: movl $261632, %edi # imm = 0x3FE00
; AVX512DQ-BW-NEXT: kmovd %edi, %k5
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm24[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpblendmw %ymm6, %ymm11, %ymm14 {%k2}
-; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm14, %xmm22
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u],zero,zero,zero,xmm22[5,12],zero,zero,xmm22[1,8,15,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u,u,u]
-; AVX512DQ-BW-NEXT: vporq %xmm22, %xmm14, %xmm14
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512DQ-BW-NEXT: vpshufb %xmm21, %xmm8, %xmm21
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm22 = xmm7[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm21 = xmm22[0],xmm21[0],xmm22[1],xmm21[1],xmm22[2],xmm21[2],xmm22[3],xmm21[3]
-; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm3
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1,2,3,4,5,6],ymm3[7]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm4[2,9],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm21 = zero,zero,xmm26[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vporq %xmm14, %xmm21, %xmm14
-; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm14, %zmm3, %zmm23
-; AVX512DQ-BW-NEXT: vmovdqu16 %zmm2, %zmm23 {%k1}
-; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm9, %ymm2 {%k3}
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm2
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpblendmw %ymm6, %ymm11, %ymm3 {%k4}
-; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm3, %xmm18
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u],zero,zero,zero,xmm18[6,13],zero,zero,xmm18[2,9,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm15 {%k5} = ymm3[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpblendmw %ymm6, %ymm12, %ymm3 {%k2}
+; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm3, %xmm19
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u],zero,zero,zero,xmm19[5,12],zero,zero,xmm19[1,8,15,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u,u,u]
+; AVX512DQ-BW-NEXT: vporq %xmm19, %xmm3, %xmm3
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512DQ-BW-NEXT: vpshufb %xmm22, %xmm9, %xmm19
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm22 = xmm8[u,u,u,u,u,u,6,13,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm19 = xmm22[0],xmm19[0],xmm22[1],xmm19[1],xmm22[2],xmm19[2],xmm22[3],xmm19[3]
+; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm4
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm25[2,9],zero,zero,zero,xmm25[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm19 = zero,zero,xmm24[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vporq %xmm4, %xmm19, %xmm4
+; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm4, %zmm3, %zmm22
+; AVX512DQ-BW-NEXT: vmovdqu16 %zmm15, %zmm22 {%k1}
+; AVX512DQ-BW-NEXT: vpblendmw %ymm2, %ymm10, %ymm3 {%k3}
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5,6],ymm4[7,8],ymm3[9,10],ymm4[11],ymm3[12,13,14],ymm4[15]
+; AVX512DQ-BW-NEXT: vpblendmw %ymm1, %ymm7, %ymm4 {%k6}
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm4[3,10],zero,zero,zero,xmm4[6,13],zero,zero,xmm4[u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm4, %xmm4
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[1,8,15],zero,zero,xmm4[4,11,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpor %xmm4, %xmm15, %xmm4
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 {%k5} = ymm3[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpblendmw %ymm6, %ymm12, %ymm3 {%k4}
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm3, %xmm15
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u]
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vporq %xmm18, %xmm3, %xmm3
+; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm15, %xmm3
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm18 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[5,12]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm21 = xmm7[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
-; AVX512DQ-BW-NEXT: vporq %xmm18, %xmm21, %xmm18
-; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[5,12]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm19 = xmm8[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
+; AVX512DQ-BW-NEXT: vporq %xmm15, %xmm19, %xmm15
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
; AVX512DQ-BW-NEXT: movl $-134217728, %edi # imm = 0xF8000000
; AVX512DQ-BW-NEXT: kmovd %edi, %k2
; AVX512DQ-BW-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512DQ-BW-NEXT: vmovdqu8 %ymm18, %ymm3 {%k2}
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm18 = xmm4[3,10],zero,zero,zero,xmm4[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm21 = zero,zero,xmm26[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vporq %xmm18, %xmm21, %xmm18
-; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm18, %zmm3, %zmm18
-; AVX512DQ-BW-NEXT: vmovdqu16 %zmm2, %zmm18 {%k1}
+; AVX512DQ-BW-NEXT: vmovdqu8 %ymm15, %ymm3 {%k2}
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm25[3,10],zero,zero,zero,xmm25[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm19 = zero,zero,xmm24[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vporq %xmm15, %xmm19, %xmm15
+; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm15, %zmm3, %zmm19
+; AVX512DQ-BW-NEXT: vmovdqu16 %zmm4, %zmm19 {%k1}
; AVX512DQ-BW-NEXT: kmovd %eax, %k2
-; AVX512DQ-BW-NEXT: vmovdqu8 %ymm20, %ymm19 {%k2}
-; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm2
+; AVX512DQ-BW-NEXT: vmovdqu8 %ymm21, %ymm20 {%k2}
+; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm20, %zmm0, %zmm3
; AVX512DQ-BW-NEXT: movabsq $-137438953472, %rax # imm = 0xFFFFFFE000000000
; AVX512DQ-BW-NEXT: kmovq %rax, %k1
-; AVX512DQ-BW-NEXT: vmovdqu8 %zmm2, %zmm16 {%k1}
-; AVX512DQ-BW-NEXT: vpblendmw %ymm12, %ymm13, %ymm2 {%k4}
-; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,4,11],zero,zero,xmm2[0,7,14],zero,zero,xmm2[u,u]
-; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX512DQ-BW-NEXT: vpblendmw %ymm17, %ymm0, %ymm3 {%k3}
-; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1]
-; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm14[3],ymm3[4,5],ymm14[6],ymm3[7,8,9,10],ymm14[11],ymm3[12,13],ymm14[14],ymm3[15]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k7}
-; AVX512DQ-BW-NEXT: vpblendmw %ymm15, %ymm5, %ymm3 {%k6}
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm3[5,12],zero,zero
-; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm3, %xmm3
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10]
-; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm14, %xmm3
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-BW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k2}
-; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2
-; AVX512DQ-BW-NEXT: vmovdqu8 %zmm2, %zmm10 {%k1}
-; AVX512DQ-BW-NEXT: vpblendmw %ymm12, %ymm13, %ymm2 {%k6}
-; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13,u,u]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero,xmm2[u,u]
-; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm17, %ymm3 {%k4}
-; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1]
-; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm3[1,2],ymm14[3],ymm3[4,5,6],ymm14[7,8],ymm3[9,10],ymm14[11],ymm3[12,13,14],ymm14[15]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k7}
-; AVX512DQ-BW-NEXT: vpblendmw %ymm15, %ymm5, %ymm3 {%k3}
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero
+; AVX512DQ-BW-NEXT: vmovdqu8 %zmm3, %zmm17 {%k1}
+; AVX512DQ-BW-NEXT: vpblendmw %ymm13, %ymm14, %ymm3 {%k4}
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u],zero,zero,xmm4[2,9],zero,zero,zero,xmm4[5,12,u,u]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero,xmm3[u,u]
+; AVX512DQ-BW-NEXT: vpor %xmm4, %xmm3, %xmm3
+; AVX512DQ-BW-NEXT: vpblendmw %ymm18, %ymm0, %ymm4 {%k6}
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm15 = ymm4[2,3,0,1]
+; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm15[3],ymm4[4,5],ymm15[6],ymm4[7,8,9,10],ymm15[11],ymm4[12,13],ymm15[14],ymm4[15]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vmovdqu16 %ymm4, %ymm3 {%k7}
+; AVX512DQ-BW-NEXT: vpblendmw %ymm16, %ymm5, %ymm4 {%k3}
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm4[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm4[5,12],zero,zero
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm4, %xmm4
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u],zero,zero,xmm4[0,7,14],zero,zero,xmm4[3,10]
+; AVX512DQ-BW-NEXT: vpor %xmm4, %xmm15, %xmm4
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512DQ-BW-NEXT: vmovdqu8 %ymm4, %ymm3 {%k2}
+; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3
+; AVX512DQ-BW-NEXT: vmovdqu8 %zmm3, %zmm11 {%k1}
+; AVX512DQ-BW-NEXT: vpblendmw %ymm13, %ymm14, %ymm3 {%k3}
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u]
+; AVX512DQ-BW-NEXT: vpor %xmm4, %xmm3, %xmm3
+; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm18, %ymm4 {%k4}
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm15 = ymm4[2,3,0,1]
+; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0],ymm4[1,2],ymm15[3],ymm4[4,5,6],ymm15[7,8],ymm4[9,10],ymm15[11],ymm4[12,13,14],ymm15[15]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vmovdqu16 %ymm4, %ymm3 {%k7}
+; AVX512DQ-BW-NEXT: vpblendmw %ymm16, %ymm5, %ymm4 {%k6}
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm4[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm4[6,13],zero,zero
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm4, %xmm4
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u],zero,zero,xmm4[1,8,15],zero,zero,xmm4[4,11]
+; AVX512DQ-BW-NEXT: vpor %xmm4, %xmm15, %xmm4
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512DQ-BW-NEXT: vmovdqu8 %ymm4, %ymm3 {%k2}
+; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3
+; AVX512DQ-BW-NEXT: vmovdqu8 %zmm3, %zmm22 {%k1}
+; AVX512DQ-BW-NEXT: vpblendmw %ymm13, %ymm14, %ymm3 {%k6}
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u]
; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm3, %xmm3
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11]
-; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm14, %xmm3
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-BW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k2}
-; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2
-; AVX512DQ-BW-NEXT: vmovdqu8 %zmm2, %zmm23 {%k1}
-; AVX512DQ-BW-NEXT: vpblendmw %ymm12, %ymm13, %ymm2 {%k3}
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u]
-; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm2
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u]
-; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm17, %ymm3 {%k6}
-; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1]
-; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm3[1,2,3],ymm14[4],ymm3[5,6],ymm14[7,8],ymm3[9,10,11],ymm14[12],ymm3[13,14],ymm14[15]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vmovdqu16 %ymm3, %ymm2 {%k7}
-; AVX512DQ-BW-NEXT: vpblendmw %ymm5, %ymm15, %ymm3 {%k4}
-; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm3, %xmm14
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u],zero,zero,xmm14[2,9],zero,zero,zero,xmm14[5,12]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,4,11],zero,zero,xmm3[0,7,14],zero,zero
-; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm14, %xmm3
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-BW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k2}
-; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm2
-; AVX512DQ-BW-NEXT: vmovdqu8 %zmm2, %zmm18 {%k1}
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u]
+; AVX512DQ-BW-NEXT: vpor %xmm4, %xmm3, %xmm3
+; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm18, %ymm4 {%k3}
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm15 = ymm4[2,3,0,1]
+; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0],ymm4[1,2,3],ymm15[4],ymm4[5,6],ymm15[7,8],ymm4[9,10,11],ymm15[12],ymm4[13,14],ymm15[15]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vmovdqu16 %ymm4, %ymm3 {%k7}
+; AVX512DQ-BW-NEXT: vpblendmw %ymm5, %ymm16, %ymm4 {%k4}
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm4, %xmm15
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,4,11],zero,zero,xmm4[0,7,14],zero,zero
+; AVX512DQ-BW-NEXT: vpor %xmm4, %xmm15, %xmm4
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512DQ-BW-NEXT: vmovdqu8 %ymm4, %ymm3 {%k2}
+; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3
+; AVX512DQ-BW-NEXT: vmovdqu8 %zmm3, %zmm19 {%k1}
; AVX512DQ-BW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512DQ-BW-NEXT: vpblendmw %ymm13, %ymm12, %ymm2 {%k1}
-; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u]
-; AVX512DQ-BW-NEXT: vporq %xmm3, %xmm2, %xmm19
-; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm17, %ymm2 {%k3}
-; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
-; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6,7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14,15]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vmovdqu16 %ymm2, %ymm19 {%k7}
-; AVX512DQ-BW-NEXT: vpblendmw %ymm13, %ymm12, %ymm2 {%k4}
-; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9,u,u,u]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u]
-; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX512DQ-BW-NEXT: vpblendmw %ymm17, %ymm0, %ymm3 {%k1}
+; AVX512DQ-BW-NEXT: vpblendmw %ymm14, %ymm13, %ymm3 {%k1}
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15,u,u]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,0,7,14],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[u,u]
+; AVX512DQ-BW-NEXT: vporq %xmm4, %xmm3, %xmm20
+; AVX512DQ-BW-NEXT: vpblendmw %ymm0, %ymm18, %ymm3 {%k6}
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
+; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6,7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14,15]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vmovdqu16 %ymm3, %ymm20 {%k7}
+; AVX512DQ-BW-NEXT: vpblendmw %ymm14, %ymm13, %ymm3 {%k4}
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,9,u,u,u]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u]
+; AVX512DQ-BW-NEXT: vpor %xmm4, %xmm3, %xmm3
+; AVX512DQ-BW-NEXT: vpblendmw %ymm18, %ymm0, %ymm4 {%k1}
; AVX512DQ-BW-NEXT: kmovq %k1, %k7
-; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm14 = ymm3[2,3,0,1]
-; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm14[1],ymm3[2,3,4],ymm14[5],ymm3[6,7,8],ymm14[9],ymm3[10,11,12],ymm14[13],ymm3[14,15]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm20 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm15 = ymm4[2,3,0,1]
+; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm15[1],ymm4[2,3,4],ymm15[5],ymm4[6,7,8],ymm15[9],ymm4[10,11,12],ymm15[13],ymm4[14,15]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm21 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-NEXT: movl $8176, %eax # imm = 0x1FF0
; AVX512DQ-BW-NEXT: kmovd %eax, %k1
-; AVX512DQ-BW-NEXT: vmovdqu8 %ymm2, %ymm20 {%k1}
-; AVX512DQ-BW-NEXT: vpblendmw %ymm5, %ymm15, %ymm2 {%k6}
-; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[3,10],zero,zero,zero,xmm3[6,13]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero
-; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-BW-NEXT: vmovdqu8 %ymm2, %ymm19 {%k2}
-; AVX512DQ-BW-NEXT: vpblendmw %ymm5, %ymm15, %ymm2 {%k3}
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero
-; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm2
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14]
-; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-BW-NEXT: vmovdqu8 %ymm2, %ymm20 {%k2}
-; AVX512DQ-BW-NEXT: vpblendmw %ymm6, %ymm11, %ymm22 {%k6}
-; AVX512DQ-BW-NEXT: vpblendmw %ymm9, %ymm1, %ymm21 {%k6}
-; AVX512DQ-BW-NEXT: vmovdqu16 %ymm17, %ymm0 {%k4}
-; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
-; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7,8,9],ymm2[10],ymm0[11,12],ymm2[13],ymm0[14,15]
-; AVX512DQ-BW-NEXT: vmovdqu16 %ymm13, %ymm12 {%k6}
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm12[u,u,u,u,2,9],zero,zero,zero,xmm12[5,12],zero,zero,xmm12[u,u,u]
-; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm12, %xmm3
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10,u,u,u]
-; AVX512DQ-BW-NEXT: vpor %xmm0, %xmm3, %xmm0
+; AVX512DQ-BW-NEXT: vmovdqu8 %ymm3, %ymm21 {%k1}
+; AVX512DQ-BW-NEXT: vpblendmw %ymm5, %ymm16, %ymm3 {%k3}
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm3, %xmm4
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,5,12],zero,zero,xmm3[1,8,15],zero,zero
+; AVX512DQ-BW-NEXT: vpor %xmm4, %xmm3, %xmm3
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512DQ-BW-NEXT: vmovdqu8 %ymm3, %ymm20 {%k2}
+; AVX512DQ-BW-NEXT: vpblendmw %ymm5, %ymm16, %ymm3 {%k6}
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm3[u,u,u,u,u,u,u,6,13],zero,zero,xmm3[2,9],zero,zero,zero
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm3, %xmm3
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u],zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14]
+; AVX512DQ-BW-NEXT: vpor %xmm4, %xmm3, %xmm3
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512DQ-BW-NEXT: vmovdqu8 %ymm3, %ymm21 {%k2}
+; AVX512DQ-BW-NEXT: vmovdqu16 %ymm18, %ymm0 {%k4}
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1]
+; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7,8,9],ymm3[10],ymm0[11,12],ymm3[13],ymm0[14,15]
+; AVX512DQ-BW-NEXT: vmovdqu16 %ymm14, %ymm13 {%k3}
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm13[u,u,u,u,2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u]
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm13, %xmm4
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,xmm4[0,7,14],zero,zero,xmm4[3,10,u,u,u]
+; AVX512DQ-BW-NEXT: vpor %xmm0, %xmm4, %xmm0
; AVX512DQ-BW-NEXT: movl $4186112, %eax # imm = 0x3FE000
; AVX512DQ-BW-NEXT: kmovd %eax, %k1
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vmovdqu16 %ymm15, %ymm5 {%k7}
-; AVX512DQ-BW-NEXT: vpblendmw %ymm9, %ymm1, %ymm2 {%k4}
-; AVX512DQ-BW-NEXT: vpblendmw %ymm6, %ymm11, %ymm3 {%k3}
-; AVX512DQ-BW-NEXT: vmovdqu16 %ymm9, %ymm1 {%k3}
-; AVX512DQ-BW-NEXT: vmovdqu16 %ymm11, %ymm6 {%k4}
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm22[u,u,2,9],zero,zero,zero,xmm22[5,12],zero,zero,xmm22[u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm22, %xmm11
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,xmm11[0,7,14],zero,zero,xmm11[3,10,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpor %xmm9, %xmm11, %xmm9
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm8[6,13]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm7[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
-; AVX512DQ-BW-NEXT: vpor %xmm11, %xmm12, %xmm11
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512DQ-BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
-; AVX512DQ-BW-NEXT: vmovdqu8 %ymm11, %ymm9 {%k1}
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[u,u,3,10],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vmovdqu16 %ymm16, %ymm5 {%k7}
+; AVX512DQ-BW-NEXT: vpblendmw %ymm2, %ymm10, %ymm15 {%k6}
+; AVX512DQ-BW-NEXT: vpblendmw %ymm6, %ymm12, %ymm3 {%k3}
+; AVX512DQ-BW-NEXT: vpblendmw %ymm10, %ymm2, %ymm14 {%k4}
+; AVX512DQ-BW-NEXT: vpblendmw %ymm7, %ymm1, %ymm13 {%k3}
+; AVX512DQ-BW-NEXT: vmovdqu16 %ymm10, %ymm2 {%k3}
+; AVX512DQ-BW-NEXT: vpblendmw %ymm7, %ymm1, %ymm4 {%k4}
+; AVX512DQ-BW-NEXT: vpblendmw %ymm6, %ymm12, %ymm10 {%k6}
+; AVX512DQ-BW-NEXT: vmovdqu16 %ymm7, %ymm1 {%k6}
+; AVX512DQ-BW-NEXT: vmovdqu16 %ymm12, %ymm6 {%k4}
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,2,9],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[u,u,u,u,u]
; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm3, %xmm3
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[1,8,15],zero,zero,xmm3[4,11,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm11, %xmm3
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm3, %xmm3
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm8[0,7,14]
-; AVX512DQ-BW-NEXT: vpor %xmm11, %xmm12, %xmm11
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512DQ-BW-NEXT: vmovdqu8 %ymm11, %ymm3 {%k1}
-; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm6, %xmm11
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm9[6,13]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
+; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm12, %xmm7
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512DQ-BW-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 4-byte Reload
+; AVX512DQ-BW-NEXT: vmovdqu8 %ymm7, %ymm3 {%k1}
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm10[u,u,3,10],zero,zero,zero,xmm10[6,13],zero,zero,xmm10[u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm10, %xmm10
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,xmm10[1,8,15],zero,zero,xmm10[4,11,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm10, %xmm7
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[0,7,14]
+; AVX512DQ-BW-NEXT: vpor %xmm10, %xmm12, %xmm10
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512DQ-BW-NEXT: vmovdqu8 %ymm10, %ymm7 {%k1}
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm6, %xmm10
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,xmm10[2,9],zero,zero,zero,xmm10[5,12,u,u,u,u,u]
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,4,11],zero,zero,xmm6[0,7,14],zero,zero,xmm6[u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm11, %xmm6
+; AVX512DQ-BW-NEXT: vpor %xmm6, %xmm10, %xmm6
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm8[1,8,15]
-; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm8, %xmm7
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512DQ-BW-NEXT: vmovdqu8 %ymm7, %ymm6 {%k1}
-; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = [16,9,2,19,20,13,6,23,24,0,26,27,28,0,30,31]
-; AVX512DQ-BW-NEXT: vpermw %zmm25, %zmm7, %zmm7
-; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [16,9,2,19,12,5,22,23,24,0,26,27,0,29,30,31]
-; AVX512DQ-BW-NEXT: vpermw %zmm25, %zmm8, %zmm8
-; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm11 = [8,1,2,19,12,5,22,15,0,9,26,11,0,29,14,0]
-; AVX512DQ-BW-NEXT: vpermw %zmm25, %zmm11, %zmm11
-; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm12
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[2,9],zero,zero,zero,xmm12[5,12,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,zero,xmm2[u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpor %xmm2, %xmm12, %xmm2
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm11[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm11 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
-; AVX512DQ-BW-NEXT: vpshufb %xmm11, %xmm4, %xmm12
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm13 = xmm26[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
-; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm12, %zmm9, %zmm9
-; AVX512DQ-BW-NEXT: vmovdqu16 %zmm9, %zmm2 {%k5}
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm9[1,8,15]
+; AVX512DQ-BW-NEXT: vpor %xmm8, %xmm9, %xmm8
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX512DQ-BW-NEXT: vmovdqu8 %ymm8, %ymm6 {%k1}
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm15, %xmm8
+; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0],ymm15[1,2,3],ymm8[4],ymm15[5,6],ymm8[7,8],ymm15[9,10,11],ymm8[12],ymm15[13,14],ymm8[15]
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm4, %xmm9
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[2,9],zero,zero,zero,xmm9[5,12,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[4,11],zero,zero,xmm4[0,7,14],zero,zero,xmm4[u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpor %xmm4, %xmm9, %xmm4
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 {%k5} = ymm8[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
+; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm25, %xmm9
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm24[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
+; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm9, %zmm3, %zmm3
+; AVX512DQ-BW-NEXT: vmovdqu16 %zmm3, %zmm4 {%k5}
; AVX512DQ-BW-NEXT: movw $-512, %ax # imm = 0xFE00
-; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm21, %xmm9
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm14, %xmm3
+; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm3[1],ymm14[2,3],ymm3[4],ymm14[5,6,7,8],ymm3[9],ymm14[10,11],ymm3[12],ymm14[13,14,15]
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm13, %xmm9
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[3,10],zero,zero,zero,xmm9[6,13,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm21[5,12],zero,zero,xmm21[1,8,15],zero,zero,xmm21[u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpor %xmm9, %xmm12, %xmm9
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm9 {%k5} = ymm8[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm8 = xmm26[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3]
-; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm8, %zmm3, %zmm3
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm13[5,12],zero,zero,xmm13[1,8,15],zero,zero,xmm13[u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpor %xmm9, %xmm10, %xmm9
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm9 {%k5} = ymm3[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm24[3,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm10 = xmm25[5,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm3[0],xmm10[1],xmm3[1],xmm10[2],xmm3[2],xmm10[3],xmm3[3]
+; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm3, %zmm7, %zmm3
; AVX512DQ-BW-NEXT: vmovdqu16 %zmm3, %zmm9 {%k5}
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7,8],ymm3[9],ymm2[10,11,12],ymm3[13],ymm2[14,15]
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u]
; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u]
; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm1, %xmm1
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm1 {%k5} = ymm7[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpshufb %xmm11, %xmm26, %xmm3
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm3, %zmm6, %zmm3
-; AVX512DQ-BW-NEXT: vmovdqu16 %zmm3, %zmm1 {%k5}
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm1 {%k5} = ymm2[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm24, %xmm2
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm25[6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm2, %zmm6, %zmm2
+; AVX512DQ-BW-NEXT: vmovdqu16 %zmm2, %zmm1 {%k5}
; AVX512DQ-BW-NEXT: kmovd %eax, %k1
-; AVX512DQ-BW-NEXT: vinserti32x8 $1, %ymm19, %zmm0, %zmm2 {%k1}
-; AVX512DQ-BW-NEXT: vinserti32x8 $1, %ymm20, %zmm0, %zmm9 {%k1}
-; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm5, %xmm3
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u],zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u,u,u,u,u,u,0,7,14],zero,zero,xmm5[3,10],zero,zero,zero
-; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm4, %xmm3
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm3[3,4,5,6,7],ymm0[8,9,10],ymm3[11,12,13,14,15]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-BW-NEXT: vinserti32x8 $1, %ymm20, %zmm0, %zmm4 {%k1}
+; AVX512DQ-BW-NEXT: vinserti32x8 $1, %ymm21, %zmm0, %zmm9 {%k1}
+; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm5, %xmm2
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[1,8,15]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[u,u,u,u,u,u,0,7,14],zero,zero,xmm5[3,10],zero,zero,zero
+; AVX512DQ-BW-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
+; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7],ymm0[8,9,10],ymm2[11,12,13,14,15]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX512DQ-BW-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm1 {%k1}
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rdi
-; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, (%rsi)
-; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, (%rdx)
-; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, (%rcx)
-; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, (%r8)
-; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%r9)
+; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, (%rsi)
+; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, (%rdx)
+; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, (%rcx)
+; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, (%r8)
+; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, (%r9)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, (%rdi)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, (%rax)
; AVX512DQ-BW-NEXT: vzeroupper
@@ -14449,39 +14501,25 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;
; AVX512DQ-BW-FCP-LABEL: load_i8_stride7_vf64:
; AVX512DQ-BW-FCP: # %bb.0:
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [24,17,2,19,28,21,6,31,16,9,26,27,20,13,30,23]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm24
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm13
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [24,17,2,27,20,5,22,31,16,9,26,19,12,29,30,23]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm25
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm12
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,11,4,5,14,7,8,9,26,19,12,29,22,15]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm16
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm8
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,10,3,4,5,14,7,8,25,18,11,12,29,22,15]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm5
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm4
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm10
; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm3
; AVX512DQ-BW-FCP-NEXT: movw $-28382, %ax # imm = 0x9122
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm1 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm0 {%k1}
; AVX512DQ-BW-FCP-NEXT: kmovq %k1, %k2
; AVX512DQ-BW-FCP-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm6
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[5,12],zero,zero,xmm6[1,8,15,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm1, %xmm1
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[5,12],zero,zero,xmm4[1,8,15,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm4, %xmm0, %xmm0
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,17,10,3,20,13,6,0,24,0,0,27,0,0,0,31]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm4, %zmm4
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: movw $992, %ax # imm = 0x3E0
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm4, %ymm1 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm4, %ymm0 {%k1}
; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm11
; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm9
; AVX512DQ-BW-FCP-NEXT: movw $8772, %ax # imm = 0x2244
@@ -14489,71 +14527,75 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm11, %ymm9, %ymm4 {%k1}
; AVX512DQ-BW-FCP-NEXT: kmovq %k1, %k3
; AVX512DQ-BW-FCP-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm6
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u],zero,zero,xmm6[3,10],zero,zero,zero,xmm6[6,13,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[6,13,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,5,12],zero,zero,xmm4[1,8,15],zero,zero,xmm4[u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm4, %xmm4
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,1,2,4,6]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm17
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm17, %ymm6, %ymm6
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm6[7]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 240(%rdi), %xmm19
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm19[5,12,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 224(%rdi), %xmm20
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm20[0,7,14],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm7, %xmm6
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm4, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,1,2,4,6]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm12
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm12, %ymm5, %ymm5
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 240(%rdi), %xmm13
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm13[5,12,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 224(%rdi), %xmm16
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm16[0,7,14],zero,zero,xmm16[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm5, %zmm4, %zmm4
; AVX512DQ-BW-FCP-NEXT: movabsq $137438429184, %rax # imm = 0x1FFFF80000
; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k5
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm4, %zmm1 {%k5}
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm4, %zmm0 {%k5}
; AVX512DQ-BW-FCP-NEXT: vmovdqa 288(%rdi), %ymm6
; AVX512DQ-BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm4
; AVX512DQ-BW-FCP-NEXT: movw $9288, %ax # imm = 0x2448
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k6
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm4, %ymm7 {%k6}
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm7[u,u,u,u,u,3,10],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[u,u]
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u],zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,11,u,u]
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm18, %xmm7, %xmm21
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm4, %ymm5 {%k6}
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[u,u,u,u,u,3,10],zero,zero,zero,xmm5[6,13],zero,zero,xmm5[u,u]
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u],zero,zero,xmm5[1,8,15],zero,zero,xmm5[4,11,u,u]
+; AVX512DQ-BW-FCP-NEXT: vporq %xmm7, %xmm5, %xmm17
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,1,0,0,4,0,0,0,8,25,18,11,0,29,22,15]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm5
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: movw $3968, %ax # imm = 0xF80
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k7
-; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm5, %ymm21 {%k7}
+; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm5, %ymm17 {%k7}
; AVX512DQ-BW-FCP-NEXT: vmovdqa 416(%rdi), %ymm7
; AVX512DQ-BW-FCP-NEXT: vmovdqa 384(%rdi), %ymm5
; AVX512DQ-BW-FCP-NEXT: movw $4644, %ax # imm = 0x1224
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k4
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm7, %ymm5, %ymm18 {%k4}
-; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm22
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u,u,u,u,u],zero,zero,zero,xmm22[6,13],zero,zero,xmm22[2,9]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm18[4,11],zero,zero
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm22, %xmm18, %xmm18
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm22
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm7, %ymm5, %ymm8 {%k4}
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm8, %xmm14
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[2,9]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm8[4,11],zero,zero
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm14, %xmm8, %xmm8
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm8, %ymm0, %ymm18
; AVX512DQ-BW-FCP-NEXT: movl $-8388608, %eax # imm = 0xFF800000
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm18 {%k4}
-; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm23
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[1,8,15],zero,zero,xmm18[4,11],zero,zero,xmm18[u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm23, %xmm18, %xmm18
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm8 {%k4}
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm8, %xmm14
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm14[6,13],zero,zero,xmm14[2,9,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[1,8,15],zero,zero,xmm8[4,11],zero,zero,xmm8[u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm14, %xmm8, %xmm14
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,17,10,3,0,21,14,7,24,0,0,0,28,0,0,31]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm8, %zmm8
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: movl $511, %r10d # imm = 0x1FF
; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm18, %ymm8 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm11, %ymm9, %ymm18 {%k6}
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = xmm18[u,u,u,6,13],zero,zero,xmm18[2,9],zero,zero,zero,xmm18[u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm18
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u],zero,zero,xmm18[4,11],zero,zero,xmm18[0,7,14,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm23, %xmm18, %xmm18
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm14
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [0,0,0,0,1,3,4,6]
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm17, %ymm18, %ymm18
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm14, %ymm8 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm11, %ymm9, %ymm14 {%k6}
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,6,13],zero,zero,xmm14[2,9],zero,zero,zero,xmm14[u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u],zero,zero,xmm14[4,11],zero,zero,xmm14[0,7,14,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,0,0,0,1,3,4,6]
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm12, %ymm15, %ymm15
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30]
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6],ymm15[7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm19[6,13,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm20[1,8,15],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm18, %xmm15
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm13[6,13,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm16[1,8,15],zero,zero,xmm16[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm19, %xmm15
; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm14, %zmm14
; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm8 {%k5}
; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm14 {%k3}
@@ -14561,144 +14603,154 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[0,7,14],zero,zero,xmm14[3,10,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm15 = [0,0,18,11,4,21,14,7,0,25,0,0,28,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm15, %zmm15
; AVX512DQ-BW-FCP-NEXT: movl $261632, %r10d # imm = 0x3FE00
; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k5
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm12[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm12 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm12, %xmm15
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[5,12],zero,zero,xmm15[1,8,15,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,0,7,14],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm12, %xmm12
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,0,0,0,1,3,5,6]
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm17, %ymm15, %ymm15
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm15[7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm20[2,9],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,xmm19[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm17, %xmm15
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm15[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm15 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm15, %xmm19
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u],zero,zero,zero,xmm19[5,12],zero,zero,xmm19[1,8,15,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,0,7,14],zero,zero,xmm15[3,10],zero,zero,zero,xmm15[u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vporq %xmm19, %xmm15, %xmm15
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [0,0,0,0,1,3,5,6]
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm12, %ymm19, %ymm12
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3,4,5,6],ymm12[7]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm16[2,9],zero,zero,zero,xmm16[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = zero,zero,xmm13[0,7,14,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm19, %xmm15
; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm12, %zmm12
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm14, %zmm12 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm14 {%k6}
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[3,10],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm13[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm13 {%k4}
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm15
+; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm14, %xmm19
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [8,1,18,11,4,0,22,15,0,25,0,0,0,29,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm14, %zmm14
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm19 {%k5} = ymm14[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm14 {%k4}
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm13, %xmm13
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 208(%rdi), %xmm17
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm17[5,12]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %xmm18
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = xmm18[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm23, %xmm15
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,1,8,15],zero,zero,xmm14[4,11],zero,zero,xmm14[u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm14, %ymm0, %ymm20
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 208(%rdi), %xmm14
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm14[5,12]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %xmm15
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm15[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
+; AVX512DQ-BW-FCP-NEXT: vporq %xmm21, %xmm22, %xmm21
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21
; AVX512DQ-BW-FCP-NEXT: movl $-134217728, %r10d # imm = 0xF8000000
; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k2
; AVX512DQ-BW-FCP-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm13 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm20[3,10],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = zero,zero,xmm19[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm19, %xmm15
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm13, %zmm13
-; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm14, %zmm13 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm21, %ymm20 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[3,10],zero,zero,zero,xmm16[u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[1,8,15,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm13, %xmm13
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm13, %zmm20, %zmm13
+; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm19, %zmm13 {%k1}
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k3
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm22, %ymm21 {%k3}
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm21, %zmm0, %zmm14
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm18, %ymm17 {%k3}
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm17, %zmm0, %zmm16
; AVX512DQ-BW-FCP-NEXT: movabsq $-137438953472, %rax # imm = 0xFFFFFFE000000000
; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k2
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm1 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm4, %ymm6, %ymm14 {%k4}
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u],zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u]
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm15, %ymm14 {%k7}
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm16, %zmm0 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm4, %ymm6, %ymm16 {%k4}
+; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm17
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u],zero,zero,xmm17[2,9],zero,zero,zero,xmm17[5,12,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,4,11],zero,zero,xmm16[0,7,14],zero,zero,xmm16[u,u]
+; AVX512DQ-BW-FCP-NEXT: vporq %xmm17, %xmm16, %xmm16
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm17 = [0,1,0,0,0,5,0,0,8,0,26,19,12,29,22,15]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm17, %zmm17
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm17 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm17, %ymm16 {%k7}
; AVX512DQ-BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm7, %ymm5, %ymm15 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm15[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm15[5,12],zero,zero
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,xmm15[0,7,14],zero,zero,xmm15[3,10]
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k3}
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm14
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm8 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm4, %ymm6, %ymm14 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u],zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,5,12],zero,zero,xmm14[1,8,15],zero,zero,xmm14[u,u]
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm25[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm15, %ymm14 {%k7}
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm7, %ymm5, %ymm15 {%k6}
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm15[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm15[6,13],zero,zero
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,xmm15[1,8,15],zero,zero,xmm15[4,11]
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k3}
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm14
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm12 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm4, %ymm6, %ymm14 {%k6}
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u,6,13],zero,zero,xmm14[2,9],zero,zero,zero,xmm14[u,u]
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u],zero,zero,xmm14[4,11],zero,zero,xmm14[0,7,14,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm24[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm15, %ymm14 {%k7}
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm7, %ymm15 {%k4}
-; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm15, %xmm16
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u],zero,zero,xmm16[2,9],zero,zero,zero,xmm16[5,12]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u,4,11],zero,zero,xmm15[0,7,14],zero,zero
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k3}
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm14
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm13 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,9,2,3,4,13,6,7,24,17,10,11,28,21,14,31]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm14, %zmm15
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [16,25,18,3,28,21,6,23,24,17,10,27,20,13,30,31]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm14, %zmm16
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm7, %ymm5, %ymm17 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm17[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm17[5,12],zero,zero
+; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm17, %xmm17
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u],zero,zero,xmm17[0,7,14],zero,zero,xmm17[3,10]
+; AVX512DQ-BW-FCP-NEXT: vporq %xmm18, %xmm17, %xmm17
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm17
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm16 {%k3}
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm16
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm16, %zmm8 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm4, %ymm6, %ymm16 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm17
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u],zero,zero,xmm17[3,10],zero,zero,zero,xmm17[6,13,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,5,12],zero,zero,xmm16[1,8,15],zero,zero,xmm16[u,u]
+; AVX512DQ-BW-FCP-NEXT: vporq %xmm17, %xmm16, %xmm16
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm17 = [0,0,2,0,0,5,0,0,16,9,26,19,12,0,30,23]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm17, %zmm17
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm17 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm17, %ymm16 {%k7}
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm7, %ymm5, %ymm17 {%k6}
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm17[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm17[6,13],zero,zero
+; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm17, %xmm17
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u],zero,zero,xmm17[1,8,15],zero,zero,xmm17[4,11]
+; AVX512DQ-BW-FCP-NEXT: vporq %xmm18, %xmm17, %xmm17
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm17
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm16 {%k3}
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm16
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm16, %zmm12 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm4, %ymm6, %ymm16 {%k6}
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm16[u,u,u,u,u,6,13],zero,zero,xmm16[2,9],zero,zero,zero,xmm16[u,u]
+; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm16
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u],zero,zero,xmm16[4,11],zero,zero,xmm16[0,7,14,u,u]
+; AVX512DQ-BW-FCP-NEXT: vporq %xmm17, %xmm16, %xmm16
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm17 = [0,0,2,0,0,0,6,0,16,9,0,27,20,13,30,23]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm17, %zmm17
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm17 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm17, %ymm16 {%k7}
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm7, %ymm17 {%k4}
+; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm17, %xmm18
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u],zero,zero,xmm18[2,9],zero,zero,zero,xmm18[5,12]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u,4,11],zero,zero,xmm17[0,7,14],zero,zero
+; AVX512DQ-BW-FCP-NEXT: vporq %xmm18, %xmm17, %xmm17
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm17
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm16 {%k3}
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm0, %zmm16
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm16, %zmm13 {%k2}
; AVX512DQ-BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm4, %ymm14 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm14, %xmm19
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u],zero,zero,zero,xmm19[5,12],zero,zero,xmm19[1,8,15,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u]
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm19, %xmm14, %xmm14
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm16 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm16, %ymm14 {%k7}
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm4, %ymm16 {%k4}
-; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm19
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u],zero,zero,zero,xmm19[6,13],zero,zero,xmm19[2,9,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,1,8,15],zero,zero,xmm16[4,11],zero,zero,xmm16[u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm19, %xmm16, %xmm16
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm4, %ymm16 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm17
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u],zero,zero,zero,xmm17[5,12],zero,zero,xmm17[1,8,15,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,0,7,14],zero,zero,xmm16[3,10],zero,zero,zero,xmm16[u,u]
+; AVX512DQ-BW-FCP-NEXT: vporq %xmm17, %xmm16, %xmm16
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm17 = [0,0,0,3,0,0,6,0,24,17,10,27,20,13,0,31]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm17, %zmm17
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm17 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm17, %ymm16 {%k7}
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm4, %ymm17 {%k4}
+; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm17, %xmm18
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u],zero,zero,zero,xmm18[6,13],zero,zero,xmm18[2,9,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,1,8,15],zero,zero,xmm17[4,11],zero,zero,xmm17[u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vporq %xmm18, %xmm17, %xmm18
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm17 = [0,0,0,3,0,0,0,7,24,17,10,0,28,21,14,31]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm17, %zmm17
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm17 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: movl $8176, %eax # imm = 0x1FF0
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm18, %ymm17 {%k1}
; AVX512DQ-BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm7, %ymm16 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm19
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm7, %ymm18 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm19
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u,u,u],zero,zero,xmm19[3,10],zero,zero,zero,xmm19[6,13]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u,5,12],zero,zero,xmm16[1,8,15],zero,zero
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm19, %xmm16, %xmm16
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm14 {%k3}
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm7, %ymm16 {%k6}
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm16[u,u,u,u,u,u,u,6,13],zero,zero,xmm16[2,9],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm16
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u],zero,zero,xmm16[4,11],zero,zero,xmm16[0,7,14]
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm19, %xmm16, %xmm16
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k3}
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u,5,12],zero,zero,xmm18[1,8,15],zero,zero
+; AVX512DQ-BW-FCP-NEXT: vporq %xmm19, %xmm18, %xmm18
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm18, %ymm16 {%k3}
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm7, %ymm18 {%k6}
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm18[u,u,u,u,u,u,u,6,13],zero,zero,xmm18[2,9],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm18
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u],zero,zero,xmm18[4,11],zero,zero,xmm18[0,7,14]
+; AVX512DQ-BW-FCP-NEXT: vporq %xmm19, %xmm18, %xmm18
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm18, %ymm17 {%k3}
; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm10, %ymm3, %ymm19 {%k4}
-; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm10, %ymm3, %ymm16 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm10, %ymm3, %ymm18 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm20 {%k6}
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm10, %ymm3 {%k6}
; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm10 {%k1}
@@ -14708,8 +14760,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,xmm10[0,7,14],zero,zero,xmm10[3,10,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpor %xmm11, %xmm10, %xmm10
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm17[6,13]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm18[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm14[6,13]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm15[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
; AVX512DQ-BW-FCP-NEXT: vporq %xmm11, %xmm21, %xmm11
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
; AVX512DQ-BW-FCP-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload
@@ -14719,8 +14771,8 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,xmm20[1,8,15],zero,zero,xmm20[4,11,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vporq %xmm11, %xmm20, %xmm11
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm18[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm17[0,7,14]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm15[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm14[0,7,14]
; AVX512DQ-BW-FCP-NEXT: vporq %xmm20, %xmm21, %xmm20
; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20
; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm20, %ymm11 {%k2}
@@ -14728,80 +14780,80 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,xmm20[2,9],zero,zero,zero,xmm20[5,12,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,4,11],zero,zero,xmm9[0,7,14],zero,zero,xmm9[u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vporq %xmm20, %xmm9, %xmm9
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm9, %ymm0, %ymm20
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm15[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm14[1,8,15]
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm9, %xmm14, %xmm9
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm17[1,8,15]
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm18, %xmm17, %xmm17
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm17
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm9 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm17 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm17, %zmm17
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm18 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm18, %zmm18
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm20 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm20, %zmm20
-; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm19, %xmm2
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[4,11],zero,zero,xmm19[0,7,14],zero,zero,xmm19[u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm2, %xmm19, %xmm2
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm20[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [1,2,4,6,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 224(%rdi), %ymm20
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm20, %ymm19, %ymm19
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[0,7,10,13,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm19, %zmm10, %zmm10
-; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm2 {%k5}
-; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm10
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm9, %ymm20 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm19, %xmm9
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[2,9],zero,zero,zero,xmm9[5,12,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm19[4,11],zero,zero,xmm19[0,7,14],zero,zero,xmm19[u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm9, %xmm14, %xmm9
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [8,1,0,19,12,5,22,15,0,0,26,0,0,29,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm14, %zmm14
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 {%k5} = ymm14[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,2,4,6,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 224(%rdi), %ymm15
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm15, %ymm14, %ymm14
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[0,7,10,13,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm14, %zmm10, %zmm10
+; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm9 {%k5}
+; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm10
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm10[3,10],zero,zero,zero,xmm10[6,13,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[5,12],zero,zero,xmm16[1,8,15],zero,zero,xmm16[u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm10, %xmm16, %xmm10
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm10 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [1,3,4,6,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm20, %ymm16, %ymm16
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[1,4,11,14,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm16, %zmm11, %zmm11
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm18[5,12],zero,zero,xmm18[1,8,15],zero,zero,xmm18[u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm10, %xmm14, %xmm10
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [16,9,2,19,12,5,0,23,0,0,26,0,0,0,30,0]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm14, %zmm14
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm10 {%k5} = ymm14[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,3,4,6,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm15, %ymm14, %ymm14
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[1,4,11,14,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm14, %zmm11, %zmm11
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm11, %zmm10 {%k5}
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm11, %xmm3
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 {%k5} = ymm17[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [1,3,5,6,0,0,0,0]
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm20, %ymm11, %ymm11
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[2,5,8,15,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm11, %zmm9, %zmm9
-; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm3 {%k5}
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm9, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [16,9,2,0,20,13,6,23,0,0,0,27,0,0,30,0]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm11, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 {%k5} = ymm2[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,3,5,6,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm15, %ymm2, %ymm2
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,5,8,15,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm20, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm2, %zmm3 {%k5}
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm6, %ymm4 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm4[u,u,u,u,2,9],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm4[u,u,u,u,2,9],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[u,u,u]
; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,xmm4[0,7,14],zero,zero,xmm4[3,10,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm4, %xmm4
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm2, %xmm4, %xmm2
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [0,0,0,0,4,0,0,7,0,25,18,11,28,21,14,0]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm1
; AVX512DQ-BW-FCP-NEXT: movl $4186112, %eax # imm = 0x3FE000
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 {%k1} = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 {%k1} = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: movw $-512, %ax # imm = 0xFE00
; AVX512DQ-BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm7, %ymm5 {%k1}
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1
-; AVX512DQ-BW-FCP-NEXT: vinserti32x8 $1, %ymm14, %zmm0, %zmm2 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vinserti32x8 $1, %ymm15, %zmm0, %zmm10 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm0
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,0,7,14],zero,zero,xmm5[3,10],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm0, %xmm5, %xmm0
-; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7],ymm4[8,9,10],ymm0[11,12,13,14,15]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm3 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vinserti32x8 $1, %ymm16, %zmm0, %zmm9 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vinserti32x8 $1, %ymm17, %zmm0, %zmm10 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm1
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,u],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u,u,u,u,u,u,0,7,14],zero,zero,xmm5[3,10],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm1, %xmm4, %xmm1
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm3 {%k1}
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rdi
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, (%rsi)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rsi)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, (%rcx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, (%r8)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%r9)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, (%r9)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, (%rdi)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll
index 99932c0026b23..950d18719c6f2 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll
@@ -2726,108 +2726,108 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
-; AVX512-NEXT: vmovdqa 112(%rdi), %xmm0
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
+; AVX512-NEXT: vmovdqa 80(%rdi), %xmm0
; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm3
-; AVX512-NEXT: vmovdqa 96(%rdi), %xmm1
+; AVX512-NEXT: vmovdqa 64(%rdi), %xmm1
; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm2
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
-; AVX512-NEXT: vmovdqa 80(%rdi), %xmm2
-; AVX512-NEXT: vpshufb %xmm5, %xmm2, %xmm6
-; AVX512-NEXT: vmovdqa 64(%rdi), %xmm3
-; AVX512-NEXT: vpshufb %xmm5, %xmm3, %xmm5
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3]
-; AVX512-NEXT: vmovdqa64 (%rdi), %zmm5
-; AVX512-NEXT: vpmovqb %zmm5, %xmm6
-; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
-; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm6 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
-; AVX512-NEXT: vpshufb %xmm6, %xmm0, %xmm7
-; AVX512-NEXT: vpshufb %xmm6, %xmm1, %xmm6
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512-NEXT: vmovdqa64 (%rdi), %zmm2
+; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm4
+; AVX512-NEXT: vpmovqb %zmm4, %xmm5
+; AVX512-NEXT: vmovdqa 96(%rdi), %xmm4
+; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm4[u,u,u,u,u,u,0,8,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3]
+; AVX512-NEXT: vpmovqb %zmm2, %xmm5
+; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
+; AVX512-NEXT: vmovdqa 112(%rdi), %xmm6
+; AVX512-NEXT: vpshufb %xmm5, %xmm6, %xmm7
+; AVX512-NEXT: vpshufb %xmm5, %xmm4, %xmm5
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm7 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
-; AVX512-NEXT: vpshufb %xmm7, %xmm2, %xmm8
-; AVX512-NEXT: vpshufb %xmm7, %xmm3, %xmm7
+; AVX512-NEXT: vpshufb %xmm7, %xmm0, %xmm8
+; AVX512-NEXT: vpshufb %xmm7, %xmm1, %xmm7
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3]
-; AVX512-NEXT: vpsrlq $8, %zmm5, %zmm7
+; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3]
+; AVX512-NEXT: vpsrlq $8, %zmm2, %zmm7
; AVX512-NEXT: vpmovqb %zmm7, %xmm7
-; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3]
; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm7 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
-; AVX512-NEXT: vpshufb %xmm7, %xmm0, %xmm8
-; AVX512-NEXT: vpshufb %xmm7, %xmm1, %xmm7
+; AVX512-NEXT: vpshufb %xmm7, %xmm6, %xmm8
+; AVX512-NEXT: vpshufb %xmm7, %xmm4, %xmm7
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm8 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
-; AVX512-NEXT: vpshufb %xmm8, %xmm2, %xmm9
-; AVX512-NEXT: vpshufb %xmm8, %xmm3, %xmm8
+; AVX512-NEXT: vpshufb %xmm8, %xmm0, %xmm9
+; AVX512-NEXT: vpshufb %xmm8, %xmm1, %xmm8
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
; AVX512-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3]
-; AVX512-NEXT: vpsrlq $16, %zmm5, %zmm8
+; AVX512-NEXT: vpsrlq $16, %zmm2, %zmm8
; AVX512-NEXT: vpmovqb %zmm8, %xmm8
; AVX512-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm8 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
-; AVX512-NEXT: vpshufb %xmm8, %xmm0, %xmm9
-; AVX512-NEXT: vpshufb %xmm8, %xmm1, %xmm8
+; AVX512-NEXT: vpshufb %xmm8, %xmm6, %xmm9
+; AVX512-NEXT: vpshufb %xmm8, %xmm4, %xmm8
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm9 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
-; AVX512-NEXT: vpshufb %xmm9, %xmm2, %xmm10
-; AVX512-NEXT: vpshufb %xmm9, %xmm3, %xmm9
+; AVX512-NEXT: vpshufb %xmm9, %xmm0, %xmm10
+; AVX512-NEXT: vpshufb %xmm9, %xmm1, %xmm9
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
-; AVX512-NEXT: vpsrlq $24, %zmm5, %zmm9
+; AVX512-NEXT: vpsrlq $24, %zmm2, %zmm9
; AVX512-NEXT: vpmovqb %zmm9, %xmm9
; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm9 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
-; AVX512-NEXT: vpshufb %xmm9, %xmm0, %xmm10
-; AVX512-NEXT: vpshufb %xmm9, %xmm1, %xmm9
+; AVX512-NEXT: vpshufb %xmm9, %xmm6, %xmm10
+; AVX512-NEXT: vpshufb %xmm9, %xmm4, %xmm9
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm10 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
-; AVX512-NEXT: vpshufb %xmm10, %xmm2, %xmm11
-; AVX512-NEXT: vpshufb %xmm10, %xmm3, %xmm10
+; AVX512-NEXT: vpshufb %xmm10, %xmm0, %xmm11
+; AVX512-NEXT: vpshufb %xmm10, %xmm1, %xmm10
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
; AVX512-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3]
-; AVX512-NEXT: vpsrlq $32, %zmm5, %zmm10
+; AVX512-NEXT: vpsrlq $32, %zmm2, %zmm10
; AVX512-NEXT: vpmovqb %zmm10, %xmm10
; AVX512-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm10 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
-; AVX512-NEXT: vpshufb %xmm10, %xmm0, %xmm11
-; AVX512-NEXT: vpshufb %xmm10, %xmm1, %xmm10
+; AVX512-NEXT: vpshufb %xmm10, %xmm6, %xmm11
+; AVX512-NEXT: vpshufb %xmm10, %xmm4, %xmm10
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm11 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
-; AVX512-NEXT: vpshufb %xmm11, %xmm2, %xmm12
-; AVX512-NEXT: vpshufb %xmm11, %xmm3, %xmm11
+; AVX512-NEXT: vpshufb %xmm11, %xmm0, %xmm12
+; AVX512-NEXT: vpshufb %xmm11, %xmm1, %xmm11
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
; AVX512-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3]
-; AVX512-NEXT: vpsrlq $40, %zmm5, %zmm11
+; AVX512-NEXT: vpsrlq $40, %zmm2, %zmm11
; AVX512-NEXT: vpmovqb %zmm11, %xmm11
; AVX512-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3]
; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm11 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
-; AVX512-NEXT: vpshufb %xmm11, %xmm0, %xmm12
-; AVX512-NEXT: vpshufb %xmm11, %xmm1, %xmm11
+; AVX512-NEXT: vpshufb %xmm11, %xmm6, %xmm12
+; AVX512-NEXT: vpshufb %xmm11, %xmm4, %xmm11
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm12 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
-; AVX512-NEXT: vpshufb %xmm12, %xmm2, %xmm13
-; AVX512-NEXT: vpshufb %xmm12, %xmm3, %xmm12
+; AVX512-NEXT: vpshufb %xmm12, %xmm0, %xmm13
+; AVX512-NEXT: vpshufb %xmm12, %xmm1, %xmm12
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
; AVX512-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3]
-; AVX512-NEXT: vpsrlq $48, %zmm5, %zmm12
+; AVX512-NEXT: vpsrlq $48, %zmm2, %zmm12
; AVX512-NEXT: vpmovqb %zmm12, %xmm12
; AVX512-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3]
; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm12 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
-; AVX512-NEXT: vpshufb %xmm12, %xmm0, %xmm0
-; AVX512-NEXT: vpshufb %xmm12, %xmm1, %xmm1
+; AVX512-NEXT: vpshufb %xmm12, %xmm6, %xmm6
+; AVX512-NEXT: vpshufb %xmm12, %xmm4, %xmm4
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm6 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
+; AVX512-NEXT: vpshufb %xmm6, %xmm0, %xmm0
+; AVX512-NEXT: vpshufb %xmm6, %xmm1, %xmm1
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
-; AVX512-NEXT: vpshufb %xmm1, %xmm2, %xmm2
-; AVX512-NEXT: vpshufb %xmm1, %xmm3, %xmm1
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX512-NEXT: vpsrlq $56, %zmm5, %zmm1
+; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3]
+; AVX512-NEXT: vpsrlq $56, %zmm2, %zmm1
; AVX512-NEXT: vpmovqb %zmm1, %xmm1
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512-NEXT: vmovdqa %xmm4, (%rsi)
-; AVX512-NEXT: vmovdqa %xmm6, (%rdx)
+; AVX512-NEXT: vmovdqa %xmm3, (%rsi)
+; AVX512-NEXT: vmovdqa %xmm5, (%rdx)
; AVX512-NEXT: vmovdqa %xmm7, (%rcx)
; AVX512-NEXT: vmovdqa %xmm8, (%r8)
; AVX512-NEXT: vmovdqa %xmm9, (%r9)
@@ -2842,74 +2842,76 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm9
-; AVX512-FCP-NEXT: vpmovqd %ymm9, %xmm8
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm0
-; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm10
-; AVX512-FCP-NEXT: vpmovqd %ymm10, %xmm11
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm11, %xmm1
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
+; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm11
+; AVX512-FCP-NEXT: vpmovqd %ymm11, %xmm7
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm7, %xmm0
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [8,10,12,14,12,14,14,15]
; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm12
+; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm13
+; AVX512-FCP-NEXT: vpermd %zmm13, %zmm1, %zmm14
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm14, %xmm1
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX512-FCP-NEXT: vpmovqb %zmm12, %xmm1
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm8, %xmm1
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm7, %xmm1
; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm11, %xmm2
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
-; AVX512-FCP-NEXT: vpsrlq $8, %zmm12, %zmm2
-; AVX512-FCP-NEXT: vpmovqb %zmm2, %xmm2
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm8, %xmm2
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm11, %xmm14
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3]
-; AVX512-FCP-NEXT: vpsrlq $16, %zmm12, %zmm14
-; AVX512-FCP-NEXT: vpmovqb %zmm14, %xmm14
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm8, %xmm8
+; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm14, %xmm3
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3]
+; AVX512-FCP-NEXT: vpsrlq $8, %zmm12, %zmm3
+; AVX512-FCP-NEXT: vpmovqb %zmm3, %xmm3
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm7, %xmm3
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm14, %xmm8
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm8[3]
+; AVX512-FCP-NEXT: vpsrlq $16, %zmm12, %zmm8
+; AVX512-FCP-NEXT: vpmovqb %zmm8, %xmm8
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3]
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm8 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm7
; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm15 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm11, %xmm11
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm11[0,1,2],xmm8[3]
-; AVX512-FCP-NEXT: vpsrlq $24, %zmm12, %zmm11
-; AVX512-FCP-NEXT: vpmovqb %zmm11, %xmm11
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm11[0,1],xmm8[2,3]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [1,3,5,7,5,7,6,7]
-; AVX512-FCP-NEXT: vpermd %ymm9, %ymm11, %ymm9
-; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm3
-; AVX512-FCP-NEXT: vpermd %ymm10, %ymm11, %ymm10
-; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm10, %xmm4
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3]
-; AVX512-FCP-NEXT: vpsrlq $32, %zmm12, %zmm4
-; AVX512-FCP-NEXT: vpmovqb %zmm4, %xmm4
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
-; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm9, %xmm4
-; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm10, %xmm5
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3]
-; AVX512-FCP-NEXT: vpsrlq $40, %zmm12, %zmm5
-; AVX512-FCP-NEXT: vpmovqb %zmm5, %xmm5
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
-; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm9, %xmm5
-; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm10, %xmm6
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3]
+; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm14, %xmm14
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1,2],xmm14[3]
+; AVX512-FCP-NEXT: vpsrlq $24, %zmm12, %zmm14
+; AVX512-FCP-NEXT: vpmovqb %zmm14, %xmm14
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm14[0,1],xmm7[2,3]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [9,11,13,15,13,15,14,15]
+; AVX512-FCP-NEXT: vpermd %zmm13, %zmm14, %zmm13
+; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm13, %xmm9
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,3,5,7,0,0,0,0]
+; AVX512-FCP-NEXT: vpermd %ymm11, %ymm14, %ymm11
+; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm11, %xmm2
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],xmm9[3]
+; AVX512-FCP-NEXT: vpsrlq $32, %zmm12, %zmm9
+; AVX512-FCP-NEXT: vpmovqb %zmm9, %xmm9
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3]
+; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm13, %xmm6
+; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm11, %xmm4
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm6[3]
+; AVX512-FCP-NEXT: vpsrlq $40, %zmm12, %zmm6
+; AVX512-FCP-NEXT: vpmovqb %zmm6, %xmm6
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
+; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm13, %xmm6
+; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm11, %xmm5
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3]
; AVX512-FCP-NEXT: vpsrlq $48, %zmm12, %zmm6
; AVX512-FCP-NEXT: vpmovqb %zmm6, %xmm6
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
-; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm9, %xmm6
-; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm10, %xmm7
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3]
-; AVX512-FCP-NEXT: vpsrlq $56, %zmm12, %zmm7
-; AVX512-FCP-NEXT: vpmovqb %zmm7, %xmm7
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
+; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm13, %xmm6
+; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm11, %xmm8
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3]
+; AVX512-FCP-NEXT: vpsrlq $56, %zmm12, %zmm8
+; AVX512-FCP-NEXT: vpmovqb %zmm8, %xmm8
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3]
; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512-FCP-NEXT: vmovdqa %xmm1, (%rdx)
-; AVX512-FCP-NEXT: vmovdqa %xmm2, (%rcx)
-; AVX512-FCP-NEXT: vmovdqa %xmm8, (%r8)
-; AVX512-FCP-NEXT: vmovdqa %xmm3, (%r9)
+; AVX512-FCP-NEXT: vmovdqa %xmm3, (%rcx)
+; AVX512-FCP-NEXT: vmovdqa %xmm7, (%r8)
+; AVX512-FCP-NEXT: vmovdqa %xmm2, (%r9)
; AVX512-FCP-NEXT: vmovdqa %xmm4, (%r11)
; AVX512-FCP-NEXT: vmovdqa %xmm5, (%r10)
; AVX512-FCP-NEXT: vmovdqa %xmm6, (%rax)
@@ -2921,108 +2923,108 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
-; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm0
+; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
+; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm0
; AVX512DQ-NEXT: vpshufb %xmm2, %xmm0, %xmm3
-; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm1
+; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm1
; AVX512DQ-NEXT: vpshufb %xmm2, %xmm1, %xmm2
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
-; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm2
-; AVX512DQ-NEXT: vpshufb %xmm5, %xmm2, %xmm6
-; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm3
-; AVX512DQ-NEXT: vpshufb %xmm5, %xmm3, %xmm5
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3]
-; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm5
-; AVX512DQ-NEXT: vpmovqb %zmm5, %xmm6
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
-; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm6 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
-; AVX512DQ-NEXT: vpshufb %xmm6, %xmm0, %xmm7
-; AVX512DQ-NEXT: vpshufb %xmm6, %xmm1, %xmm6
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm2
+; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm4
+; AVX512DQ-NEXT: vpmovqb %zmm4, %xmm5
+; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm4
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm4[u,u,u,u,u,u,0,8,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3]
+; AVX512DQ-NEXT: vpmovqb %zmm2, %xmm5
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
+; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
+; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm6
+; AVX512DQ-NEXT: vpshufb %xmm5, %xmm6, %xmm7
+; AVX512DQ-NEXT: vpshufb %xmm5, %xmm4, %xmm5
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm7 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
-; AVX512DQ-NEXT: vpshufb %xmm7, %xmm2, %xmm8
-; AVX512DQ-NEXT: vpshufb %xmm7, %xmm3, %xmm7
+; AVX512DQ-NEXT: vpshufb %xmm7, %xmm0, %xmm8
+; AVX512DQ-NEXT: vpshufb %xmm7, %xmm1, %xmm7
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3]
-; AVX512DQ-NEXT: vpsrlq $8, %zmm5, %zmm7
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3]
+; AVX512DQ-NEXT: vpsrlq $8, %zmm2, %zmm7
; AVX512DQ-NEXT: vpmovqb %zmm7, %xmm7
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3]
; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm7 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
-; AVX512DQ-NEXT: vpshufb %xmm7, %xmm0, %xmm8
-; AVX512DQ-NEXT: vpshufb %xmm7, %xmm1, %xmm7
+; AVX512DQ-NEXT: vpshufb %xmm7, %xmm6, %xmm8
+; AVX512DQ-NEXT: vpshufb %xmm7, %xmm4, %xmm7
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm8 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
-; AVX512DQ-NEXT: vpshufb %xmm8, %xmm2, %xmm9
-; AVX512DQ-NEXT: vpshufb %xmm8, %xmm3, %xmm8
+; AVX512DQ-NEXT: vpshufb %xmm8, %xmm0, %xmm9
+; AVX512DQ-NEXT: vpshufb %xmm8, %xmm1, %xmm8
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3]
-; AVX512DQ-NEXT: vpsrlq $16, %zmm5, %zmm8
+; AVX512DQ-NEXT: vpsrlq $16, %zmm2, %zmm8
; AVX512DQ-NEXT: vpmovqb %zmm8, %xmm8
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm8 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
-; AVX512DQ-NEXT: vpshufb %xmm8, %xmm0, %xmm9
-; AVX512DQ-NEXT: vpshufb %xmm8, %xmm1, %xmm8
+; AVX512DQ-NEXT: vpshufb %xmm8, %xmm6, %xmm9
+; AVX512DQ-NEXT: vpshufb %xmm8, %xmm4, %xmm8
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm9 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
-; AVX512DQ-NEXT: vpshufb %xmm9, %xmm2, %xmm10
-; AVX512DQ-NEXT: vpshufb %xmm9, %xmm3, %xmm9
+; AVX512DQ-NEXT: vpshufb %xmm9, %xmm0, %xmm10
+; AVX512DQ-NEXT: vpshufb %xmm9, %xmm1, %xmm9
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
-; AVX512DQ-NEXT: vpsrlq $24, %zmm5, %zmm9
+; AVX512DQ-NEXT: vpsrlq $24, %zmm2, %zmm9
; AVX512DQ-NEXT: vpmovqb %zmm9, %xmm9
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm9 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
-; AVX512DQ-NEXT: vpshufb %xmm9, %xmm0, %xmm10
-; AVX512DQ-NEXT: vpshufb %xmm9, %xmm1, %xmm9
+; AVX512DQ-NEXT: vpshufb %xmm9, %xmm6, %xmm10
+; AVX512DQ-NEXT: vpshufb %xmm9, %xmm4, %xmm9
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm10 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
-; AVX512DQ-NEXT: vpshufb %xmm10, %xmm2, %xmm11
-; AVX512DQ-NEXT: vpshufb %xmm10, %xmm3, %xmm10
+; AVX512DQ-NEXT: vpshufb %xmm10, %xmm0, %xmm11
+; AVX512DQ-NEXT: vpshufb %xmm10, %xmm1, %xmm10
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3]
-; AVX512DQ-NEXT: vpsrlq $32, %zmm5, %zmm10
+; AVX512DQ-NEXT: vpsrlq $32, %zmm2, %zmm10
; AVX512DQ-NEXT: vpmovqb %zmm10, %xmm10
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm10 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
-; AVX512DQ-NEXT: vpshufb %xmm10, %xmm0, %xmm11
-; AVX512DQ-NEXT: vpshufb %xmm10, %xmm1, %xmm10
+; AVX512DQ-NEXT: vpshufb %xmm10, %xmm6, %xmm11
+; AVX512DQ-NEXT: vpshufb %xmm10, %xmm4, %xmm10
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm11 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
-; AVX512DQ-NEXT: vpshufb %xmm11, %xmm2, %xmm12
-; AVX512DQ-NEXT: vpshufb %xmm11, %xmm3, %xmm11
+; AVX512DQ-NEXT: vpshufb %xmm11, %xmm0, %xmm12
+; AVX512DQ-NEXT: vpshufb %xmm11, %xmm1, %xmm11
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3]
-; AVX512DQ-NEXT: vpsrlq $40, %zmm5, %zmm11
+; AVX512DQ-NEXT: vpsrlq $40, %zmm2, %zmm11
; AVX512DQ-NEXT: vpmovqb %zmm11, %xmm11
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3]
; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm11 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
-; AVX512DQ-NEXT: vpshufb %xmm11, %xmm0, %xmm12
-; AVX512DQ-NEXT: vpshufb %xmm11, %xmm1, %xmm11
+; AVX512DQ-NEXT: vpshufb %xmm11, %xmm6, %xmm12
+; AVX512DQ-NEXT: vpshufb %xmm11, %xmm4, %xmm11
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm12 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
-; AVX512DQ-NEXT: vpshufb %xmm12, %xmm2, %xmm13
-; AVX512DQ-NEXT: vpshufb %xmm12, %xmm3, %xmm12
+; AVX512DQ-NEXT: vpshufb %xmm12, %xmm0, %xmm13
+; AVX512DQ-NEXT: vpshufb %xmm12, %xmm1, %xmm12
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3]
-; AVX512DQ-NEXT: vpsrlq $48, %zmm5, %zmm12
+; AVX512DQ-NEXT: vpsrlq $48, %zmm2, %zmm12
; AVX512DQ-NEXT: vpmovqb %zmm12, %xmm12
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3]
; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm12 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
-; AVX512DQ-NEXT: vpshufb %xmm12, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpshufb %xmm12, %xmm1, %xmm1
+; AVX512DQ-NEXT: vpshufb %xmm12, %xmm6, %xmm6
+; AVX512DQ-NEXT: vpshufb %xmm12, %xmm4, %xmm4
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
+; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm6 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
+; AVX512DQ-NEXT: vpshufb %xmm6, %xmm0, %xmm0
+; AVX512DQ-NEXT: vpshufb %xmm6, %xmm1, %xmm1
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm1 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm2, %xmm2
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm3, %xmm1
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX512DQ-NEXT: vpsrlq $56, %zmm5, %zmm1
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3]
+; AVX512DQ-NEXT: vpsrlq $56, %zmm2, %zmm1
; AVX512DQ-NEXT: vpmovqb %zmm1, %xmm1
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512DQ-NEXT: vmovdqa %xmm4, (%rsi)
-; AVX512DQ-NEXT: vmovdqa %xmm6, (%rdx)
+; AVX512DQ-NEXT: vmovdqa %xmm3, (%rsi)
+; AVX512DQ-NEXT: vmovdqa %xmm5, (%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm7, (%rcx)
; AVX512DQ-NEXT: vmovdqa %xmm8, (%r8)
; AVX512DQ-NEXT: vmovdqa %xmm9, (%r9)
@@ -3037,74 +3039,76 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm9
-; AVX512DQ-FCP-NEXT: vpmovqd %ymm9, %xmm8
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm0
-; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm10
-; AVX512DQ-FCP-NEXT: vpmovqd %ymm10, %xmm11
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm11, %xmm1
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
+; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm11
+; AVX512DQ-FCP-NEXT: vpmovqd %ymm11, %xmm7
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm7, %xmm0
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [8,10,12,14,12,14,14,15]
; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm12
+; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm13
+; AVX512DQ-FCP-NEXT: vpermd %zmm13, %zmm1, %zmm14
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm14, %xmm1
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX512DQ-FCP-NEXT: vpmovqb %zmm12, %xmm1
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm8, %xmm1
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm7, %xmm1
; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm11, %xmm2
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
-; AVX512DQ-FCP-NEXT: vpsrlq $8, %zmm12, %zmm2
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm2, %xmm2
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm8, %xmm2
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm11, %xmm14
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3]
-; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm12, %zmm14
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm14, %xmm14
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm8, %xmm8
+; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm14, %xmm3
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3]
+; AVX512DQ-FCP-NEXT: vpsrlq $8, %zmm12, %zmm3
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm3, %xmm3
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm7, %xmm3
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm14, %xmm8
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm8[3]
+; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm12, %zmm8
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm8, %xmm8
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3]
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm8 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm7
; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm15 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm11, %xmm11
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm11[0,1,2],xmm8[3]
-; AVX512DQ-FCP-NEXT: vpsrlq $24, %zmm12, %zmm11
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm11, %xmm11
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm11[0,1],xmm8[2,3]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [1,3,5,7,5,7,6,7]
-; AVX512DQ-FCP-NEXT: vpermd %ymm9, %ymm11, %ymm9
-; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm3
-; AVX512DQ-FCP-NEXT: vpermd %ymm10, %ymm11, %ymm10
-; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm10, %xmm4
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3]
-; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm12, %zmm4
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm4, %xmm4
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm9, %xmm4
-; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm10, %xmm5
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3]
-; AVX512DQ-FCP-NEXT: vpsrlq $40, %zmm12, %zmm5
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm5, %xmm5
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm9, %xmm5
-; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm10, %xmm6
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm14, %xmm14
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1,2],xmm14[3]
+; AVX512DQ-FCP-NEXT: vpsrlq $24, %zmm12, %zmm14
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm14, %xmm14
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm14[0,1],xmm7[2,3]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [9,11,13,15,13,15,14,15]
+; AVX512DQ-FCP-NEXT: vpermd %zmm13, %zmm14, %zmm13
+; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm13, %xmm9
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,3,5,7,0,0,0,0]
+; AVX512DQ-FCP-NEXT: vpermd %ymm11, %ymm14, %ymm11
+; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm11, %xmm2
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],xmm9[3]
+; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm12, %zmm9
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm9, %xmm9
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm13, %xmm6
+; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm11, %xmm4
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm6[3]
+; AVX512DQ-FCP-NEXT: vpsrlq $40, %zmm12, %zmm6
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm6, %xmm6
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm13, %xmm6
+; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm11, %xmm5
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3]
; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm12, %zmm6
; AVX512DQ-FCP-NEXT: vpmovqb %zmm6, %xmm6
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm9, %xmm6
-; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm10, %xmm7
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3]
-; AVX512DQ-FCP-NEXT: vpsrlq $56, %zmm12, %zmm7
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm7, %xmm7
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm13, %xmm6
+; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm11, %xmm8
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3]
+; AVX512DQ-FCP-NEXT: vpsrlq $56, %zmm12, %zmm8
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm8, %xmm8
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3]
; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, (%rcx)
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm8, (%r8)
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, (%r9)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, (%rcx)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm7, (%r8)
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, (%r9)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, (%r11)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, (%r10)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, (%rax)
@@ -3117,107 +3121,107 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
-; AVX512BW-NEXT: vmovdqa 112(%rdi), %xmm0
+; AVX512BW-NEXT: vmovdqa 80(%rdi), %xmm0
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm3
-; AVX512BW-NEXT: vmovdqa 96(%rdi), %xmm1
+; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm1
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm2
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
-; AVX512BW-NEXT: vmovdqa 80(%rdi), %xmm2
-; AVX512BW-NEXT: vpshufb %xmm5, %xmm2, %xmm6
-; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm3
-; AVX512BW-NEXT: vpshufb %xmm5, %xmm3, %xmm5
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3]
-; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm5
-; AVX512BW-NEXT: vpmovqb %zmm5, %xmm6
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm6 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX512BW-NEXT: vpshufb %xmm6, %xmm0, %xmm7
-; AVX512BW-NEXT: vpshufb %xmm6, %xmm1, %xmm6
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2
+; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm4
+; AVX512BW-NEXT: vpmovqb %zmm4, %xmm5
+; AVX512BW-NEXT: vmovdqa 96(%rdi), %xmm4
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm4[u,u,u,u,u,u,0,8,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3]
+; AVX512BW-NEXT: vpmovqb %zmm2, %xmm5
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX512BW-NEXT: vmovdqa 112(%rdi), %xmm6
+; AVX512BW-NEXT: vpshufb %xmm5, %xmm6, %xmm7
+; AVX512BW-NEXT: vpshufb %xmm5, %xmm4, %xmm5
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm7 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX512BW-NEXT: vpshufb %xmm7, %xmm2, %xmm8
-; AVX512BW-NEXT: vpshufb %xmm7, %xmm3, %xmm7
+; AVX512BW-NEXT: vpshufb %xmm7, %xmm0, %xmm8
+; AVX512BW-NEXT: vpshufb %xmm7, %xmm1, %xmm7
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3]
-; AVX512BW-NEXT: vpsrlq $8, %zmm5, %zmm7
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3]
+; AVX512BW-NEXT: vpsrlq $8, %zmm2, %zmm7
; AVX512BW-NEXT: vpmovqb %zmm7, %xmm7
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3]
; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm7 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX512BW-NEXT: vpshufb %xmm7, %xmm0, %xmm8
-; AVX512BW-NEXT: vpshufb %xmm7, %xmm1, %xmm7
+; AVX512BW-NEXT: vpshufb %xmm7, %xmm6, %xmm8
+; AVX512BW-NEXT: vpshufb %xmm7, %xmm4, %xmm7
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX512BW-NEXT: vpshufb %xmm8, %xmm2, %xmm9
-; AVX512BW-NEXT: vpshufb %xmm8, %xmm3, %xmm8
+; AVX512BW-NEXT: vpshufb %xmm8, %xmm0, %xmm9
+; AVX512BW-NEXT: vpshufb %xmm8, %xmm1, %xmm8
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3]
-; AVX512BW-NEXT: vpsrlq $16, %zmm5, %zmm8
+; AVX512BW-NEXT: vpsrlq $16, %zmm2, %zmm8
; AVX512BW-NEXT: vpmovqb %zmm8, %xmm8
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX512BW-NEXT: vpshufb %xmm8, %xmm0, %xmm9
-; AVX512BW-NEXT: vpshufb %xmm8, %xmm1, %xmm8
+; AVX512BW-NEXT: vpshufb %xmm8, %xmm6, %xmm9
+; AVX512BW-NEXT: vpshufb %xmm8, %xmm4, %xmm8
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm9 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX512BW-NEXT: vpshufb %xmm9, %xmm2, %xmm10
-; AVX512BW-NEXT: vpshufb %xmm9, %xmm3, %xmm9
+; AVX512BW-NEXT: vpshufb %xmm9, %xmm0, %xmm10
+; AVX512BW-NEXT: vpshufb %xmm9, %xmm1, %xmm9
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
-; AVX512BW-NEXT: vpsrlq $24, %zmm5, %zmm9
+; AVX512BW-NEXT: vpsrlq $24, %zmm2, %zmm9
; AVX512BW-NEXT: vpmovqb %zmm9, %xmm9
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm9 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX512BW-NEXT: vpshufb %xmm9, %xmm0, %xmm10
-; AVX512BW-NEXT: vpshufb %xmm9, %xmm1, %xmm9
+; AVX512BW-NEXT: vpshufb %xmm9, %xmm6, %xmm10
+; AVX512BW-NEXT: vpshufb %xmm9, %xmm4, %xmm9
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX512BW-NEXT: vpshufb %xmm10, %xmm2, %xmm11
-; AVX512BW-NEXT: vpshufb %xmm10, %xmm3, %xmm10
+; AVX512BW-NEXT: vpshufb %xmm10, %xmm0, %xmm11
+; AVX512BW-NEXT: vpshufb %xmm10, %xmm1, %xmm10
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3]
-; AVX512BW-NEXT: vpsrlq $32, %zmm5, %zmm10
+; AVX512BW-NEXT: vpsrlq $32, %zmm2, %zmm10
; AVX512BW-NEXT: vpmovqb %zmm10, %xmm10
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX512BW-NEXT: vpshufb %xmm10, %xmm0, %xmm11
-; AVX512BW-NEXT: vpshufb %xmm10, %xmm1, %xmm10
+; AVX512BW-NEXT: vpshufb %xmm10, %xmm6, %xmm11
+; AVX512BW-NEXT: vpshufb %xmm10, %xmm4, %xmm10
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm11 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX512BW-NEXT: vpshufb %xmm11, %xmm2, %xmm12
-; AVX512BW-NEXT: vpshufb %xmm11, %xmm3, %xmm11
+; AVX512BW-NEXT: vpshufb %xmm11, %xmm0, %xmm12
+; AVX512BW-NEXT: vpshufb %xmm11, %xmm1, %xmm11
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3]
-; AVX512BW-NEXT: vpsrlq $40, %zmm5, %zmm11
+; AVX512BW-NEXT: vpsrlq $40, %zmm2, %zmm11
; AVX512BW-NEXT: vpmovqb %zmm11, %xmm11
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3]
; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm11 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX512BW-NEXT: vpshufb %xmm11, %xmm0, %xmm12
-; AVX512BW-NEXT: vpshufb %xmm11, %xmm1, %xmm11
+; AVX512BW-NEXT: vpshufb %xmm11, %xmm6, %xmm12
+; AVX512BW-NEXT: vpshufb %xmm11, %xmm4, %xmm11
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm12 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX512BW-NEXT: vpshufb %xmm12, %xmm2, %xmm13
-; AVX512BW-NEXT: vpshufb %xmm12, %xmm3, %xmm12
+; AVX512BW-NEXT: vpshufb %xmm12, %xmm0, %xmm13
+; AVX512BW-NEXT: vpshufb %xmm12, %xmm1, %xmm12
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3]
-; AVX512BW-NEXT: vpsrlq $48, %zmm5, %zmm12
+; AVX512BW-NEXT: vpsrlq $48, %zmm2, %zmm12
; AVX512BW-NEXT: vpmovqb %zmm12, %xmm12
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3]
; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm12 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX512BW-NEXT: vpshufb %xmm12, %xmm0, %xmm0
-; AVX512BW-NEXT: vpshufb %xmm12, %xmm1, %xmm1
+; AVX512BW-NEXT: vpshufb %xmm12, %xmm6, %xmm6
+; AVX512BW-NEXT: vpshufb %xmm12, %xmm4, %xmm4
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm6 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX512BW-NEXT: vpshufb %xmm6, %xmm0, %xmm0
+; AVX512BW-NEXT: vpshufb %xmm6, %xmm1, %xmm1
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX512BW-NEXT: vpshufb %xmm1, %xmm2, %xmm2
-; AVX512BW-NEXT: vpshufb %xmm1, %xmm3, %xmm1
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX512BW-NEXT: vpsrlq $56, %zmm5, %zmm1
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3]
+; AVX512BW-NEXT: vpsrlq $56, %zmm2, %zmm1
; AVX512BW-NEXT: vpmovqb %zmm1, %xmm1
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512BW-NEXT: vmovdqa %xmm4, (%rsi)
-; AVX512BW-NEXT: vmovdqa %xmm6, (%rdx)
+; AVX512BW-NEXT: vmovdqa %xmm3, (%rsi)
+; AVX512BW-NEXT: vmovdqa %xmm5, (%rdx)
; AVX512BW-NEXT: vmovdqa %xmm7, (%rcx)
; AVX512BW-NEXT: vmovdqa %xmm8, (%r8)
; AVX512BW-NEXT: vmovdqa %xmm9, (%r9)
@@ -3232,67 +3236,69 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX512BW-FCP-NEXT: vmovdqa 96(%rdi), %ymm4
+; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm4
; AVX512BW-FCP-NEXT: vpmovqd %ymm4, %xmm3
; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX512BW-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm0
-; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm6
-; AVX512BW-FCP-NEXT: vpmovqd %ymm6, %xmm7
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm1
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm9
-; AVX512BW-FCP-NEXT: vpmovqb %zmm9, %xmm1
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [8,10,12,14,12,14,14,15]
+; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm6
+; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm7
+; AVX512BW-FCP-NEXT: vpermd %zmm7, %zmm1, %zmm8
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512BW-FCP-NEXT: vpshufb %xmm9, %xmm8, %xmm1
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
+; AVX512BW-FCP-NEXT: vpmovqb %zmm6, %xmm1
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
; AVX512BW-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm1
; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm11 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm7, %xmm2
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
-; AVX512BW-FCP-NEXT: vpsrlq $8, %zmm9, %zmm2
+; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm2
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3]
+; AVX512BW-FCP-NEXT: vpsrlq $8, %zmm6, %zmm2
; AVX512BW-FCP-NEXT: vpmovqb %zmm2, %xmm2
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm12 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
; AVX512BW-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm2
; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm7, %xmm14
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3]
-; AVX512BW-FCP-NEXT: vpsrlq $16, %zmm9, %zmm14
+; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm8, %xmm14
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],xmm14[3]
+; AVX512BW-FCP-NEXT: vpsrlq $16, %zmm6, %zmm14
; AVX512BW-FCP-NEXT: vpmovqb %zmm14, %xmm14
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3]
; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX512BW-FCP-NEXT: vpshufb %xmm14, %xmm3, %xmm3
; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm15 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512BW-FCP-NEXT: vpshufb %xmm15, %xmm7, %xmm7
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3]
-; AVX512BW-FCP-NEXT: vpsrlq $24, %zmm9, %zmm7
-; AVX512BW-FCP-NEXT: vpmovqb %zmm7, %xmm7
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3]
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,3,5,7,5,7,6,7]
-; AVX512BW-FCP-NEXT: vpermd %ymm4, %ymm7, %ymm4
+; AVX512BW-FCP-NEXT: vpshufb %xmm15, %xmm8, %xmm8
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm8[3]
+; AVX512BW-FCP-NEXT: vpsrlq $24, %zmm6, %zmm8
+; AVX512BW-FCP-NEXT: vpmovqb %zmm8, %xmm8
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3]
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [9,11,13,15,13,15,14,15]
+; AVX512BW-FCP-NEXT: vpermd %zmm7, %zmm8, %zmm7
+; AVX512BW-FCP-NEXT: vpshufb %xmm9, %xmm7, %xmm8
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [1,3,5,7,0,0,0,0]
+; AVX512BW-FCP-NEXT: vpermd %ymm4, %ymm9, %ymm4
; AVX512BW-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm5
-; AVX512BW-FCP-NEXT: vpermd %ymm6, %ymm7, %ymm6
-; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm6, %xmm7
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3]
-; AVX512BW-FCP-NEXT: vpsrlq $32, %zmm9, %zmm7
-; AVX512BW-FCP-NEXT: vpmovqb %zmm7, %xmm7
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3]
-; AVX512BW-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm7
-; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm6, %xmm8
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3]
-; AVX512BW-FCP-NEXT: vpsrlq $40, %zmm9, %zmm8
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm8[3]
+; AVX512BW-FCP-NEXT: vpsrlq $32, %zmm6, %zmm8
; AVX512BW-FCP-NEXT: vpmovqb %zmm8, %xmm8
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
-; AVX512BW-FCP-NEXT: vpshufb %xmm12, %xmm4, %xmm8
-; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm6, %xmm10
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1,2],xmm8[3]
-; AVX512BW-FCP-NEXT: vpsrlq $48, %zmm9, %zmm10
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm8[0,1],xmm5[2,3]
+; AVX512BW-FCP-NEXT: vpshufb %xmm11, %xmm7, %xmm8
+; AVX512BW-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm9
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
+; AVX512BW-FCP-NEXT: vpsrlq $40, %zmm6, %zmm9
+; AVX512BW-FCP-NEXT: vpmovqb %zmm9, %xmm9
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
+; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm7, %xmm9
+; AVX512BW-FCP-NEXT: vpshufb %xmm12, %xmm4, %xmm10
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3]
+; AVX512BW-FCP-NEXT: vpsrlq $48, %zmm6, %zmm10
; AVX512BW-FCP-NEXT: vpmovqb %zmm10, %xmm10
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
+; AVX512BW-FCP-NEXT: vpshufb %xmm15, %xmm7, %xmm7
; AVX512BW-FCP-NEXT: vpshufb %xmm14, %xmm4, %xmm4
-; AVX512BW-FCP-NEXT: vpshufb %xmm15, %xmm6, %xmm6
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3]
-; AVX512BW-FCP-NEXT: vpsrlq $56, %zmm9, %zmm6
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[3]
+; AVX512BW-FCP-NEXT: vpsrlq $56, %zmm6, %zmm6
; AVX512BW-FCP-NEXT: vpmovqb %zmm6, %xmm6
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
; AVX512BW-FCP-NEXT: vmovdqa %xmm0, (%rsi)
@@ -3300,8 +3306,8 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FCP-NEXT: vmovdqa %xmm2, (%rcx)
; AVX512BW-FCP-NEXT: vmovdqa %xmm3, (%r8)
; AVX512BW-FCP-NEXT: vmovdqa %xmm5, (%r9)
-; AVX512BW-FCP-NEXT: vmovdqa %xmm7, (%r11)
-; AVX512BW-FCP-NEXT: vmovdqa %xmm8, (%r10)
+; AVX512BW-FCP-NEXT: vmovdqa %xmm8, (%r11)
+; AVX512BW-FCP-NEXT: vmovdqa %xmm9, (%r10)
; AVX512BW-FCP-NEXT: vmovdqa %xmm4, (%rax)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
@@ -3312,107 +3318,107 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
-; AVX512DQ-BW-NEXT: vmovdqa 112(%rdi), %xmm0
+; AVX512DQ-BW-NEXT: vmovdqa 80(%rdi), %xmm0
; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm0, %xmm3
-; AVX512DQ-BW-NEXT: vmovdqa 96(%rdi), %xmm1
+; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %xmm1
; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm1, %xmm2
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
-; AVX512DQ-BW-NEXT: vmovdqa 80(%rdi), %xmm2
-; AVX512DQ-BW-NEXT: vpshufb %xmm5, %xmm2, %xmm6
-; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %xmm3
-; AVX512DQ-BW-NEXT: vpshufb %xmm5, %xmm3, %xmm5
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3]
-; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm5
-; AVX512DQ-BW-NEXT: vpmovqb %zmm5, %xmm6
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
-; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm6 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX512DQ-BW-NEXT: vpshufb %xmm6, %xmm0, %xmm7
-; AVX512DQ-BW-NEXT: vpshufb %xmm6, %xmm1, %xmm6
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm2
+; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm4
+; AVX512DQ-BW-NEXT: vpmovqb %zmm4, %xmm5
+; AVX512DQ-BW-NEXT: vmovdqa 96(%rdi), %xmm4
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm6 = xmm4[u,u,u,u,u,u,0,8,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3]
+; AVX512DQ-BW-NEXT: vpmovqb %zmm2, %xmm5
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
+; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX512DQ-BW-NEXT: vmovdqa 112(%rdi), %xmm6
+; AVX512DQ-BW-NEXT: vpshufb %xmm5, %xmm6, %xmm7
+; AVX512DQ-BW-NEXT: vpshufb %xmm5, %xmm4, %xmm5
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm7 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX512DQ-BW-NEXT: vpshufb %xmm7, %xmm2, %xmm8
-; AVX512DQ-BW-NEXT: vpshufb %xmm7, %xmm3, %xmm7
+; AVX512DQ-BW-NEXT: vpshufb %xmm7, %xmm0, %xmm8
+; AVX512DQ-BW-NEXT: vpshufb %xmm7, %xmm1, %xmm7
; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3]
-; AVX512DQ-BW-NEXT: vpsrlq $8, %zmm5, %zmm7
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3]
+; AVX512DQ-BW-NEXT: vpsrlq $8, %zmm2, %zmm7
; AVX512DQ-BW-NEXT: vpmovqb %zmm7, %xmm7
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3]
; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm7 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX512DQ-BW-NEXT: vpshufb %xmm7, %xmm0, %xmm8
-; AVX512DQ-BW-NEXT: vpshufb %xmm7, %xmm1, %xmm7
+; AVX512DQ-BW-NEXT: vpshufb %xmm7, %xmm6, %xmm8
+; AVX512DQ-BW-NEXT: vpshufb %xmm7, %xmm4, %xmm7
; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm2, %xmm9
-; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm3, %xmm8
+; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm0, %xmm9
+; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm1, %xmm8
; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3]
-; AVX512DQ-BW-NEXT: vpsrlq $16, %zmm5, %zmm8
+; AVX512DQ-BW-NEXT: vpsrlq $16, %zmm2, %zmm8
; AVX512DQ-BW-NEXT: vpmovqb %zmm8, %xmm8
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm0, %xmm9
-; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm1, %xmm8
+; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm6, %xmm9
+; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm4, %xmm8
; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm9 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX512DQ-BW-NEXT: vpshufb %xmm9, %xmm2, %xmm10
-; AVX512DQ-BW-NEXT: vpshufb %xmm9, %xmm3, %xmm9
+; AVX512DQ-BW-NEXT: vpshufb %xmm9, %xmm0, %xmm10
+; AVX512DQ-BW-NEXT: vpshufb %xmm9, %xmm1, %xmm9
; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
-; AVX512DQ-BW-NEXT: vpsrlq $24, %zmm5, %zmm9
+; AVX512DQ-BW-NEXT: vpsrlq $24, %zmm2, %zmm9
; AVX512DQ-BW-NEXT: vpmovqb %zmm9, %xmm9
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm9 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX512DQ-BW-NEXT: vpshufb %xmm9, %xmm0, %xmm10
-; AVX512DQ-BW-NEXT: vpshufb %xmm9, %xmm1, %xmm9
+; AVX512DQ-BW-NEXT: vpshufb %xmm9, %xmm6, %xmm10
+; AVX512DQ-BW-NEXT: vpshufb %xmm9, %xmm4, %xmm9
; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX512DQ-BW-NEXT: vpshufb %xmm10, %xmm2, %xmm11
-; AVX512DQ-BW-NEXT: vpshufb %xmm10, %xmm3, %xmm10
+; AVX512DQ-BW-NEXT: vpshufb %xmm10, %xmm0, %xmm11
+; AVX512DQ-BW-NEXT: vpshufb %xmm10, %xmm1, %xmm10
; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3]
-; AVX512DQ-BW-NEXT: vpsrlq $32, %zmm5, %zmm10
+; AVX512DQ-BW-NEXT: vpsrlq $32, %zmm2, %zmm10
; AVX512DQ-BW-NEXT: vpmovqb %zmm10, %xmm10
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX512DQ-BW-NEXT: vpshufb %xmm10, %xmm0, %xmm11
-; AVX512DQ-BW-NEXT: vpshufb %xmm10, %xmm1, %xmm10
+; AVX512DQ-BW-NEXT: vpshufb %xmm10, %xmm6, %xmm11
+; AVX512DQ-BW-NEXT: vpshufb %xmm10, %xmm4, %xmm10
; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm11 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX512DQ-BW-NEXT: vpshufb %xmm11, %xmm2, %xmm12
-; AVX512DQ-BW-NEXT: vpshufb %xmm11, %xmm3, %xmm11
+; AVX512DQ-BW-NEXT: vpshufb %xmm11, %xmm0, %xmm12
+; AVX512DQ-BW-NEXT: vpshufb %xmm11, %xmm1, %xmm11
; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3]
-; AVX512DQ-BW-NEXT: vpsrlq $40, %zmm5, %zmm11
+; AVX512DQ-BW-NEXT: vpsrlq $40, %zmm2, %zmm11
; AVX512DQ-BW-NEXT: vpmovqb %zmm11, %xmm11
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3]
; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm11 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX512DQ-BW-NEXT: vpshufb %xmm11, %xmm0, %xmm12
-; AVX512DQ-BW-NEXT: vpshufb %xmm11, %xmm1, %xmm11
+; AVX512DQ-BW-NEXT: vpshufb %xmm11, %xmm6, %xmm12
+; AVX512DQ-BW-NEXT: vpshufb %xmm11, %xmm4, %xmm11
; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm12 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm2, %xmm13
-; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm3, %xmm12
+; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm0, %xmm13
+; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm1, %xmm12
; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3]
-; AVX512DQ-BW-NEXT: vpsrlq $48, %zmm5, %zmm12
+; AVX512DQ-BW-NEXT: vpsrlq $48, %zmm2, %zmm12
; AVX512DQ-BW-NEXT: vpmovqb %zmm12, %xmm12
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3]
; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm12 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm0, %xmm0
-; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm1, %xmm1
+; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm6, %xmm6
+; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm4, %xmm4
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
+; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm6 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX512DQ-BW-NEXT: vpshufb %xmm6, %xmm0, %xmm0
+; AVX512DQ-BW-NEXT: vpshufb %xmm6, %xmm1, %xmm1
; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX512DQ-BW-NEXT: vpshufb %xmm1, %xmm2, %xmm2
-; AVX512DQ-BW-NEXT: vpshufb %xmm1, %xmm3, %xmm1
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX512DQ-BW-NEXT: vpsrlq $56, %zmm5, %zmm1
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3]
+; AVX512DQ-BW-NEXT: vpsrlq $56, %zmm2, %zmm1
; AVX512DQ-BW-NEXT: vpmovqb %zmm1, %xmm1
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512DQ-BW-NEXT: vmovdqa %xmm4, (%rsi)
-; AVX512DQ-BW-NEXT: vmovdqa %xmm6, (%rdx)
+; AVX512DQ-BW-NEXT: vmovdqa %xmm3, (%rsi)
+; AVX512DQ-BW-NEXT: vmovdqa %xmm5, (%rdx)
; AVX512DQ-BW-NEXT: vmovdqa %xmm7, (%rcx)
; AVX512DQ-BW-NEXT: vmovdqa %xmm8, (%r8)
; AVX512DQ-BW-NEXT: vmovdqa %xmm9, (%r9)
@@ -3427,67 +3433,69 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 96(%rdi), %ymm4
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm4
; AVX512DQ-BW-FCP-NEXT: vpmovqd %ymm4, %xmm3
; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm6
-; AVX512DQ-BW-FCP-NEXT: vpmovqd %ymm6, %xmm7
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm1
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm9
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm9, %xmm1
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [8,10,12,14,12,14,14,15]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm6
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm7
+; AVX512DQ-BW-FCP-NEXT: vpermd %zmm7, %zmm1, %zmm8
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm9, %xmm8, %xmm1
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm6, %xmm1
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm1
; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm11 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm7, %xmm2
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $8, %zmm9, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm2
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3]
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $8, %zmm6, %zmm2
; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm2, %xmm2
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm12 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm2
; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm7, %xmm14
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3]
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $16, %zmm9, %zmm14
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm8, %xmm14
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],xmm14[3]
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $16, %zmm6, %zmm14
; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm14, %xmm14
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3]
; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm14, %xmm3, %xmm3
; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm15 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm15, %xmm7, %xmm7
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3]
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $24, %zmm9, %zmm7
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm7, %xmm7
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3]
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,3,5,7,5,7,6,7]
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm4, %ymm7, %ymm4
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm15, %xmm8, %xmm8
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm8[3]
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $24, %zmm6, %zmm8
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm8, %xmm8
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3]
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [9,11,13,15,13,15,14,15]
+; AVX512DQ-BW-FCP-NEXT: vpermd %zmm7, %zmm8, %zmm7
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm9, %xmm7, %xmm8
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [1,3,5,7,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm4, %ymm9, %ymm4
; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm5
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm6, %ymm7, %ymm6
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm6, %xmm7
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3]
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $32, %zmm9, %zmm7
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm7, %xmm7
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm7
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm6, %xmm8
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3]
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $40, %zmm9, %zmm8
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm8[3]
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $32, %zmm6, %zmm8
; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm8, %xmm8
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm12, %xmm4, %xmm8
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm6, %xmm10
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1,2],xmm8[3]
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %zmm9, %zmm10
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm8[0,1],xmm5[2,3]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm11, %xmm7, %xmm8
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm9
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $40, %zmm6, %zmm9
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm9, %xmm9
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm7, %xmm9
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm12, %xmm4, %xmm10
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3]
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %zmm6, %zmm10
; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm10, %xmm10
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm15, %xmm7, %xmm7
; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm14, %xmm4, %xmm4
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm15, %xmm6, %xmm6
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3]
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $56, %zmm9, %zmm6
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[3]
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $56, %zmm6, %zmm6
; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm6, %xmm6
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, (%rsi)
@@ -3495,8 +3503,8 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, (%rcx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, (%r8)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm5, (%r9)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm7, (%r11)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm8, (%r10)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm8, (%r11)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm9, (%r10)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm4, (%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
@@ -5975,215 +5983,223 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-LABEL: load_i8_stride8_vf32:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm16
-; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm0
+; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm0
+; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm3
+; AVX512-NEXT: vmovdqa 208(%rdi), %xmm2
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
+; AVX512-NEXT: vpshufb %xmm5, %xmm2, %xmm4
+; AVX512-NEXT: vmovdqa 192(%rdi), %xmm14
+; AVX512-NEXT: vpshufb %xmm5, %xmm14, %xmm6
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
+; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512-NEXT: vmovdqa 224(%rdi), %xmm11
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm8 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
+; AVX512-NEXT: vpshufb %xmm8, %xmm11, %xmm6
+; AVX512-NEXT: vpmovqb %zmm3, %xmm3
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
+; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
+; AVX512-NEXT: vmovdqa 128(%rdi), %ymm4
+; AVX512-NEXT: vpmovqb %ymm4, %xmm4
+; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512-NEXT: vmovdqa 160(%rdi), %ymm6
+; AVX512-NEXT: vpmovqb %ymm6, %xmm9
+; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,0,1]
+; AVX512-NEXT: vpmovqb %ymm6, %xmm6
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3]
+; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5],ymm4[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3,4,5],ymm3[6,7]
+; AVX512-NEXT: vmovdqa 80(%rdi), %xmm1
+; AVX512-NEXT: vpshufb %xmm5, %xmm1, %xmm6
+; AVX512-NEXT: vmovdqa64 %xmm1, %xmm18
+; AVX512-NEXT: vmovdqa 64(%rdi), %xmm1
+; AVX512-NEXT: vpshufb %xmm5, %xmm1, %xmm5
+; AVX512-NEXT: vmovdqa64 %xmm1, %xmm19
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX512-NEXT: vmovdqa 96(%rdi), %xmm7
+; AVX512-NEXT: vpshufb %xmm8, %xmm7, %xmm8
; AVX512-NEXT: vpmovqb %zmm0, %xmm0
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3]
+; AVX512-NEXT: vpmovqb %zmm16, %xmm5
+; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7]
+; AVX512-NEXT: vmovdqa64 %ymm0, %ymm17
+; AVX512-NEXT: vmovdqa 128(%rdi), %xmm9
+; AVX512-NEXT: vmovdqa 160(%rdi), %xmm0
+; AVX512-NEXT: vmovdqa 176(%rdi), %xmm10
; AVX512-NEXT: vmovdqa 240(%rdi), %xmm12
-; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
-; AVX512-NEXT: vpshufb %xmm3, %xmm12, %xmm1
-; AVX512-NEXT: vmovdqa 224(%rdi), %xmm7
-; AVX512-NEXT: vpshufb %xmm3, %xmm7, %xmm2
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
+; AVX512-NEXT: vpshufb %xmm1, %xmm12, %xmm5
+; AVX512-NEXT: vmovdqa %xmm11, %xmm6
+; AVX512-NEXT: vpshufb %xmm1, %xmm11, %xmm11
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm11[0],xmm5[0],xmm11[1],xmm5[1],xmm11[2],xmm5[2],xmm11[3],xmm5[3]
+; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm13 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
+; AVX512-NEXT: vpshufb %xmm13, %xmm2, %xmm11
+; AVX512-NEXT: vmovdqa %xmm14, %xmm4
+; AVX512-NEXT: vpshufb %xmm13, %xmm14, %xmm14
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3]
+; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4,5,6],ymm5[7]
+; AVX512-NEXT: vmovd {{.*#+}} xmm11 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512-NEXT: vpshufb %xmm11, %xmm10, %xmm14
+; AVX512-NEXT: vpshufb %xmm11, %xmm0, %xmm11
+; AVX512-NEXT: vmovdqa64 %xmm0, %xmm22
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3]
+; AVX512-NEXT: vmovdqa 144(%rdi), %xmm11
+; AVX512-NEXT: vmovd {{.*#+}} xmm15 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512-NEXT: vpshufb %xmm15, %xmm11, %xmm0
+; AVX512-NEXT: vpshufb %xmm15, %xmm9, %xmm15
+; AVX512-NEXT: vmovdqa64 %xmm9, %xmm20
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
+; AVX512-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7]
+; AVX512-NEXT: vmovdqa 112(%rdi), %xmm3
+; AVX512-NEXT: vpshufb %xmm1, %xmm3, %xmm5
+; AVX512-NEXT: vmovdqa64 %xmm3, %xmm24
+; AVX512-NEXT: vpshufb %xmm1, %xmm7, %xmm1
+; AVX512-NEXT: vmovdqa %xmm7, %xmm8
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; AVX512-NEXT: vmovdqa64 %xmm18, %xmm9
+; AVX512-NEXT: vpshufb %xmm13, %xmm9, %xmm5
+; AVX512-NEXT: vmovdqa64 %xmm19, %xmm3
+; AVX512-NEXT: vpshufb %xmm13, %xmm3, %xmm13
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1,2],xmm1[3]
+; AVX512-NEXT: vpsrlq $8, %zmm16, %zmm5
+; AVX512-NEXT: vpmovqb %zmm5, %xmm5
+; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-NEXT: vmovdqa64 %ymm0, %ymm18
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm7 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
+; AVX512-NEXT: vpshufb %xmm7, %xmm12, %xmm1
+; AVX512-NEXT: vpshufb %xmm7, %xmm6, %xmm5
+; AVX512-NEXT: vmovdqa64 %xmm6, %xmm21
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm5 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
+; AVX512-NEXT: vpshufb %xmm5, %xmm2, %xmm13
+; AVX512-NEXT: vmovdqa %xmm2, %xmm6
+; AVX512-NEXT: vpshufb %xmm5, %xmm4, %xmm15
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3]
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-NEXT: vmovdqa 208(%rdi), %xmm9
-; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
-; AVX512-NEXT: vpshufb %xmm4, %xmm9, %xmm2
-; AVX512-NEXT: vmovdqa 192(%rdi), %xmm8
-; AVX512-NEXT: vpshufb %xmm4, %xmm8, %xmm6
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
-; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX512-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5,6],ymm1[7]
+; AVX512-NEXT: vmovd {{.*#+}} xmm13 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512-NEXT: vpshufb %xmm13, %xmm10, %xmm15
+; AVX512-NEXT: vmovdqa64 %xmm22, %xmm0
+; AVX512-NEXT: vpshufb %xmm13, %xmm0, %xmm13
+; AVX512-NEXT: vmovdqa64 %xmm22, %xmm23
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3]
+; AVX512-NEXT: vmovd {{.*#+}} xmm15 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512-NEXT: vpshufb %xmm15, %xmm11, %xmm0
+; AVX512-NEXT: vmovdqa64 %xmm20, %xmm14
+; AVX512-NEXT: vpshufb %xmm15, %xmm14, %xmm15
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
+; AVX512-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT: vmovdqa 128(%rdi), %ymm2
-; AVX512-NEXT: vpmovqb %ymm2, %xmm2
-; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5],ymm2[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5],ymm0[6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-NEXT: vmovdqa 112(%rdi), %xmm5
-; AVX512-NEXT: vpshufb %xmm3, %xmm5, %xmm6
-; AVX512-NEXT: vmovdqa 96(%rdi), %xmm1
-; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm3
-; AVX512-NEXT: vmovdqa64 %xmm1, %xmm18
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3]
-; AVX512-NEXT: vmovdqa 80(%rdi), %xmm1
-; AVX512-NEXT: vpshufb %xmm4, %xmm1, %xmm11
-; AVX512-NEXT: vmovdqa64 %xmm1, %xmm21
-; AVX512-NEXT: vmovdqa 64(%rdi), %xmm3
-; AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm10[3]
-; AVX512-NEXT: vpmovqb %zmm16, %xmm10
-; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm10[0,1],xmm4[2,3]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT: vmovdqa64 %ymm0, %ymm17
-; AVX512-NEXT: vmovdqa 160(%rdi), %xmm10
-; AVX512-NEXT: vmovdqa 176(%rdi), %xmm6
-; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
-; AVX512-NEXT: vpshufb %xmm2, %xmm12, %xmm4
-; AVX512-NEXT: vmovdqa64 %xmm12, %xmm19
-; AVX512-NEXT: vpshufb %xmm2, %xmm7, %xmm12
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3]
-; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm14 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
-; AVX512-NEXT: vpshufb %xmm14, %xmm9, %xmm12
-; AVX512-NEXT: vmovdqa64 %xmm9, %xmm22
-; AVX512-NEXT: vpshufb %xmm14, %xmm8, %xmm13
-; AVX512-NEXT: vmovdqa64 %xmm8, %xmm20
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
-; AVX512-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3,4,5,6],ymm4[7]
-; AVX512-NEXT: vmovd {{.*#+}} xmm12 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-NEXT: vpshufb %xmm12, %xmm6, %xmm13
-; AVX512-NEXT: vpshufb %xmm12, %xmm10, %xmm12
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
-; AVX512-NEXT: vmovdqa 128(%rdi), %xmm8
-; AVX512-NEXT: vmovdqa 144(%rdi), %xmm13
-; AVX512-NEXT: vmovd {{.*#+}} xmm1 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-NEXT: vpshufb %xmm1, %xmm13, %xmm0
-; AVX512-NEXT: vpshufb %xmm1, %xmm8, %xmm1
-; AVX512-NEXT: vmovdqa64 %xmm8, %xmm26
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm1
-; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
-; AVX512-NEXT: vpshufb %xmm2, %xmm5, %xmm1
-; AVX512-NEXT: vmovdqa64 %xmm5, %xmm24
-; AVX512-NEXT: vmovdqa64 %xmm18, %xmm5
-; AVX512-NEXT: vpshufb %xmm2, %xmm5, %xmm2
+; AVX512-NEXT: vmovdqa64 %xmm24, %xmm1
+; AVX512-NEXT: vpshufb %xmm7, %xmm1, %xmm1
+; AVX512-NEXT: vpshufb %xmm7, %xmm8, %xmm2
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512-NEXT: vmovdqa64 %xmm21, %xmm4
-; AVX512-NEXT: vpshufb %xmm14, %xmm4, %xmm2
-; AVX512-NEXT: vpshufb %xmm14, %xmm3, %xmm4
-; AVX512-NEXT: vmovdqa %xmm3, %xmm11
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
+; AVX512-NEXT: vpshufb %xmm5, %xmm9, %xmm2
+; AVX512-NEXT: vmovdqa64 %xmm9, %xmm22
+; AVX512-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX512-NEXT: vmovdqa64 %xmm19, %xmm20
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
-; AVX512-NEXT: vpsrlq $8, %zmm16, %zmm2
+; AVX512-NEXT: vpsrlq $16, %zmm16, %zmm2
; AVX512-NEXT: vpmovqb %zmm2, %xmm2
; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-NEXT: vmovdqa64 %ymm0, %ymm18
-; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
-; AVX512-NEXT: vmovdqa64 %xmm19, %xmm3
-; AVX512-NEXT: vpshufb %xmm0, %xmm3, %xmm1
-; AVX512-NEXT: vpshufb %xmm0, %xmm7, %xmm2
-; AVX512-NEXT: vmovdqa %xmm7, %xmm9
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
-; AVX512-NEXT: vmovdqa64 %xmm22, %xmm8
-; AVX512-NEXT: vpshufb %xmm2, %xmm8, %xmm4
-; AVX512-NEXT: vmovdqa64 %xmm20, %xmm7
-; AVX512-NEXT: vpshufb %xmm2, %xmm7, %xmm14
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3]
-; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
-; AVX512-NEXT: vmovd {{.*#+}} xmm4 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-NEXT: vpshufb %xmm4, %xmm6, %xmm14
-; AVX512-NEXT: vmovdqa %xmm6, %xmm12
-; AVX512-NEXT: vpshufb %xmm4, %xmm10, %xmm4
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3]
-; AVX512-NEXT: vmovd {{.*#+}} xmm14 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-NEXT: vpshufb %xmm14, %xmm13, %xmm15
-; AVX512-NEXT: vmovdqa64 %xmm26, %xmm6
-; AVX512-NEXT: vpshufb %xmm14, %xmm6, %xmm14
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
-; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4],ymm4[5],ymm14[6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-NEXT: vmovdqa64 %xmm24, %xmm4
-; AVX512-NEXT: vpshufb %xmm0, %xmm4, %xmm4
-; AVX512-NEXT: vpshufb %xmm0, %xmm5, %xmm0
-; AVX512-NEXT: vmovdqa64 %xmm5, %xmm20
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX512-NEXT: vmovdqa64 %xmm21, %xmm6
-; AVX512-NEXT: vpshufb %xmm2, %xmm6, %xmm4
-; AVX512-NEXT: vpshufb %xmm2, %xmm11, %xmm2
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
-; AVX512-NEXT: vpsrlq $16, %zmm16, %zmm2
-; AVX512-NEXT: vpmovqb %zmm2, %xmm2
-; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512-NEXT: vmovdqa64 %ymm0, %ymm19
; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
-; AVX512-NEXT: vpshufb %xmm0, %xmm3, %xmm1
-; AVX512-NEXT: vmovdqa64 %xmm3, %xmm25
+; AVX512-NEXT: vpshufb %xmm0, %xmm12, %xmm1
+; AVX512-NEXT: vmovdqa64 %xmm21, %xmm9
; AVX512-NEXT: vpshufb %xmm0, %xmm9, %xmm2
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
-; AVX512-NEXT: vpshufb %xmm2, %xmm8, %xmm4
-; AVX512-NEXT: vpshufb %xmm2, %xmm7, %xmm14
-; AVX512-NEXT: vmovdqa64 %xmm7, %xmm22
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3]
+; AVX512-NEXT: vpshufb %xmm2, %xmm6, %xmm5
+; AVX512-NEXT: vmovdqa64 %xmm6, %xmm28
+; AVX512-NEXT: vpshufb %xmm2, %xmm4, %xmm13
+; AVX512-NEXT: vmovdqa64 %xmm4, %xmm21
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3]
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
-; AVX512-NEXT: vmovd {{.*#+}} xmm4 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-NEXT: vpshufb %xmm4, %xmm12, %xmm14
-; AVX512-NEXT: vmovdqa64 %xmm12, %xmm27
-; AVX512-NEXT: vpshufb %xmm4, %xmm10, %xmm4
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3]
-; AVX512-NEXT: vmovd {{.*#+}} xmm14 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-NEXT: vpshufb %xmm14, %xmm13, %xmm15
-; AVX512-NEXT: vmovdqa64 %xmm26, %xmm12
-; AVX512-NEXT: vpshufb %xmm14, %xmm12, %xmm14
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
-; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4],ymm4[5],ymm14[6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-NEXT: vmovdqa64 %xmm24, %xmm5
-; AVX512-NEXT: vpshufb %xmm0, %xmm5, %xmm4
+; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7]
+; AVX512-NEXT: vmovd {{.*#+}} xmm5 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512-NEXT: vmovdqa %xmm10, %xmm6
+; AVX512-NEXT: vpshufb %xmm5, %xmm10, %xmm13
+; AVX512-NEXT: vmovdqa64 %xmm23, %xmm3
+; AVX512-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3]
+; AVX512-NEXT: vmovd {{.*#+}} xmm13 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512-NEXT: vpshufb %xmm13, %xmm11, %xmm15
+; AVX512-NEXT: vmovdqa %xmm14, %xmm10
+; AVX512-NEXT: vpshufb %xmm13, %xmm14, %xmm13
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3]
+; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4],ymm5[5],ymm13[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7]
+; AVX512-NEXT: vmovdqa64 %xmm24, %xmm14
+; AVX512-NEXT: vpshufb %xmm0, %xmm14, %xmm5
+; AVX512-NEXT: vpshufb %xmm0, %xmm8, %xmm0
+; AVX512-NEXT: vmovdqa64 %xmm8, %xmm29
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
+; AVX512-NEXT: vmovdqa64 %xmm22, %xmm4
+; AVX512-NEXT: vpshufb %xmm2, %xmm4, %xmm5
; AVX512-NEXT: vmovdqa64 %xmm20, %xmm15
-; AVX512-NEXT: vpshufb %xmm0, %xmm15, %xmm0
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX512-NEXT: vpshufb %xmm2, %xmm6, %xmm4
-; AVX512-NEXT: vpshufb %xmm2, %xmm11, %xmm2
-; AVX512-NEXT: vmovdqa %xmm11, %xmm6
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; AVX512-NEXT: vpshufb %xmm2, %xmm15, %xmm2
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
; AVX512-NEXT: vpsrlq $24, %zmm16, %zmm2
; AVX512-NEXT: vpmovqb %zmm2, %xmm2
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512-NEXT: vmovdqa64 %ymm0, %ymm20
-; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
-; AVX512-NEXT: vmovdqa64 %xmm25, %xmm11
-; AVX512-NEXT: vpshufb %xmm3, %xmm11, %xmm1
-; AVX512-NEXT: vpshufb %xmm3, %xmm9, %xmm2
-; AVX512-NEXT: vmovdqa %xmm9, %xmm7
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm8 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
+; AVX512-NEXT: vpshufb %xmm8, %xmm12, %xmm1
+; AVX512-NEXT: vpshufb %xmm8, %xmm9, %xmm2
+; AVX512-NEXT: vmovdqa64 %xmm9, %xmm24
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
-; AVX512-NEXT: vpshufb %xmm2, %xmm8, %xmm4
-; AVX512-NEXT: vmovdqa64 %xmm22, %xmm9
-; AVX512-NEXT: vpshufb %xmm2, %xmm9, %xmm14
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3]
+; AVX512-NEXT: vmovdqa64 %xmm28, %xmm0
+; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm5
+; AVX512-NEXT: vmovdqa64 %xmm21, %xmm9
+; AVX512-NEXT: vpshufb %xmm2, %xmm9, %xmm13
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3]
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
-; AVX512-NEXT: vmovd {{.*#+}} xmm4 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-NEXT: vmovdqa64 %xmm27, %xmm0
-; AVX512-NEXT: vpshufb %xmm4, %xmm0, %xmm14
-; AVX512-NEXT: vpshufb %xmm4, %xmm10, %xmm4
-; AVX512-NEXT: vmovdqa64 %xmm10, %xmm22
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3]
-; AVX512-NEXT: vmovd {{.*#+}} xmm14 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-NEXT: vpshufb %xmm14, %xmm13, %xmm0
-; AVX512-NEXT: vmovdqa64 %xmm13, %xmm25
-; AVX512-NEXT: vpshufb %xmm14, %xmm12, %xmm14
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3]
-; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7]
+; AVX512-NEXT: vmovd {{.*#+}} xmm5 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512-NEXT: vmovdqa %xmm6, %xmm7
+; AVX512-NEXT: vpshufb %xmm5, %xmm6, %xmm13
+; AVX512-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX512-NEXT: vmovdqa64 %xmm23, %xmm25
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3]
+; AVX512-NEXT: vmovd {{.*#+}} xmm13 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512-NEXT: vpshufb %xmm13, %xmm11, %xmm0
+; AVX512-NEXT: vmovdqa %xmm10, %xmm6
+; AVX512-NEXT: vpshufb %xmm13, %xmm10, %xmm13
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3]
+; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-NEXT: vpshufb %xmm3, %xmm5, %xmm1
-; AVX512-NEXT: vpshufb %xmm3, %xmm15, %xmm3
+; AVX512-NEXT: vpshufb %xmm8, %xmm14, %xmm1
+; AVX512-NEXT: vmovdqa64 %xmm14, %xmm26
+; AVX512-NEXT: vmovdqa64 %xmm29, %xmm3
+; AVX512-NEXT: vpshufb %xmm8, %xmm3, %xmm3
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512-NEXT: vmovdqa64 %xmm21, %xmm10
-; AVX512-NEXT: vpshufb %xmm2, %xmm10, %xmm3
-; AVX512-NEXT: vpshufb %xmm2, %xmm6, %xmm2
-; AVX512-NEXT: vmovdqa64 %xmm6, %xmm23
+; AVX512-NEXT: vpshufb %xmm2, %xmm4, %xmm3
+; AVX512-NEXT: vpshufb %xmm2, %xmm15, %xmm2
+; AVX512-NEXT: vmovdqa64 %xmm15, %xmm23
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
; AVX512-NEXT: vpsrlq $32, %zmm16, %zmm2
@@ -6191,43 +6207,46 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512-NEXT: vmovdqa64 %ymm0, %ymm21
-; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
-; AVX512-NEXT: vpshufb %xmm5, %xmm11, %xmm1
-; AVX512-NEXT: vpshufb %xmm5, %xmm7, %xmm2
-; AVX512-NEXT: vmovdqa %xmm7, %xmm13
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
+; AVX512-NEXT: vpshufb %xmm4, %xmm12, %xmm1
+; AVX512-NEXT: vmovdqa64 %xmm12, %xmm27
+; AVX512-NEXT: vmovdqa64 %xmm24, %xmm10
+; AVX512-NEXT: vpshufb %xmm4, %xmm10, %xmm2
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
+; AVX512-NEXT: vmovdqa64 %xmm28, %xmm8
; AVX512-NEXT: vpshufb %xmm2, %xmm8, %xmm3
-; AVX512-NEXT: vmovdqa64 %xmm8, %xmm26
-; AVX512-NEXT: vpshufb %xmm2, %xmm9, %xmm4
-; AVX512-NEXT: vmovdqa %xmm9, %xmm14
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512-NEXT: vpshufb %xmm2, %xmm9, %xmm5
+; AVX512-NEXT: vmovdqa %xmm9, %xmm15
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
; AVX512-NEXT: vmovd {{.*#+}} xmm3 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-NEXT: vmovdqa64 %xmm27, %xmm6
-; AVX512-NEXT: vpshufb %xmm3, %xmm6, %xmm4
-; AVX512-NEXT: vmovdqa64 %xmm22, %xmm7
+; AVX512-NEXT: vpshufb %xmm3, %xmm7, %xmm5
+; AVX512-NEXT: vmovdqa %xmm7, %xmm14
+; AVX512-NEXT: vmovdqa64 %xmm25, %xmm7
; AVX512-NEXT: vpshufb %xmm3, %xmm7, %xmm3
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX512-NEXT: vmovd {{.*#+}} xmm4 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-NEXT: vmovdqa64 %xmm25, %xmm9
-; AVX512-NEXT: vpshufb %xmm4, %xmm9, %xmm0
-; AVX512-NEXT: vpshufb %xmm4, %xmm12, %xmm4
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; AVX512-NEXT: vmovd {{.*#+}} xmm5 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512-NEXT: vpshufb %xmm5, %xmm11, %xmm0
+; AVX512-NEXT: vmovdqa64 %xmm11, %xmm24
+; AVX512-NEXT: vpshufb %xmm5, %xmm6, %xmm5
+; AVX512-NEXT: vmovdqa %xmm6, %xmm13
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-NEXT: vmovdqa64 %xmm24, %xmm8
-; AVX512-NEXT: vpshufb %xmm5, %xmm8, %xmm1
-; AVX512-NEXT: vpshufb %xmm5, %xmm15, %xmm3
+; AVX512-NEXT: vmovdqa64 %xmm26, %xmm11
+; AVX512-NEXT: vpshufb %xmm4, %xmm11, %xmm1
+; AVX512-NEXT: vmovdqa64 %xmm29, %xmm5
+; AVX512-NEXT: vpshufb %xmm4, %xmm5, %xmm3
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512-NEXT: vpshufb %xmm2, %xmm10, %xmm3
-; AVX512-NEXT: vmovdqa64 %xmm10, %xmm24
-; AVX512-NEXT: vmovdqa64 %xmm23, %xmm4
-; AVX512-NEXT: vpshufb %xmm2, %xmm4, %xmm2
+; AVX512-NEXT: vmovdqa64 %xmm22, %xmm6
+; AVX512-NEXT: vpshufb %xmm2, %xmm6, %xmm3
+; AVX512-NEXT: vmovdqa64 %xmm23, %xmm9
+; AVX512-NEXT: vpshufb %xmm2, %xmm9, %xmm2
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
; AVX512-NEXT: vpsrlq $40, %zmm16, %zmm2
@@ -6235,41 +6254,41 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512-NEXT: vmovdqa64 %ymm0, %ymm22
-; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm10 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
-; AVX512-NEXT: vpshufb %xmm10, %xmm11, %xmm1
-; AVX512-NEXT: vmovdqa64 %xmm11, %xmm23
-; AVX512-NEXT: vpshufb %xmm10, %xmm13, %xmm2
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm12 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
+; AVX512-NEXT: vmovdqa64 %xmm27, %xmm0
+; AVX512-NEXT: vpshufb %xmm12, %xmm0, %xmm1
+; AVX512-NEXT: vpshufb %xmm12, %xmm10, %xmm2
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
-; AVX512-NEXT: vmovdqa64 %xmm26, %xmm11
-; AVX512-NEXT: vpshufb %xmm2, %xmm11, %xmm3
-; AVX512-NEXT: vpshufb %xmm2, %xmm14, %xmm5
-; AVX512-NEXT: vmovdqa64 %xmm14, %xmm25
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
+; AVX512-NEXT: vpshufb %xmm2, %xmm8, %xmm3
+; AVX512-NEXT: vpshufb %xmm2, %xmm15, %xmm4
+; AVX512-NEXT: vmovdqa64 %xmm15, %xmm25
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
; AVX512-NEXT: vmovd {{.*#+}} xmm3 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-NEXT: vpshufb %xmm3, %xmm6, %xmm5
+; AVX512-NEXT: vpshufb %xmm3, %xmm14, %xmm4
; AVX512-NEXT: vpshufb %xmm3, %xmm7, %xmm3
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
-; AVX512-NEXT: vmovd {{.*#+}} xmm5 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-NEXT: vpshufb %xmm5, %xmm9, %xmm0
-; AVX512-NEXT: vmovdqa %xmm9, %xmm14
-; AVX512-NEXT: vpshufb %xmm5, %xmm12, %xmm5
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
+; AVX512-NEXT: vmovdqa64 %xmm7, %xmm23
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; AVX512-NEXT: vmovd {{.*#+}} xmm4 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512-NEXT: vmovdqa64 %xmm24, %xmm7
+; AVX512-NEXT: vpshufb %xmm4, %xmm7, %xmm0
+; AVX512-NEXT: vpshufb %xmm4, %xmm13, %xmm4
+; AVX512-NEXT: vmovdqa %xmm13, %xmm15
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-NEXT: vpshufb %xmm10, %xmm8, %xmm1
-; AVX512-NEXT: vmovdqa %xmm8, %xmm9
-; AVX512-NEXT: vpshufb %xmm10, %xmm15, %xmm3
+; AVX512-NEXT: vpshufb %xmm12, %xmm11, %xmm1
+; AVX512-NEXT: vmovdqa64 %xmm26, %xmm13
+; AVX512-NEXT: vpshufb %xmm12, %xmm5, %xmm3
+; AVX512-NEXT: vmovdqa64 %xmm29, %xmm11
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512-NEXT: vmovdqa64 %xmm24, %xmm10
-; AVX512-NEXT: vpshufb %xmm2, %xmm10, %xmm3
-; AVX512-NEXT: vpshufb %xmm2, %xmm4, %xmm2
-; AVX512-NEXT: vmovdqa %xmm4, %xmm8
+; AVX512-NEXT: vpshufb %xmm2, %xmm6, %xmm3
+; AVX512-NEXT: vpshufb %xmm2, %xmm9, %xmm2
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
; AVX512-NEXT: vpsrlq $48, %zmm16, %zmm2
@@ -6277,12 +6296,12 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
-; AVX512-NEXT: vmovdqa64 %xmm23, %xmm2
+; AVX512-NEXT: vmovdqa64 %xmm27, %xmm2
; AVX512-NEXT: vpshufb %xmm1, %xmm2, %xmm2
-; AVX512-NEXT: vpshufb %xmm1, %xmm13, %xmm3
+; AVX512-NEXT: vpshufb %xmm1, %xmm10, %xmm3
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm3 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
-; AVX512-NEXT: vpshufb %xmm3, %xmm11, %xmm4
+; AVX512-NEXT: vpshufb %xmm3, %xmm8, %xmm4
; AVX512-NEXT: vmovdqa64 %xmm25, %xmm5
; AVX512-NEXT: vpshufb %xmm3, %xmm5, %xmm5
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
@@ -6290,22 +6309,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7]
; AVX512-NEXT: vmovd {{.*#+}} xmm4 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-NEXT: vpshufb %xmm4, %xmm6, %xmm5
-; AVX512-NEXT: vpshufb %xmm4, %xmm7, %xmm4
+; AVX512-NEXT: vpshufb %xmm4, %xmm14, %xmm5
+; AVX512-NEXT: vmovdqa64 %xmm23, %xmm8
+; AVX512-NEXT: vpshufb %xmm4, %xmm8, %xmm4
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; AVX512-NEXT: vmovd {{.*#+}} xmm5 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-NEXT: vpshufb %xmm5, %xmm14, %xmm7
-; AVX512-NEXT: vpshufb %xmm5, %xmm12, %xmm5
+; AVX512-NEXT: vpshufb %xmm5, %xmm7, %xmm7
+; AVX512-NEXT: vpshufb %xmm5, %xmm15, %xmm5
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
-; AVX512-NEXT: vpshufb %xmm1, %xmm9, %xmm4
-; AVX512-NEXT: vpshufb %xmm1, %xmm15, %xmm1
+; AVX512-NEXT: vpshufb %xmm1, %xmm13, %xmm4
+; AVX512-NEXT: vpshufb %xmm1, %xmm11, %xmm1
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; AVX512-NEXT: vpshufb %xmm3, %xmm10, %xmm4
-; AVX512-NEXT: vpshufb %xmm3, %xmm8, %xmm3
+; AVX512-NEXT: vpshufb %xmm3, %xmm6, %xmm4
+; AVX512-NEXT: vpshufb %xmm3, %xmm9, %xmm3
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
; AVX512-NEXT: vpsrlq $56, %zmm16, %zmm3
@@ -6328,184 +6348,188 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;
; AVX512-FCP-LABEL: load_i8_stride8_vf32:
; AVX512-FCP: # %bb.0:
-; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm20
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm19
+; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm16
+; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm17
+; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm18
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6]
-; AVX512-FCP-NEXT: vmovdqa64 224(%rdi), %ymm16
-; AVX512-FCP-NEXT: vpermd %ymm16, %ymm0, %ymm1
-; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm3
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm13 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm17
-; AVX512-FCP-NEXT: vpermd %ymm17, %ymm0, %ymm2
-; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm2, %ymm4
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1,2,3,4,5,6],ymm3[7]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512-FCP-NEXT: vmovdqa64 160(%rdi), %ymm18
-; AVX512-FCP-NEXT: vpermd %ymm18, %ymm0, %ymm3
-; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm6
+; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm11
+; AVX512-FCP-NEXT: vpermd %ymm11, %ymm0, %ymm1
+; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm1, %ymm2
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm12 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [8,10,10,11,8,10,12,14]
+; AVX512-FCP-NEXT: vpermd %zmm18, %zmm4, %zmm3
+; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm3, %ymm5
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2,3,4,5,6],ymm5[7]
; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm11
-; AVX512-FCP-NEXT: vpermd %ymm11, %ymm0, %ymm4
-; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm4, %ymm0
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7]
+; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm14
+; AVX512-FCP-NEXT: vpermd %ymm14, %ymm0, %ymm6
+; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm6, %ymm0
+; AVX512-FCP-NEXT: vpermd %zmm17, %zmm4, %zmm2
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7]
-; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm12
-; AVX512-FCP-NEXT: vpmovqd %ymm12, %xmm7
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm7[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12]
-; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm19
-; AVX512-FCP-NEXT: vpmovqd %ymm19, %xmm9
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm9[u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3]
-; AVX512-FCP-NEXT: vpmovqb %zmm20, %xmm6
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm20
+; AVX512-FCP-NEXT: vpmovqd %ymm20, %xmm4
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm4, %xmm7
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [8,10,12,14,12,14,14,15]
+; AVX512-FCP-NEXT: vpermd %zmm16, %zmm5, %zmm5
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1,2],xmm9[3]
+; AVX512-FCP-NEXT: vpmovqb %zmm19, %xmm9
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm9[0,1],xmm7[2,3]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm21
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6],ymm0[7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm7[u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm9[u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3]
-; AVX512-FCP-NEXT: vpsrlq $8, %zmm20, %zmm6
-; AVX512-FCP-NEXT: vpmovqb %zmm6, %xmm6
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm7[7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm9[5],ymm7[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1,2],xmm9[3]
+; AVX512-FCP-NEXT: vpsrlq $8, %zmm19, %zmm9
+; AVX512-FCP-NEXT: vpmovqb %zmm9, %xmm9
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm9[0,1],xmm7[2,3]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm22
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm0
-; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm31
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm5
-; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm30
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6],ymm0[7]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm5
-; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm29
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm6
-; AVX512-FCP-NEXT: vmovdqa64 %ymm8, %ymm28
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm5
-; AVX512-FCP-NEXT: vmovdqa64 %xmm6, %xmm27
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm8 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm9, %xmm6
-; AVX512-FCP-NEXT: vmovdqa64 %xmm8, %xmm26
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3]
-; AVX512-FCP-NEXT: vpsrlq $16, %zmm20, %zmm6
-; AVX512-FCP-NEXT: vpmovqb %zmm6, %xmm6
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm5[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm0
-; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm25
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm1
-; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm24
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm1
-; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm23
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm2
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm0
+; AVX512-FCP-NEXT: vmovdqa64 %ymm7, %ymm31
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm7
+; AVX512-FCP-NEXT: vmovdqa64 %ymm9, %ymm30
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm7[7]
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm6, %ymm7
+; AVX512-FCP-NEXT: vmovdqa64 %ymm9, %ymm29
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm9
+; AVX512-FCP-NEXT: vmovdqa64 %ymm10, %ymm27
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm9[5],ymm7[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7]
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm4, %xmm7
+; AVX512-FCP-NEXT: vmovdqa64 %xmm9, %xmm28
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm9
+; AVX512-FCP-NEXT: vmovdqa64 %xmm10, %xmm25
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1,2],xmm9[3]
+; AVX512-FCP-NEXT: vpsrlq $16, %zmm19, %zmm9
+; AVX512-FCP-NEXT: vpmovqb %zmm9, %xmm9
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm9[0,1],xmm7[2,3]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm0
+; AVX512-FCP-NEXT: vmovdqa64 %ymm7, %ymm26
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm3, %ymm1
+; AVX512-FCP-NEXT: vmovdqa64 %ymm7, %ymm24
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm6, %ymm1
+; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm23
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm2
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm7, %xmm1
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm2
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
-; AVX512-FCP-NEXT: vpsrlq $24, %zmm20, %zmm2
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm4, %xmm1
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm2
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3]
+; AVX512-FCP-NEXT: vpsrlq $24, %zmm19, %zmm2
; AVX512-FCP-NEXT: vpmovqb %zmm2, %xmm2
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7]
-; AVX512-FCP-NEXT: vpermd %ymm16, %ymm0, %ymm4
-; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm1
-; AVX512-FCP-NEXT: vpermd %ymm17, %ymm0, %ymm7
-; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm7, %ymm9
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,6],ymm1[7]
-; AVX512-FCP-NEXT: vpermd %ymm18, %ymm0, %ymm9
-; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm9, %ymm14
-; AVX512-FCP-NEXT: vpermd %ymm11, %ymm0, %ymm11
-; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm11, %ymm0
+; AVX512-FCP-NEXT: vpermd %ymm11, %ymm0, %ymm2
+; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm1
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [9,11,10,11,9,11,13,15]
+; AVX512-FCP-NEXT: vpermd %zmm18, %zmm11, %zmm4
+; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm4, %ymm5
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7]
+; AVX512-FCP-NEXT: vpermd %ymm14, %ymm0, %ymm5
+; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm5, %ymm0
+; AVX512-FCP-NEXT: vpermd %zmm17, %zmm11, %zmm11
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,5,7,5,7,6,7]
-; AVX512-FCP-NEXT: vpermd %ymm12, %ymm0, %ymm14
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm14[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12]
-; AVX512-FCP-NEXT: vpermd %ymm19, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1,2],xmm12[3]
-; AVX512-FCP-NEXT: vpsrlq $32, %zmm20, %zmm15
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [9,11,13,15,13,15,14,15]
+; AVX512-FCP-NEXT: vpermd %zmm16, %zmm0, %zmm14
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm14[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,5,7,0,0,0,0]
+; AVX512-FCP-NEXT: vpermd %ymm20, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm15
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm15[0,1,2],xmm8[3]
+; AVX512-FCP-NEXT: vpsrlq $32, %zmm19, %zmm15
; AVX512-FCP-NEXT: vpmovqb %zmm15, %xmm15
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1],xmm12[2,3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5,6],ymm1[7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm15[0,1],xmm8[2,3]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm1[4,5,6,7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm15[7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5],ymm15[6,7]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm1[6,7]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3]
-; AVX512-FCP-NEXT: vpsrlq $40, %zmm20, %zmm15
+; AVX512-FCP-NEXT: vpsrlq $40, %zmm19, %zmm15
; AVX512-FCP-NEXT: vpmovqb %zmm15, %xmm15
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm31, %ymm10
-; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm13
-; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm10
-; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm7, %ymm15
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm29, %ymm10
-; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm15
-; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm10
-; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm11, %ymm10
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm15[5],ymm10[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm13[6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm13
+; AVX512-FCP-NEXT: vmovdqa64 %ymm31, %ymm12
+; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm13
+; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm12
+; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm4, %ymm15
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm29, %ymm12
+; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm5, %ymm15
+; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm12
+; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm11, %ymm12
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3,4],ymm12[5],ymm15[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm25, %xmm13
; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm14, %xmm13
-; AVX512-FCP-NEXT: vmovdqa64 %xmm26, %xmm15
+; AVX512-FCP-NEXT: vmovdqa64 %xmm28, %xmm15
; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm0, %xmm15
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3]
-; AVX512-FCP-NEXT: vpsrlq $48, %zmm20, %zmm15
+; AVX512-FCP-NEXT: vpsrlq $48, %zmm19, %zmm15
; AVX512-FCP-NEXT: vpmovqb %zmm15, %xmm15
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3],ymm10[4,5,6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm13
-; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm4, %ymm4
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm13
+; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm2, %ymm2
; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm13
-; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm7, %ymm7
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5,6],ymm4[7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm7
-; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm9, %ymm7
-; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm11, %ymm8
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7]
-; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm14, %xmm5
-; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[3]
-; AVX512-FCP-NEXT: vpsrlq $56, %zmm20, %zmm3
-; AVX512-FCP-NEXT: vpmovqb %zmm3, %xmm3
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7]
+; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm4, %ymm4
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm4
+; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm4
+; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm11, %ymm5
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
+; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm14, %xmm4
+; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm0
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3]
+; AVX512-FCP-NEXT: vpsrlq $56, %zmm19, %zmm4
+; AVX512-FCP-NEXT: vpmovqb %zmm4, %xmm4
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX512-FCP-NEXT: vmovdqa64 %ymm21, (%rsi)
; AVX512-FCP-NEXT: vmovdqa64 %ymm22, (%rdx)
-; AVX512-FCP-NEXT: vmovdqa %ymm6, (%rcx)
-; AVX512-FCP-NEXT: vmovdqa %ymm2, (%r8)
-; AVX512-FCP-NEXT: vmovdqa %ymm12, (%r9)
+; AVX512-FCP-NEXT: vmovdqa %ymm10, (%rcx)
+; AVX512-FCP-NEXT: vmovdqa %ymm3, (%r8)
+; AVX512-FCP-NEXT: vmovdqa %ymm8, (%r9)
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT: vmovdqa %ymm1, (%rax)
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT: vmovdqa %ymm10, (%rax)
+; AVX512-FCP-NEXT: vmovdqa %ymm12, (%rax)
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rax)
; AVX512-FCP-NEXT: vzeroupper
@@ -6514,215 +6538,223 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-LABEL: load_i8_stride8_vf32:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm16
-; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm0
+; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm0
+; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm3
+; AVX512DQ-NEXT: vmovdqa 208(%rdi), %xmm2
+; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
+; AVX512DQ-NEXT: vpshufb %xmm5, %xmm2, %xmm4
+; AVX512DQ-NEXT: vmovdqa 192(%rdi), %xmm14
+; AVX512DQ-NEXT: vpshufb %xmm5, %xmm14, %xmm6
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512DQ-NEXT: vmovdqa 224(%rdi), %xmm11
+; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm8 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
+; AVX512DQ-NEXT: vpshufb %xmm8, %xmm11, %xmm6
+; AVX512DQ-NEXT: vpmovqb %zmm3, %xmm3
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
+; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm4
+; AVX512DQ-NEXT: vpmovqb %ymm4, %xmm4
+; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm6
+; AVX512DQ-NEXT: vpmovqb %ymm6, %xmm9
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,0,1]
+; AVX512DQ-NEXT: vpmovqb %ymm6, %xmm6
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5],ymm4[6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3,4,5],ymm3[6,7]
+; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm1
+; AVX512DQ-NEXT: vpshufb %xmm5, %xmm1, %xmm6
+; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm18
+; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm1
+; AVX512DQ-NEXT: vpshufb %xmm5, %xmm1, %xmm5
+; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm19
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm7
+; AVX512DQ-NEXT: vpshufb %xmm8, %xmm7, %xmm8
; AVX512DQ-NEXT: vpmovqb %zmm0, %xmm0
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3]
+; AVX512DQ-NEXT: vpmovqb %zmm16, %xmm5
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm17
+; AVX512DQ-NEXT: vmovdqa 128(%rdi), %xmm9
+; AVX512DQ-NEXT: vmovdqa 160(%rdi), %xmm0
+; AVX512DQ-NEXT: vmovdqa 176(%rdi), %xmm10
; AVX512DQ-NEXT: vmovdqa 240(%rdi), %xmm12
-; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
-; AVX512DQ-NEXT: vpshufb %xmm3, %xmm12, %xmm1
-; AVX512DQ-NEXT: vmovdqa 224(%rdi), %xmm7
-; AVX512DQ-NEXT: vpshufb %xmm3, %xmm7, %xmm2
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm12, %xmm5
+; AVX512DQ-NEXT: vmovdqa %xmm11, %xmm6
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm11, %xmm11
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm11[0],xmm5[0],xmm11[1],xmm5[1],xmm11[2],xmm5[2],xmm11[3],xmm5[3]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm13 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
+; AVX512DQ-NEXT: vpshufb %xmm13, %xmm2, %xmm11
+; AVX512DQ-NEXT: vmovdqa %xmm14, %xmm4
+; AVX512DQ-NEXT: vpshufb %xmm13, %xmm14, %xmm14
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4,5,6],ymm5[7]
+; AVX512DQ-NEXT: vmovd {{.*#+}} xmm11 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512DQ-NEXT: vpshufb %xmm11, %xmm10, %xmm14
+; AVX512DQ-NEXT: vpshufb %xmm11, %xmm0, %xmm11
+; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm22
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3]
+; AVX512DQ-NEXT: vmovdqa 144(%rdi), %xmm11
+; AVX512DQ-NEXT: vmovd {{.*#+}} xmm15 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512DQ-NEXT: vpshufb %xmm15, %xmm11, %xmm0
+; AVX512DQ-NEXT: vpshufb %xmm15, %xmm9, %xmm15
+; AVX512DQ-NEXT: vmovdqa64 %xmm9, %xmm20
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7]
+; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm3
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm3, %xmm5
+; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm24
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm7, %xmm1
+; AVX512DQ-NEXT: vmovdqa %xmm7, %xmm8
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; AVX512DQ-NEXT: vmovdqa64 %xmm18, %xmm9
+; AVX512DQ-NEXT: vpshufb %xmm13, %xmm9, %xmm5
+; AVX512DQ-NEXT: vmovdqa64 %xmm19, %xmm3
+; AVX512DQ-NEXT: vpshufb %xmm13, %xmm3, %xmm13
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1,2],xmm1[3]
+; AVX512DQ-NEXT: vpsrlq $8, %zmm16, %zmm5
+; AVX512DQ-NEXT: vpmovqb %zmm5, %xmm5
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm18
+; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm7 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
+; AVX512DQ-NEXT: vpshufb %xmm7, %xmm12, %xmm1
+; AVX512DQ-NEXT: vpshufb %xmm7, %xmm6, %xmm5
+; AVX512DQ-NEXT: vmovdqa64 %xmm6, %xmm21
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
+; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm5 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
+; AVX512DQ-NEXT: vpshufb %xmm5, %xmm2, %xmm13
+; AVX512DQ-NEXT: vmovdqa %xmm2, %xmm6
+; AVX512DQ-NEXT: vpshufb %xmm5, %xmm4, %xmm15
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3]
; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-NEXT: vmovdqa 208(%rdi), %xmm9
-; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
-; AVX512DQ-NEXT: vpshufb %xmm4, %xmm9, %xmm2
-; AVX512DQ-NEXT: vmovdqa 192(%rdi), %xmm8
-; AVX512DQ-NEXT: vpshufb %xmm4, %xmm8, %xmm6
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-NEXT: vmovd {{.*#+}} xmm13 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512DQ-NEXT: vpshufb %xmm13, %xmm10, %xmm15
+; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm0
+; AVX512DQ-NEXT: vpshufb %xmm13, %xmm0, %xmm13
+; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm23
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3]
+; AVX512DQ-NEXT: vmovd {{.*#+}} xmm15 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512DQ-NEXT: vpshufb %xmm15, %xmm11, %xmm0
+; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm14
+; AVX512DQ-NEXT: vpshufb %xmm15, %xmm14, %xmm15
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm2
-; AVX512DQ-NEXT: vpmovqb %ymm2, %xmm2
-; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5],ymm2[6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5],ymm0[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm5
-; AVX512DQ-NEXT: vpshufb %xmm3, %xmm5, %xmm6
-; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm1
-; AVX512DQ-NEXT: vpshufb %xmm3, %xmm1, %xmm3
-; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm18
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3]
-; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm1
-; AVX512DQ-NEXT: vpshufb %xmm4, %xmm1, %xmm11
-; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm21
-; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm3
-; AVX512DQ-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm10[3]
-; AVX512DQ-NEXT: vpmovqb %zmm16, %xmm10
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm4 = xmm10[0,1],xmm4[2,3]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm17
-; AVX512DQ-NEXT: vmovdqa 160(%rdi), %xmm10
-; AVX512DQ-NEXT: vmovdqa 176(%rdi), %xmm6
-; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
-; AVX512DQ-NEXT: vpshufb %xmm2, %xmm12, %xmm4
-; AVX512DQ-NEXT: vmovdqa64 %xmm12, %xmm19
-; AVX512DQ-NEXT: vpshufb %xmm2, %xmm7, %xmm12
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm14 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
-; AVX512DQ-NEXT: vpshufb %xmm14, %xmm9, %xmm12
-; AVX512DQ-NEXT: vmovdqa64 %xmm9, %xmm22
-; AVX512DQ-NEXT: vpshufb %xmm14, %xmm8, %xmm13
-; AVX512DQ-NEXT: vmovdqa64 %xmm8, %xmm20
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3,4,5,6],ymm4[7]
-; AVX512DQ-NEXT: vmovd {{.*#+}} xmm12 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT: vpshufb %xmm12, %xmm6, %xmm13
-; AVX512DQ-NEXT: vpshufb %xmm12, %xmm10, %xmm12
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
-; AVX512DQ-NEXT: vmovdqa 128(%rdi), %xmm8
-; AVX512DQ-NEXT: vmovdqa 144(%rdi), %xmm13
-; AVX512DQ-NEXT: vmovd {{.*#+}} xmm1 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm13, %xmm0
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm8, %xmm1
-; AVX512DQ-NEXT: vmovdqa64 %xmm8, %xmm26
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm1
-; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
-; AVX512DQ-NEXT: vpshufb %xmm2, %xmm5, %xmm1
-; AVX512DQ-NEXT: vmovdqa64 %xmm5, %xmm24
-; AVX512DQ-NEXT: vmovdqa64 %xmm18, %xmm5
-; AVX512DQ-NEXT: vpshufb %xmm2, %xmm5, %xmm2
+; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm1
+; AVX512DQ-NEXT: vpshufb %xmm7, %xmm1, %xmm1
+; AVX512DQ-NEXT: vpshufb %xmm7, %xmm8, %xmm2
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm4
-; AVX512DQ-NEXT: vpshufb %xmm14, %xmm4, %xmm2
-; AVX512DQ-NEXT: vpshufb %xmm14, %xmm3, %xmm4
-; AVX512DQ-NEXT: vmovdqa %xmm3, %xmm11
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
+; AVX512DQ-NEXT: vpshufb %xmm5, %xmm9, %xmm2
+; AVX512DQ-NEXT: vmovdqa64 %xmm9, %xmm22
+; AVX512DQ-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX512DQ-NEXT: vmovdqa64 %xmm19, %xmm20
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
-; AVX512DQ-NEXT: vpsrlq $8, %zmm16, %zmm2
+; AVX512DQ-NEXT: vpsrlq $16, %zmm16, %zmm2
; AVX512DQ-NEXT: vpmovqb %zmm2, %xmm2
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm18
-; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
-; AVX512DQ-NEXT: vmovdqa64 %xmm19, %xmm3
-; AVX512DQ-NEXT: vpshufb %xmm0, %xmm3, %xmm1
-; AVX512DQ-NEXT: vpshufb %xmm0, %xmm7, %xmm2
-; AVX512DQ-NEXT: vmovdqa %xmm7, %xmm9
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
-; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm8
-; AVX512DQ-NEXT: vpshufb %xmm2, %xmm8, %xmm4
-; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm7
-; AVX512DQ-NEXT: vpshufb %xmm2, %xmm7, %xmm14
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-NEXT: vmovd {{.*#+}} xmm4 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT: vpshufb %xmm4, %xmm6, %xmm14
-; AVX512DQ-NEXT: vmovdqa %xmm6, %xmm12
-; AVX512DQ-NEXT: vpshufb %xmm4, %xmm10, %xmm4
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3]
-; AVX512DQ-NEXT: vmovd {{.*#+}} xmm14 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT: vpshufb %xmm14, %xmm13, %xmm15
-; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm6
-; AVX512DQ-NEXT: vpshufb %xmm14, %xmm6, %xmm14
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4],ymm4[5],ymm14[6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm4
-; AVX512DQ-NEXT: vpshufb %xmm0, %xmm4, %xmm4
-; AVX512DQ-NEXT: vpshufb %xmm0, %xmm5, %xmm0
-; AVX512DQ-NEXT: vmovdqa64 %xmm5, %xmm20
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm6
-; AVX512DQ-NEXT: vpshufb %xmm2, %xmm6, %xmm4
-; AVX512DQ-NEXT: vpshufb %xmm2, %xmm11, %xmm2
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
-; AVX512DQ-NEXT: vpsrlq $16, %zmm16, %zmm2
-; AVX512DQ-NEXT: vpmovqb %zmm2, %xmm2
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm19
; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
-; AVX512DQ-NEXT: vpshufb %xmm0, %xmm3, %xmm1
-; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm25
+; AVX512DQ-NEXT: vpshufb %xmm0, %xmm12, %xmm1
+; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm9
; AVX512DQ-NEXT: vpshufb %xmm0, %xmm9, %xmm2
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
-; AVX512DQ-NEXT: vpshufb %xmm2, %xmm8, %xmm4
-; AVX512DQ-NEXT: vpshufb %xmm2, %xmm7, %xmm14
-; AVX512DQ-NEXT: vmovdqa64 %xmm7, %xmm22
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3]
+; AVX512DQ-NEXT: vpshufb %xmm2, %xmm6, %xmm5
+; AVX512DQ-NEXT: vmovdqa64 %xmm6, %xmm28
+; AVX512DQ-NEXT: vpshufb %xmm2, %xmm4, %xmm13
+; AVX512DQ-NEXT: vmovdqa64 %xmm4, %xmm21
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3]
; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-NEXT: vmovd {{.*#+}} xmm4 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT: vpshufb %xmm4, %xmm12, %xmm14
-; AVX512DQ-NEXT: vmovdqa64 %xmm12, %xmm27
-; AVX512DQ-NEXT: vpshufb %xmm4, %xmm10, %xmm4
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3]
-; AVX512DQ-NEXT: vmovd {{.*#+}} xmm14 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT: vpshufb %xmm14, %xmm13, %xmm15
-; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm12
-; AVX512DQ-NEXT: vpshufb %xmm14, %xmm12, %xmm14
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4],ymm4[5],ymm14[6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm5
-; AVX512DQ-NEXT: vpshufb %xmm0, %xmm5, %xmm4
+; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-NEXT: vmovd {{.*#+}} xmm5 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512DQ-NEXT: vmovdqa %xmm10, %xmm6
+; AVX512DQ-NEXT: vpshufb %xmm5, %xmm10, %xmm13
+; AVX512DQ-NEXT: vmovdqa64 %xmm23, %xmm3
+; AVX512DQ-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3]
+; AVX512DQ-NEXT: vmovd {{.*#+}} xmm13 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512DQ-NEXT: vpshufb %xmm13, %xmm11, %xmm15
+; AVX512DQ-NEXT: vmovdqa %xmm14, %xmm10
+; AVX512DQ-NEXT: vpshufb %xmm13, %xmm14, %xmm13
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512DQ-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4],ymm5[5],ymm13[6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm14
+; AVX512DQ-NEXT: vpshufb %xmm0, %xmm14, %xmm5
+; AVX512DQ-NEXT: vpshufb %xmm0, %xmm8, %xmm0
+; AVX512DQ-NEXT: vmovdqa64 %xmm8, %xmm29
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
+; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm4
+; AVX512DQ-NEXT: vpshufb %xmm2, %xmm4, %xmm5
; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm15
-; AVX512DQ-NEXT: vpshufb %xmm0, %xmm15, %xmm0
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX512DQ-NEXT: vpshufb %xmm2, %xmm6, %xmm4
-; AVX512DQ-NEXT: vpshufb %xmm2, %xmm11, %xmm2
-; AVX512DQ-NEXT: vmovdqa %xmm11, %xmm6
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; AVX512DQ-NEXT: vpshufb %xmm2, %xmm15, %xmm2
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
; AVX512DQ-NEXT: vpsrlq $24, %zmm16, %zmm2
; AVX512DQ-NEXT: vpmovqb %zmm2, %xmm2
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm20
-; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
-; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm11
-; AVX512DQ-NEXT: vpshufb %xmm3, %xmm11, %xmm1
-; AVX512DQ-NEXT: vpshufb %xmm3, %xmm9, %xmm2
-; AVX512DQ-NEXT: vmovdqa %xmm9, %xmm7
+; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm8 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
+; AVX512DQ-NEXT: vpshufb %xmm8, %xmm12, %xmm1
+; AVX512DQ-NEXT: vpshufb %xmm8, %xmm9, %xmm2
+; AVX512DQ-NEXT: vmovdqa64 %xmm9, %xmm24
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm2 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
-; AVX512DQ-NEXT: vpshufb %xmm2, %xmm8, %xmm4
-; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm9
-; AVX512DQ-NEXT: vpshufb %xmm2, %xmm9, %xmm14
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3]
+; AVX512DQ-NEXT: vmovdqa64 %xmm28, %xmm0
+; AVX512DQ-NEXT: vpshufb %xmm2, %xmm0, %xmm5
+; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm9
+; AVX512DQ-NEXT: vpshufb %xmm2, %xmm9, %xmm13
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3]
; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-NEXT: vmovd {{.*#+}} xmm4 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT: vmovdqa64 %xmm27, %xmm0
-; AVX512DQ-NEXT: vpshufb %xmm4, %xmm0, %xmm14
-; AVX512DQ-NEXT: vpshufb %xmm4, %xmm10, %xmm4
-; AVX512DQ-NEXT: vmovdqa64 %xmm10, %xmm22
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3]
-; AVX512DQ-NEXT: vmovd {{.*#+}} xmm14 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT: vpshufb %xmm14, %xmm13, %xmm0
-; AVX512DQ-NEXT: vmovdqa64 %xmm13, %xmm25
-; AVX512DQ-NEXT: vpshufb %xmm14, %xmm12, %xmm14
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-NEXT: vmovd {{.*#+}} xmm5 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512DQ-NEXT: vmovdqa %xmm6, %xmm7
+; AVX512DQ-NEXT: vpshufb %xmm5, %xmm6, %xmm13
+; AVX512DQ-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX512DQ-NEXT: vmovdqa64 %xmm23, %xmm25
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3]
+; AVX512DQ-NEXT: vmovd {{.*#+}} xmm13 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512DQ-NEXT: vpshufb %xmm13, %xmm11, %xmm0
+; AVX512DQ-NEXT: vmovdqa %xmm10, %xmm6
+; AVX512DQ-NEXT: vpshufb %xmm13, %xmm10, %xmm13
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-NEXT: vpshufb %xmm3, %xmm5, %xmm1
-; AVX512DQ-NEXT: vpshufb %xmm3, %xmm15, %xmm3
+; AVX512DQ-NEXT: vpshufb %xmm8, %xmm14, %xmm1
+; AVX512DQ-NEXT: vmovdqa64 %xmm14, %xmm26
+; AVX512DQ-NEXT: vmovdqa64 %xmm29, %xmm3
+; AVX512DQ-NEXT: vpshufb %xmm8, %xmm3, %xmm3
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm10
-; AVX512DQ-NEXT: vpshufb %xmm2, %xmm10, %xmm3
-; AVX512DQ-NEXT: vpshufb %xmm2, %xmm6, %xmm2
-; AVX512DQ-NEXT: vmovdqa64 %xmm6, %xmm23
+; AVX512DQ-NEXT: vpshufb %xmm2, %xmm4, %xmm3
+; AVX512DQ-NEXT: vpshufb %xmm2, %xmm15, %xmm2
+; AVX512DQ-NEXT: vmovdqa64 %xmm15, %xmm23
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
; AVX512DQ-NEXT: vpsrlq $32, %zmm16, %zmm2
@@ -6730,43 +6762,46 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm21
-; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
-; AVX512DQ-NEXT: vpshufb %xmm5, %xmm11, %xmm1
-; AVX512DQ-NEXT: vpshufb %xmm5, %xmm7, %xmm2
-; AVX512DQ-NEXT: vmovdqa %xmm7, %xmm13
+; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
+; AVX512DQ-NEXT: vpshufb %xmm4, %xmm12, %xmm1
+; AVX512DQ-NEXT: vmovdqa64 %xmm12, %xmm27
+; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm10
+; AVX512DQ-NEXT: vpshufb %xmm4, %xmm10, %xmm2
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm2 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
+; AVX512DQ-NEXT: vmovdqa64 %xmm28, %xmm8
; AVX512DQ-NEXT: vpshufb %xmm2, %xmm8, %xmm3
-; AVX512DQ-NEXT: vmovdqa64 %xmm8, %xmm26
-; AVX512DQ-NEXT: vpshufb %xmm2, %xmm9, %xmm4
-; AVX512DQ-NEXT: vmovdqa %xmm9, %xmm14
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512DQ-NEXT: vpshufb %xmm2, %xmm9, %xmm5
+; AVX512DQ-NEXT: vmovdqa %xmm9, %xmm15
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
; AVX512DQ-NEXT: vmovd {{.*#+}} xmm3 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT: vmovdqa64 %xmm27, %xmm6
-; AVX512DQ-NEXT: vpshufb %xmm3, %xmm6, %xmm4
-; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm7
+; AVX512DQ-NEXT: vpshufb %xmm3, %xmm7, %xmm5
+; AVX512DQ-NEXT: vmovdqa %xmm7, %xmm14
+; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm7
; AVX512DQ-NEXT: vpshufb %xmm3, %xmm7, %xmm3
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX512DQ-NEXT: vmovd {{.*#+}} xmm4 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm9
-; AVX512DQ-NEXT: vpshufb %xmm4, %xmm9, %xmm0
-; AVX512DQ-NEXT: vpshufb %xmm4, %xmm12, %xmm4
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; AVX512DQ-NEXT: vmovd {{.*#+}} xmm5 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512DQ-NEXT: vpshufb %xmm5, %xmm11, %xmm0
+; AVX512DQ-NEXT: vmovdqa64 %xmm11, %xmm24
+; AVX512DQ-NEXT: vpshufb %xmm5, %xmm6, %xmm5
+; AVX512DQ-NEXT: vmovdqa %xmm6, %xmm13
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm8
-; AVX512DQ-NEXT: vpshufb %xmm5, %xmm8, %xmm1
-; AVX512DQ-NEXT: vpshufb %xmm5, %xmm15, %xmm3
+; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm11
+; AVX512DQ-NEXT: vpshufb %xmm4, %xmm11, %xmm1
+; AVX512DQ-NEXT: vmovdqa64 %xmm29, %xmm5
+; AVX512DQ-NEXT: vpshufb %xmm4, %xmm5, %xmm3
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512DQ-NEXT: vpshufb %xmm2, %xmm10, %xmm3
-; AVX512DQ-NEXT: vmovdqa64 %xmm10, %xmm24
-; AVX512DQ-NEXT: vmovdqa64 %xmm23, %xmm4
-; AVX512DQ-NEXT: vpshufb %xmm2, %xmm4, %xmm2
+; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm6
+; AVX512DQ-NEXT: vpshufb %xmm2, %xmm6, %xmm3
+; AVX512DQ-NEXT: vmovdqa64 %xmm23, %xmm9
+; AVX512DQ-NEXT: vpshufb %xmm2, %xmm9, %xmm2
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
; AVX512DQ-NEXT: vpsrlq $40, %zmm16, %zmm2
@@ -6774,41 +6809,41 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm22
-; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm10 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
-; AVX512DQ-NEXT: vpshufb %xmm10, %xmm11, %xmm1
-; AVX512DQ-NEXT: vmovdqa64 %xmm11, %xmm23
-; AVX512DQ-NEXT: vpshufb %xmm10, %xmm13, %xmm2
+; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm12 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
+; AVX512DQ-NEXT: vmovdqa64 %xmm27, %xmm0
+; AVX512DQ-NEXT: vpshufb %xmm12, %xmm0, %xmm1
+; AVX512DQ-NEXT: vpshufb %xmm12, %xmm10, %xmm2
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm2 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
-; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm11
-; AVX512DQ-NEXT: vpshufb %xmm2, %xmm11, %xmm3
-; AVX512DQ-NEXT: vpshufb %xmm2, %xmm14, %xmm5
-; AVX512DQ-NEXT: vmovdqa64 %xmm14, %xmm25
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
+; AVX512DQ-NEXT: vpshufb %xmm2, %xmm8, %xmm3
+; AVX512DQ-NEXT: vpshufb %xmm2, %xmm15, %xmm4
+; AVX512DQ-NEXT: vmovdqa64 %xmm15, %xmm25
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
; AVX512DQ-NEXT: vmovd {{.*#+}} xmm3 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT: vpshufb %xmm3, %xmm6, %xmm5
+; AVX512DQ-NEXT: vpshufb %xmm3, %xmm14, %xmm4
; AVX512DQ-NEXT: vpshufb %xmm3, %xmm7, %xmm3
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
-; AVX512DQ-NEXT: vmovd {{.*#+}} xmm5 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT: vpshufb %xmm5, %xmm9, %xmm0
-; AVX512DQ-NEXT: vmovdqa %xmm9, %xmm14
-; AVX512DQ-NEXT: vpshufb %xmm5, %xmm12, %xmm5
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
+; AVX512DQ-NEXT: vmovdqa64 %xmm7, %xmm23
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; AVX512DQ-NEXT: vmovd {{.*#+}} xmm4 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
+; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm7
+; AVX512DQ-NEXT: vpshufb %xmm4, %xmm7, %xmm0
+; AVX512DQ-NEXT: vpshufb %xmm4, %xmm13, %xmm4
+; AVX512DQ-NEXT: vmovdqa %xmm13, %xmm15
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-NEXT: vpshufb %xmm10, %xmm8, %xmm1
-; AVX512DQ-NEXT: vmovdqa %xmm8, %xmm9
-; AVX512DQ-NEXT: vpshufb %xmm10, %xmm15, %xmm3
+; AVX512DQ-NEXT: vpshufb %xmm12, %xmm11, %xmm1
+; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm13
+; AVX512DQ-NEXT: vpshufb %xmm12, %xmm5, %xmm3
+; AVX512DQ-NEXT: vmovdqa64 %xmm29, %xmm11
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm10
-; AVX512DQ-NEXT: vpshufb %xmm2, %xmm10, %xmm3
-; AVX512DQ-NEXT: vpshufb %xmm2, %xmm4, %xmm2
-; AVX512DQ-NEXT: vmovdqa %xmm4, %xmm8
+; AVX512DQ-NEXT: vpshufb %xmm2, %xmm6, %xmm3
+; AVX512DQ-NEXT: vpshufb %xmm2, %xmm9, %xmm2
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
; AVX512DQ-NEXT: vpsrlq $48, %zmm16, %zmm2
@@ -6816,12 +6851,12 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
-; AVX512DQ-NEXT: vmovdqa64 %xmm23, %xmm2
+; AVX512DQ-NEXT: vmovdqa64 %xmm27, %xmm2
; AVX512DQ-NEXT: vpshufb %xmm1, %xmm2, %xmm2
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm13, %xmm3
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm10, %xmm3
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm3 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
-; AVX512DQ-NEXT: vpshufb %xmm3, %xmm11, %xmm4
+; AVX512DQ-NEXT: vpshufb %xmm3, %xmm8, %xmm4
; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm5
; AVX512DQ-NEXT: vpshufb %xmm3, %xmm5, %xmm5
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
@@ -6829,22 +6864,23 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7]
; AVX512DQ-NEXT: vmovd {{.*#+}} xmm4 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT: vpshufb %xmm4, %xmm6, %xmm5
-; AVX512DQ-NEXT: vpshufb %xmm4, %xmm7, %xmm4
+; AVX512DQ-NEXT: vpshufb %xmm4, %xmm14, %xmm5
+; AVX512DQ-NEXT: vmovdqa64 %xmm23, %xmm8
+; AVX512DQ-NEXT: vpshufb %xmm4, %xmm8, %xmm4
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; AVX512DQ-NEXT: vmovd {{.*#+}} xmm5 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT: vpshufb %xmm5, %xmm14, %xmm7
-; AVX512DQ-NEXT: vpshufb %xmm5, %xmm12, %xmm5
+; AVX512DQ-NEXT: vpshufb %xmm5, %xmm7, %xmm7
+; AVX512DQ-NEXT: vpshufb %xmm5, %xmm15, %xmm5
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm9, %xmm4
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm15, %xmm1
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm13, %xmm4
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm11, %xmm1
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; AVX512DQ-NEXT: vpshufb %xmm3, %xmm10, %xmm4
-; AVX512DQ-NEXT: vpshufb %xmm3, %xmm8, %xmm3
+; AVX512DQ-NEXT: vpshufb %xmm3, %xmm6, %xmm4
+; AVX512DQ-NEXT: vpshufb %xmm3, %xmm9, %xmm3
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
; AVX512DQ-NEXT: vpsrlq $56, %zmm16, %zmm3
@@ -6867,184 +6903,188 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;
; AVX512DQ-FCP-LABEL: load_i8_stride8_vf32:
; AVX512DQ-FCP: # %bb.0:
-; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm20
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm19
+; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm16
+; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm17
+; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm18
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6]
-; AVX512DQ-FCP-NEXT: vmovdqa64 224(%rdi), %ymm16
-; AVX512DQ-FCP-NEXT: vpermd %ymm16, %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm3
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm13 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm17
-; AVX512DQ-FCP-NEXT: vpermd %ymm17, %ymm0, %ymm2
-; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm2, %ymm4
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm4[0,1,2,3,4,5,6],ymm3[7]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-FCP-NEXT: vmovdqa64 160(%rdi), %ymm18
-; AVX512DQ-FCP-NEXT: vpermd %ymm18, %ymm0, %ymm3
-; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm6
+; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm11
+; AVX512DQ-FCP-NEXT: vpermd %ymm11, %ymm0, %ymm1
+; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm1, %ymm2
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm12 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [8,10,10,11,8,10,12,14]
+; AVX512DQ-FCP-NEXT: vpermd %zmm18, %zmm4, %zmm3
+; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm3, %ymm5
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2,3,4,5,6],ymm5[7]
; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm11
-; AVX512DQ-FCP-NEXT: vpermd %ymm11, %ymm0, %ymm4
-; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm4, %ymm0
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm14
+; AVX512DQ-FCP-NEXT: vpermd %ymm14, %ymm0, %ymm6
+; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm6, %ymm0
+; AVX512DQ-FCP-NEXT: vpermd %zmm17, %zmm4, %zmm2
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm12
-; AVX512DQ-FCP-NEXT: vpmovqd %ymm12, %xmm7
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm7[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12]
-; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm19
-; AVX512DQ-FCP-NEXT: vpmovqd %ymm19, %xmm9
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm9[u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3]
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm20, %xmm6
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm20
+; AVX512DQ-FCP-NEXT: vpmovqd %ymm20, %xmm4
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm4, %xmm7
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [8,10,12,14,12,14,14,15]
+; AVX512DQ-FCP-NEXT: vpermd %zmm16, %zmm5, %zmm5
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1,2],xmm9[3]
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm19, %xmm9
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm9[0,1],xmm7[2,3]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm21
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm7[u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm9[u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3]
-; AVX512DQ-FCP-NEXT: vpsrlq $8, %zmm20, %zmm6
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm6, %xmm6
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm7[7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm9[5],ymm7[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm5[u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1,2],xmm9[3]
+; AVX512DQ-FCP-NEXT: vpsrlq $8, %zmm19, %zmm9
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm9, %xmm9
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm9[0,1],xmm7[2,3]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm22
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm31
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm5
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm30
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm5
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm29
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm6
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm8, %ymm28
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm7, %xmm5
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm6, %xmm27
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm8 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm9, %xmm6
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm8, %xmm26
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3]
-; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm20, %zmm6
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm6, %xmm6
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm5[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm25
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm24
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm23
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm2
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm7, %ymm31
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm7
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm9, %ymm30
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm7[7]
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm6, %ymm7
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm9, %ymm29
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm9
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm10, %ymm27
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm9[5],ymm7[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm4, %xmm7
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm9, %xmm28
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm9
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm10, %xmm25
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1,2],xmm9[3]
+; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm19, %zmm9
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm9, %xmm9
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm9[0,1],xmm7[2,3]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm7, %ymm26
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm3, %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm7, %ymm24
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm6, %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm23
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm7, %xmm1
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm9, %xmm2
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
-; AVX512DQ-FCP-NEXT: vpsrlq $24, %zmm20, %zmm2
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm4, %xmm1
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm2
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3]
+; AVX512DQ-FCP-NEXT: vpsrlq $24, %zmm19, %zmm2
; AVX512DQ-FCP-NEXT: vpmovqb %zmm2, %xmm2
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7]
-; AVX512DQ-FCP-NEXT: vpermd %ymm16, %ymm0, %ymm4
-; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm1
-; AVX512DQ-FCP-NEXT: vpermd %ymm17, %ymm0, %ymm7
-; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm7, %ymm9
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-FCP-NEXT: vpermd %ymm18, %ymm0, %ymm9
-; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm9, %ymm14
-; AVX512DQ-FCP-NEXT: vpermd %ymm11, %ymm0, %ymm11
-; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm11, %ymm0
+; AVX512DQ-FCP-NEXT: vpermd %ymm11, %ymm0, %ymm2
+; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm1
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [9,11,10,11,9,11,13,15]
+; AVX512DQ-FCP-NEXT: vpermd %zmm18, %zmm11, %zmm4
+; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm4, %ymm5
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7]
+; AVX512DQ-FCP-NEXT: vpermd %ymm14, %ymm0, %ymm5
+; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm5, %ymm0
+; AVX512DQ-FCP-NEXT: vpermd %zmm17, %zmm11, %zmm11
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,5,7,5,7,6,7]
-; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm0, %ymm14
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm14[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12]
-; AVX512DQ-FCP-NEXT: vpermd %ymm19, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1,2],xmm12[3]
-; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm20, %zmm15
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [9,11,13,15,13,15,14,15]
+; AVX512DQ-FCP-NEXT: vpermd %zmm16, %zmm0, %zmm14
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm14[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,5,7,0,0,0,0]
+; AVX512DQ-FCP-NEXT: vpermd %ymm20, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm15
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm15[0,1,2],xmm8[3]
+; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm19, %zmm15
; AVX512DQ-FCP-NEXT: vpmovqb %zmm15, %xmm15
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1],xmm12[2,3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm15[0,1],xmm8[2,3]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm15[7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5],ymm15[6,7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm1[6,7]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm14[u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm0[u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3]
-; AVX512DQ-FCP-NEXT: vpsrlq $40, %zmm20, %zmm15
+; AVX512DQ-FCP-NEXT: vpsrlq $40, %zmm19, %zmm15
; AVX512DQ-FCP-NEXT: vpmovqb %zmm15, %xmm15
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm10
-; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm13
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm10
-; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm7, %ymm15
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm10
-; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm15
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm10
-; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm11, %ymm10
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm15[5],ymm10[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm13[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm13
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm12
+; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm13
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm12
+; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm4, %ymm15
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm12
+; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm5, %ymm15
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm12
+; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm11, %ymm12
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3,4],ymm12[5],ymm15[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm25, %xmm13
; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm14, %xmm13
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm26, %xmm15
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm28, %xmm15
; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm0, %xmm15
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3]
-; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm20, %zmm15
+; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm19, %zmm15
; AVX512DQ-FCP-NEXT: vpmovqb %zmm15, %xmm15
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3],ymm10[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm13
-; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm4, %ymm4
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm13
+; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm2, %ymm2
; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm13
-; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm7, %ymm7
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5,6],ymm4[7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm7
-; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm9, %ymm7
-; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm11, %ymm8
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm14, %xmm5
-; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[3]
-; AVX512DQ-FCP-NEXT: vpsrlq $56, %zmm20, %zmm3
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm3, %xmm3
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm4, %ymm4
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm4
+; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm4
+; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm11, %ymm5
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm14, %xmm4
+; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3]
+; AVX512DQ-FCP-NEXT: vpsrlq $56, %zmm19, %zmm4
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm4, %xmm4
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, (%rsi)
; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, (%rcx)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, (%r8)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, (%r9)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, (%rcx)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, (%r8)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, (%r9)
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, (%rax)
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, (%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, (%rax)
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rax)
; AVX512DQ-FCP-NEXT: vzeroupper
@@ -7056,291 +7096,296 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm16
-; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm0
-; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0
-; AVX512BW-NEXT: vmovdqa 240(%rdi), %xmm2
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
-; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm1
-; AVX512BW-NEXT: vmovdqa 224(%rdi), %xmm6
-; AVX512BW-NEXT: vpshufb %xmm4, %xmm6, %xmm3
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512BW-NEXT: vmovdqa 208(%rdi), %xmm7
+; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm0
+; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3
+; AVX512BW-NEXT: vmovdqa 208(%rdi), %xmm1
; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
-; AVX512BW-NEXT: vpshufb %xmm5, %xmm7, %xmm3
-; AVX512BW-NEXT: vmovdqa 192(%rdi), %xmm9
-; AVX512BW-NEXT: vpshufb %xmm5, %xmm9, %xmm8
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
-; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
-; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm3
-; AVX512BW-NEXT: vpmovqb %ymm3, %xmm3
+; AVX512BW-NEXT: vpshufb %xmm5, %xmm1, %xmm4
+; AVX512BW-NEXT: vmovdqa 192(%rdi), %xmm2
+; AVX512BW-NEXT: vpshufb %xmm5, %xmm2, %xmm6
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
+; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512BW-NEXT: vmovdqa 224(%rdi), %xmm6
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
+; AVX512BW-NEXT: vpshufb %xmm8, %xmm6, %xmm7
+; AVX512BW-NEXT: vpmovqb %zmm3, %xmm3
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5],ymm3[6,7]
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512BW-NEXT: vmovdqa 112(%rdi), %xmm1
-; AVX512BW-NEXT: vpshufb %xmm4, %xmm1, %xmm8
-; AVX512BW-NEXT: vmovdqa 96(%rdi), %xmm3
-; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3]
-; AVX512BW-NEXT: vmovdqa 80(%rdi), %xmm4
-; AVX512BW-NEXT: vpshufb %xmm5, %xmm4, %xmm11
-; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm8
-; AVX512BW-NEXT: vpshufb %xmm5, %xmm8, %xmm5
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3]
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm10[3]
-; AVX512BW-NEXT: vpmovqb %zmm16, %xmm10
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3]
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
+; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm4
+; AVX512BW-NEXT: vpmovqb %ymm4, %xmm4
+; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512BW-NEXT: vmovdqa 160(%rdi), %ymm7
+; AVX512BW-NEXT: vpmovqb %ymm7, %xmm9
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,0,1]
+; AVX512BW-NEXT: vpmovqb %ymm7, %xmm7
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3]
+; AVX512BW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm7[5],ymm4[6,7]
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3,4,5],ymm3[6,7]
+; AVX512BW-NEXT: vmovdqa 80(%rdi), %xmm3
+; AVX512BW-NEXT: vpshufb %xmm5, %xmm3, %xmm7
+; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm4
+; AVX512BW-NEXT: vpshufb %xmm5, %xmm4, %xmm5
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
+; AVX512BW-NEXT: vmovdqa 96(%rdi), %xmm7
+; AVX512BW-NEXT: vpshufb %xmm8, %xmm7, %xmm8
+; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3]
+; AVX512BW-NEXT: vpmovqb %zmm16, %xmm5
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7]
; AVX512BW-NEXT: vmovdqa64 %ymm0, %ymm20
; AVX512BW-NEXT: vmovdqa64 128(%rdi), %xmm19
-; AVX512BW-NEXT: vmovdqa 144(%rdi), %xmm11
-; AVX512BW-NEXT: vmovdqa 160(%rdi), %xmm12
-; AVX512BW-NEXT: vmovdqa 176(%rdi), %xmm13
+; AVX512BW-NEXT: vmovdqa 144(%rdi), %xmm9
+; AVX512BW-NEXT: vmovdqa 160(%rdi), %xmm10
+; AVX512BW-NEXT: vmovdqa 176(%rdi), %xmm11
+; AVX512BW-NEXT: vmovdqa 240(%rdi), %xmm12
; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX512BW-NEXT: vpshufb %xmm0, %xmm2, %xmm5
-; AVX512BW-NEXT: vpshufb %xmm0, %xmm6, %xmm14
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3]
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm12, %xmm5
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm6, %xmm13
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3]
; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm13 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX512BW-NEXT: vpshufb %xmm13, %xmm1, %xmm14
+; AVX512BW-NEXT: vpshufb %xmm13, %xmm2, %xmm15
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
+; AVX512BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5,6],ymm5[7]
; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm14 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX512BW-NEXT: vpshufb %xmm14, %xmm7, %xmm15
-; AVX512BW-NEXT: vpshufb %xmm14, %xmm9, %xmm17
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3]
-; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5,6],ymm5[7]
+; AVX512BW-NEXT: vpshufb %xmm14, %xmm11, %xmm15
+; AVX512BW-NEXT: vpshufb %xmm14, %xmm10, %xmm14
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
+; AVX512BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm15 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX512BW-NEXT: vpshufb %xmm15, %xmm13, %xmm17
-; AVX512BW-NEXT: vpshufb %xmm15, %xmm12, %xmm15
+; AVX512BW-NEXT: vpshufb %xmm15, %xmm9, %xmm17
+; AVX512BW-NEXT: vpshufb %xmm15, %xmm19, %xmm15
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3]
; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm17 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX512BW-NEXT: vpshufb %xmm17, %xmm11, %xmm18
-; AVX512BW-NEXT: vpshufb %xmm17, %xmm19, %xmm17
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm17 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3]
-; AVX512BW-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm10
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm15[5],ymm10[6,7]
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3,4,5],ymm5[6,7]
-; AVX512BW-NEXT: vpshufb %xmm0, %xmm1, %xmm10
-; AVX512BW-NEXT: vpshufb %xmm0, %xmm3, %xmm0
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3]
-; AVX512BW-NEXT: vpshufb %xmm14, %xmm4, %xmm10
-; AVX512BW-NEXT: vpshufb %xmm14, %xmm8, %xmm14
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3]
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3]
-; AVX512BW-NEXT: vpsrlq $8, %zmm16, %zmm10
-; AVX512BW-NEXT: vpmovqb %zmm10, %xmm10
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5],ymm5[6,7]
+; AVX512BW-NEXT: vmovdqa 112(%rdi), %xmm14
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm14, %xmm15
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm7, %xmm0
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3]
+; AVX512BW-NEXT: vpshufb %xmm13, %xmm3, %xmm15
+; AVX512BW-NEXT: vpshufb %xmm13, %xmm4, %xmm13
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm13[0,1,2],xmm0[3]
+; AVX512BW-NEXT: vpsrlq $8, %zmm16, %zmm13
+; AVX512BW-NEXT: vpmovqb %zmm13, %xmm13
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,3]
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
; AVX512BW-NEXT: vmovdqa64 %ymm0, %ymm21
; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX512BW-NEXT: vpshufb %xmm0, %xmm2, %xmm5
-; AVX512BW-NEXT: vpshufb %xmm0, %xmm6, %xmm10
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3]
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm12, %xmm5
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm6, %xmm13
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3]
; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX512BW-NEXT: vpshufb %xmm10, %xmm7, %xmm14
-; AVX512BW-NEXT: vpshufb %xmm10, %xmm9, %xmm15
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
-; AVX512BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5,6],ymm5[7]
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm14 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX512BW-NEXT: vpshufb %xmm14, %xmm13, %xmm15
-; AVX512BW-NEXT: vpshufb %xmm14, %xmm12, %xmm14
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
-; AVX512BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm13 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX512BW-NEXT: vpshufb %xmm13, %xmm1, %xmm15
+; AVX512BW-NEXT: vpshufb %xmm13, %xmm2, %xmm17
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3]
+; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5,6],ymm5[7]
; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm15 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
; AVX512BW-NEXT: vpshufb %xmm15, %xmm11, %xmm17
-; AVX512BW-NEXT: vpshufb %xmm15, %xmm19, %xmm15
+; AVX512BW-NEXT: vpshufb %xmm15, %xmm10, %xmm15
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3]
; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5],ymm5[6,7]
-; AVX512BW-NEXT: vpshufb %xmm0, %xmm1, %xmm14
-; AVX512BW-NEXT: vpshufb %xmm0, %xmm3, %xmm0
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
-; AVX512BW-NEXT: vpshufb %xmm10, %xmm4, %xmm14
-; AVX512BW-NEXT: vpshufb %xmm10, %xmm8, %xmm10
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3]
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3]
-; AVX512BW-NEXT: vpsrlq $16, %zmm16, %zmm10
-; AVX512BW-NEXT: vpmovqb %zmm10, %xmm10
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm17 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX512BW-NEXT: vpshufb %xmm17, %xmm9, %xmm18
+; AVX512BW-NEXT: vpshufb %xmm17, %xmm19, %xmm17
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm17 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3]
+; AVX512BW-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm8
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm15[5],ymm8[6,7]
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7]
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm14, %xmm8
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm7, %xmm0
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
+; AVX512BW-NEXT: vpshufb %xmm13, %xmm3, %xmm8
+; AVX512BW-NEXT: vpshufb %xmm13, %xmm4, %xmm13
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1,2],xmm0[3]
+; AVX512BW-NEXT: vpsrlq $16, %zmm16, %zmm8
+; AVX512BW-NEXT: vpmovqb %zmm8, %xmm8
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1],xmm0[2,3]
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
; AVX512BW-NEXT: vmovdqa64 %ymm0, %ymm22
; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX512BW-NEXT: vpshufb %xmm0, %xmm2, %xmm5
-; AVX512BW-NEXT: vpshufb %xmm0, %xmm6, %xmm10
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3]
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm12, %xmm5
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm6, %xmm8
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3]
; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX512BW-NEXT: vpshufb %xmm10, %xmm7, %xmm14
-; AVX512BW-NEXT: vpshufb %xmm10, %xmm9, %xmm15
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
-; AVX512BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5,6],ymm5[7]
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm14 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX512BW-NEXT: vpshufb %xmm14, %xmm13, %xmm15
-; AVX512BW-NEXT: vpshufb %xmm14, %xmm12, %xmm14
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
-; AVX512BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm15 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX512BW-NEXT: vpshufb %xmm15, %xmm11, %xmm17
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX512BW-NEXT: vpshufb %xmm8, %xmm1, %xmm13
+; AVX512BW-NEXT: vpshufb %xmm8, %xmm2, %xmm15
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3]
+; AVX512BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5,6],ymm5[7]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm13 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX512BW-NEXT: vpshufb %xmm13, %xmm11, %xmm15
+; AVX512BW-NEXT: vpshufb %xmm13, %xmm10, %xmm13
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3]
+; AVX512BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm15 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX512BW-NEXT: vpshufb %xmm15, %xmm9, %xmm17
; AVX512BW-NEXT: vpshufb %xmm15, %xmm19, %xmm15
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3]
; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5],ymm5[6,7]
-; AVX512BW-NEXT: vpshufb %xmm0, %xmm1, %xmm14
-; AVX512BW-NEXT: vpshufb %xmm0, %xmm3, %xmm0
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
-; AVX512BW-NEXT: vpshufb %xmm10, %xmm4, %xmm14
-; AVX512BW-NEXT: vpshufb %xmm10, %xmm8, %xmm10
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3]
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3]
-; AVX512BW-NEXT: vpsrlq $24, %zmm16, %zmm10
-; AVX512BW-NEXT: vpmovqb %zmm10, %xmm10
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5],ymm15[6,7]
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5],ymm5[6,7]
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm14, %xmm13
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm7, %xmm0
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3]
+; AVX512BW-NEXT: vpshufb %xmm8, %xmm3, %xmm13
+; AVX512BW-NEXT: vpshufb %xmm8, %xmm4, %xmm8
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1,2],xmm0[3]
+; AVX512BW-NEXT: vpsrlq $24, %zmm16, %zmm8
+; AVX512BW-NEXT: vpmovqb %zmm8, %xmm8
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1],xmm0[2,3]
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
; AVX512BW-NEXT: vmovdqa64 %ymm0, %ymm23
; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX512BW-NEXT: vpshufb %xmm0, %xmm2, %xmm5
-; AVX512BW-NEXT: vpshufb %xmm0, %xmm6, %xmm10
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3]
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm12, %xmm5
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm6, %xmm8
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3]
; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX512BW-NEXT: vpshufb %xmm10, %xmm7, %xmm14
-; AVX512BW-NEXT: vpshufb %xmm10, %xmm9, %xmm17
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm17[0],xmm14[0],xmm17[1],xmm14[1],xmm17[2],xmm14[2],xmm17[3],xmm14[3]
-; AVX512BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5,6],ymm5[7]
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm14 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX512BW-NEXT: vpshufb %xmm14, %xmm13, %xmm17
-; AVX512BW-NEXT: vpshufb %xmm14, %xmm12, %xmm14
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm17[0],xmm14[1],xmm17[1],xmm14[2],xmm17[2],xmm14[3],xmm17[3]
-; AVX512BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX512BW-NEXT: vpshufb %xmm8, %xmm1, %xmm13
+; AVX512BW-NEXT: vpshufb %xmm8, %xmm2, %xmm17
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm17[0],xmm13[0],xmm17[1],xmm13[1],xmm17[2],xmm13[2],xmm17[3],xmm13[3]
+; AVX512BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5,6],ymm5[7]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm13 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX512BW-NEXT: vpshufb %xmm13, %xmm11, %xmm17
+; AVX512BW-NEXT: vpshufb %xmm13, %xmm10, %xmm13
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm17[0],xmm13[1],xmm17[1],xmm13[2],xmm17[2],xmm13[3],xmm17[3]
+; AVX512BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm17 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX512BW-NEXT: vpshufb %xmm17, %xmm11, %xmm18
+; AVX512BW-NEXT: vpshufb %xmm17, %xmm9, %xmm18
; AVX512BW-NEXT: vpshufb %xmm17, %xmm19, %xmm17
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm17 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3]
; AVX512BW-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm15
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5],ymm5[6,7]
-; AVX512BW-NEXT: vpshufb %xmm0, %xmm1, %xmm14
-; AVX512BW-NEXT: vpshufb %xmm0, %xmm3, %xmm0
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
-; AVX512BW-NEXT: vpshufb %xmm10, %xmm4, %xmm14
-; AVX512BW-NEXT: vpshufb %xmm10, %xmm8, %xmm10
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3]
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3]
-; AVX512BW-NEXT: vpsrlq $32, %zmm16, %zmm10
-; AVX512BW-NEXT: vpmovqb %zmm10, %xmm10
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5],ymm15[6,7]
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5],ymm5[6,7]
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm14, %xmm13
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm7, %xmm0
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3]
+; AVX512BW-NEXT: vpshufb %xmm8, %xmm3, %xmm13
+; AVX512BW-NEXT: vpshufb %xmm8, %xmm4, %xmm8
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1,2],xmm0[3]
+; AVX512BW-NEXT: vpsrlq $32, %zmm16, %zmm8
+; AVX512BW-NEXT: vpmovqb %zmm8, %xmm8
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1],xmm0[2,3]
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
; AVX512BW-NEXT: vmovdqa64 %ymm0, %ymm24
; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX512BW-NEXT: vpshufb %xmm0, %xmm2, %xmm5
-; AVX512BW-NEXT: vpshufb %xmm0, %xmm6, %xmm10
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3]
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm12, %xmm5
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm6, %xmm8
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3]
; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX512BW-NEXT: vpshufb %xmm10, %xmm7, %xmm15
-; AVX512BW-NEXT: vpshufb %xmm10, %xmm9, %xmm17
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX512BW-NEXT: vpshufb %xmm8, %xmm1, %xmm15
+; AVX512BW-NEXT: vpshufb %xmm8, %xmm2, %xmm17
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3]
; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5,6],ymm5[7]
; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm15 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX512BW-NEXT: vpshufb %xmm15, %xmm13, %xmm17
-; AVX512BW-NEXT: vpshufb %xmm15, %xmm12, %xmm15
+; AVX512BW-NEXT: vpshufb %xmm15, %xmm11, %xmm17
+; AVX512BW-NEXT: vpshufb %xmm15, %xmm10, %xmm15
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3]
; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm17 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX512BW-NEXT: vpshufb %xmm17, %xmm11, %xmm18
+; AVX512BW-NEXT: vpshufb %xmm17, %xmm9, %xmm18
; AVX512BW-NEXT: vpshufb %xmm17, %xmm19, %xmm17
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm17 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3]
-; AVX512BW-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm14
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5],ymm14[6,7]
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5],ymm5[6,7]
-; AVX512BW-NEXT: vpshufb %xmm0, %xmm1, %xmm14
-; AVX512BW-NEXT: vpshufb %xmm0, %xmm3, %xmm0
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
-; AVX512BW-NEXT: vpshufb %xmm10, %xmm4, %xmm14
-; AVX512BW-NEXT: vpshufb %xmm10, %xmm8, %xmm10
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3]
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3]
-; AVX512BW-NEXT: vpsrlq $40, %zmm16, %zmm10
-; AVX512BW-NEXT: vpmovqb %zmm10, %xmm10
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
+; AVX512BW-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm13
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7]
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5],ymm5[6,7]
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm14, %xmm13
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm7, %xmm0
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3]
+; AVX512BW-NEXT: vpshufb %xmm8, %xmm3, %xmm13
+; AVX512BW-NEXT: vpshufb %xmm8, %xmm4, %xmm8
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1,2],xmm0[3]
+; AVX512BW-NEXT: vpsrlq $40, %zmm16, %zmm8
+; AVX512BW-NEXT: vpmovqb %zmm8, %xmm8
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1],xmm0[2,3]
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
; AVX512BW-NEXT: vmovdqa64 %ymm0, %ymm25
; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX512BW-NEXT: vpshufb %xmm0, %xmm2, %xmm10
-; AVX512BW-NEXT: vpshufb %xmm0, %xmm6, %xmm14
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3]
-; AVX512BW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm14 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX512BW-NEXT: vpshufb %xmm14, %xmm7, %xmm15
-; AVX512BW-NEXT: vpshufb %xmm14, %xmm9, %xmm17
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm12, %xmm8
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm6, %xmm13
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3]
+; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm13 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX512BW-NEXT: vpshufb %xmm13, %xmm1, %xmm15
+; AVX512BW-NEXT: vpshufb %xmm13, %xmm2, %xmm17
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3]
; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3,4,5,6],ymm10[7]
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1,2,3,4,5,6],ymm8[7]
; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm15 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX512BW-NEXT: vpshufb %xmm15, %xmm13, %xmm17
-; AVX512BW-NEXT: vpshufb %xmm15, %xmm12, %xmm15
+; AVX512BW-NEXT: vpshufb %xmm15, %xmm11, %xmm17
+; AVX512BW-NEXT: vpshufb %xmm15, %xmm10, %xmm15
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3]
; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm17 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX512BW-NEXT: vpshufb %xmm17, %xmm11, %xmm18
+; AVX512BW-NEXT: vpshufb %xmm17, %xmm9, %xmm18
; AVX512BW-NEXT: vpshufb %xmm17, %xmm19, %xmm17
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm17 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3]
; AVX512BW-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm5
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm15[5],ymm5[6,7]
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm10[6,7]
-; AVX512BW-NEXT: vpshufb %xmm0, %xmm1, %xmm10
-; AVX512BW-NEXT: vpshufb %xmm0, %xmm3, %xmm0
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3]
-; AVX512BW-NEXT: vpshufb %xmm14, %xmm4, %xmm10
-; AVX512BW-NEXT: vpshufb %xmm14, %xmm8, %xmm14
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3]
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3]
-; AVX512BW-NEXT: vpsrlq $48, %zmm16, %zmm10
-; AVX512BW-NEXT: vpmovqb %zmm10, %xmm10
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6,7]
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm14, %xmm8
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm7, %xmm0
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
+; AVX512BW-NEXT: vpshufb %xmm13, %xmm3, %xmm8
+; AVX512BW-NEXT: vpshufb %xmm13, %xmm4, %xmm13
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1,2],xmm0[3]
+; AVX512BW-NEXT: vpsrlq $48, %zmm16, %zmm8
+; AVX512BW-NEXT: vpmovqb %zmm8, %xmm8
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1],xmm0[2,3]
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX512BW-NEXT: vpshufb %xmm5, %xmm2, %xmm2
+; AVX512BW-NEXT: vpshufb %xmm5, %xmm12, %xmm8
; AVX512BW-NEXT: vpshufb %xmm5, %xmm6, %xmm6
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3]
+; AVX512BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX512BW-NEXT: vpshufb %xmm8, %xmm1, %xmm1
+; AVX512BW-NEXT: vpshufb %xmm8, %xmm2, %xmm2
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm11, %xmm6
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm10, %xmm2
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm6 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX512BW-NEXT: vpshufb %xmm6, %xmm7, %xmm7
; AVX512BW-NEXT: vpshufb %xmm6, %xmm9, %xmm9
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3]
-; AVX512BW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5,6],ymm2[7]
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm7 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX512BW-NEXT: vpshufb %xmm7, %xmm13, %xmm9
-; AVX512BW-NEXT: vpshufb %xmm7, %xmm12, %xmm7
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3]
-; AVX512BW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm9 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX512BW-NEXT: vpshufb %xmm9, %xmm11, %xmm10
-; AVX512BW-NEXT: vpshufb %xmm9, %xmm19, %xmm9
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX512BW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5],ymm9[6,7]
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5],ymm2[6,7]
-; AVX512BW-NEXT: vpshufb %xmm5, %xmm1, %xmm1
-; AVX512BW-NEXT: vpshufb %xmm5, %xmm3, %xmm3
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512BW-NEXT: vpshufb %xmm6, %xmm4, %xmm3
-; AVX512BW-NEXT: vpshufb %xmm6, %xmm8, %xmm4
+; AVX512BW-NEXT: vpshufb %xmm6, %xmm19, %xmm6
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3]
+; AVX512BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4],ymm2[5],ymm6[6,7]
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
+; AVX512BW-NEXT: vpshufb %xmm5, %xmm14, %xmm2
+; AVX512BW-NEXT: vpshufb %xmm5, %xmm7, %xmm5
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
+; AVX512BW-NEXT: vpshufb %xmm8, %xmm3, %xmm3
+; AVX512BW-NEXT: vpshufb %xmm8, %xmm4, %xmm4
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3]
; AVX512BW-NEXT: vpsrlq $56, %zmm16, %zmm3
; AVX512BW-NEXT: vpmovqb %zmm3, %xmm3
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX512BW-NEXT: vmovdqa64 %ymm20, (%rsi)
; AVX512BW-NEXT: vmovdqa64 %ymm21, (%rdx)
; AVX512BW-NEXT: vmovdqa64 %ymm22, (%rcx)
@@ -7354,167 +7399,172 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;
; AVX512BW-FCP-LABEL: load_i8_stride8_vf32:
; AVX512BW-FCP: # %bb.0:
-; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm4
+; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2
+; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm7
+; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4
+; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6]
-; AVX512BW-FCP-NEXT: vmovdqa 224(%rdi), %ymm5
-; AVX512BW-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm7
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28]
-; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm8
-; AVX512BW-FCP-NEXT: vpermd %ymm8, %ymm0, %ymm30
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm30[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
-; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm11
-; AVX512BW-FCP-NEXT: vpermd %ymm11, %ymm0, %ymm3
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm15
-; AVX512BW-FCP-NEXT: vpermd %ymm15, %ymm0, %ymm2
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5],ymm9[6,7]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm1[6,7]
-; AVX512BW-FCP-NEXT: vmovdqa 96(%rdi), %ymm12
-; AVX512BW-FCP-NEXT: vpmovqd %ymm12, %xmm1
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12]
-; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm16
-; AVX512BW-FCP-NEXT: vpmovqd %ymm16, %xmm0
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm0[u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1,2],xmm9[3]
-; AVX512BW-FCP-NEXT: vpmovqb %zmm4, %xmm14
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm30[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5,6],ymm6[7]
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm31 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512BW-FCP-NEXT: vpshufb %ymm31, %ymm3, %ymm9
+; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm12
+; AVX512BW-FCP-NEXT: vpermd %ymm12, %ymm0, %ymm8
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [8,10,10,11,8,10,12,14]
+; AVX512BW-FCP-NEXT: vpermd %zmm5, %zmm3, %zmm30
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm30[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7]
+; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm14
+; AVX512BW-FCP-NEXT: vpermd %ymm14, %ymm0, %ymm6
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpermd %zmm4, %zmm3, %zmm0
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4],ymm3[5],ymm9[6,7]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm3[0,1,2,3,4,5],ymm1[6,7]
+; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm10
+; AVX512BW-FCP-NEXT: vpmovqd %ymm10, %xmm1
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm1[u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [8,10,12,14,12,14,14,15]
+; AVX512BW-FCP-NEXT: vpermd %zmm7, %zmm3, %zmm3
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm31 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512BW-FCP-NEXT: vpshufb %xmm31, %xmm3, %xmm15
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1,2],xmm15[3]
+; AVX512BW-FCP-NEXT: vpmovqb %zmm2, %xmm15
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm15[0,1],xmm11[2,3]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7]
+; AVX512BW-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm30[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm11[7]
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm18 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512BW-FCP-NEXT: vpshufb %ymm18, %ymm6, %ymm11
; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm19 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm2, %ymm10
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7]
+; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm0, %ymm13
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm13[5],ymm11[6,7]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7]
; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm20 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512BW-FCP-NEXT: vpshufb %xmm20, %xmm1, %xmm9
+; AVX512BW-FCP-NEXT: vpshufb %xmm20, %xmm1, %xmm11
; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm21 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512BW-FCP-NEXT: vpshufb %xmm21, %xmm0, %xmm10
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3]
-; AVX512BW-FCP-NEXT: vpsrlq $8, %zmm4, %zmm10
-; AVX512BW-FCP-NEXT: vpmovqb %zmm10, %xmm10
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-FCP-NEXT: vmovdqa64 %ymm6, %ymm18
+; AVX512BW-FCP-NEXT: vpshufb %xmm21, %xmm3, %xmm13
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1,2],xmm13[3]
+; AVX512BW-FCP-NEXT: vpsrlq $8, %zmm2, %zmm13
+; AVX512BW-FCP-NEXT: vpmovqb %zmm13, %xmm13
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7]
+; AVX512BW-FCP-NEXT: vmovdqa64 %ymm9, %ymm17
; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm22 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512BW-FCP-NEXT: vpshufb %ymm22, %ymm7, %ymm6
+; AVX512BW-FCP-NEXT: vpshufb %ymm22, %ymm8, %ymm9
; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm23 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512BW-FCP-NEXT: vpshufb %ymm23, %ymm30, %ymm10
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4,5,6],ymm6[7]
+; AVX512BW-FCP-NEXT: vpshufb %ymm23, %ymm30, %ymm13
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm13[7]
; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm24 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512BW-FCP-NEXT: vpshufb %ymm24, %ymm3, %ymm10
+; AVX512BW-FCP-NEXT: vpshufb %ymm24, %ymm6, %ymm13
; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm25 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512BW-FCP-NEXT: vpshufb %ymm25, %ymm2, %ymm13
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3,4],ymm10[5],ymm13[6,7]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4,5],ymm6[6,7]
+; AVX512BW-FCP-NEXT: vpshufb %ymm25, %ymm0, %ymm15
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3,4,5],ymm9[6,7]
; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm26 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512BW-FCP-NEXT: vpshufb %xmm26, %xmm1, %xmm10
+; AVX512BW-FCP-NEXT: vpshufb %xmm26, %xmm1, %xmm13
; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm27 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512BW-FCP-NEXT: vpshufb %xmm27, %xmm0, %xmm13
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm13[0,1,2],xmm10[3]
-; AVX512BW-FCP-NEXT: vpsrlq $16, %zmm4, %zmm13
-; AVX512BW-FCP-NEXT: vpmovqb %zmm13, %xmm13
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm13[0,1],xmm10[2,3]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7]
+; AVX512BW-FCP-NEXT: vpshufb %xmm27, %xmm3, %xmm15
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1,2],xmm15[3]
+; AVX512BW-FCP-NEXT: vpsrlq $16, %zmm2, %zmm15
+; AVX512BW-FCP-NEXT: vpmovqb %zmm15, %xmm15
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7]
; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm28 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512BW-FCP-NEXT: vpshufb %ymm28, %ymm7, %ymm7
+; AVX512BW-FCP-NEXT: vpshufb %ymm28, %ymm8, %ymm8
; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm29 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512BW-FCP-NEXT: vpshufb %ymm29, %ymm30, %ymm10
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm7[7]
+; AVX512BW-FCP-NEXT: vpshufb %ymm29, %ymm30, %ymm13
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm8[0,1,2,3,4,5,6],ymm13[7]
; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm30 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512BW-FCP-NEXT: vpshufb %ymm30, %ymm3, %ymm3
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm17 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512BW-FCP-NEXT: vpshufb %ymm17, %ymm2, %ymm2
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm10[6,7]
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512BW-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm10
+; AVX512BW-FCP-NEXT: vpshufb %ymm30, %ymm6, %ymm6
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512BW-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm0
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5],ymm6[6,7]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm13[6,7]
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm16 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512BW-FCP-NEXT: vpshufb %xmm16, %xmm1, %xmm13
; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512BW-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm10[3]
-; AVX512BW-FCP-NEXT: vpsrlq $24, %zmm4, %zmm10
-; AVX512BW-FCP-NEXT: vpmovqb %zmm10, %xmm10
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm3[4,5,6,7]
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7]
-; AVX512BW-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm10
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28]
-; AVX512BW-FCP-NEXT: vpermd %ymm8, %ymm0, %ymm8
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5,6],ymm5[7]
-; AVX512BW-FCP-NEXT: vpermd %ymm11, %ymm0, %ymm11
-; AVX512BW-FCP-NEXT: vpermd %ymm15, %ymm0, %ymm13
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb %xmm1, %xmm3, %xmm3
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm13[0,1,2],xmm3[3]
+; AVX512BW-FCP-NEXT: vpsrlq $24, %zmm2, %zmm13
+; AVX512BW-FCP-NEXT: vpmovqb %zmm13, %xmm13
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm13[0,1],xmm3[2,3]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2,3],ymm6[4,5,6,7]
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,3,2,3,1,3,5,7]
+; AVX512BW-FCP-NEXT: vpermd %ymm12, %ymm3, %ymm12
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [9,11,10,11,9,11,13,15]
+; AVX512BW-FCP-NEXT: vpermd %zmm5, %zmm15, %zmm5
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4,5,6],ymm11[7]
+; AVX512BW-FCP-NEXT: vpermd %ymm14, %ymm3, %ymm13
+; AVX512BW-FCP-NEXT: vpermd %zmm4, %zmm15, %zmm14
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm11[6,7]
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [9,11,13,15,13,15,14,15]
+; AVX512BW-FCP-NEXT: vpermd %zmm7, %zmm4, %zmm7
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,3,5,7,0,0,0,0]
+; AVX512BW-FCP-NEXT: vpermd %ymm10, %ymm4, %ymm10
+; AVX512BW-FCP-NEXT: vpshufb %xmm31, %xmm7, %xmm4
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm11[0,1,2],xmm4[3]
+; AVX512BW-FCP-NEXT: vpsrlq $32, %zmm2, %zmm11
+; AVX512BW-FCP-NEXT: vpmovqb %zmm11, %xmm11
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm11[0,1],xmm4[2,3]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm11[7]
+; AVX512BW-FCP-NEXT: vpshufb %ymm18, %ymm13, %ymm11
+; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm14, %ymm15
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm15[5],ymm11[6,7]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3,4,5],ymm4[6,7]
+; AVX512BW-FCP-NEXT: vpshufb %xmm21, %xmm7, %xmm11
+; AVX512BW-FCP-NEXT: vpshufb %xmm20, %xmm10, %xmm15
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm15[0,1,2],xmm11[3]
+; AVX512BW-FCP-NEXT: vpsrlq $40, %zmm2, %zmm15
+; AVX512BW-FCP-NEXT: vpmovqb %zmm15, %xmm15
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm15[0,1],xmm11[2,3]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-FCP-NEXT: vpshufb %ymm22, %ymm12, %ymm11
+; AVX512BW-FCP-NEXT: vpshufb %ymm23, %ymm5, %ymm15
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm15[7]
+; AVX512BW-FCP-NEXT: vpshufb %ymm24, %ymm13, %ymm15
+; AVX512BW-FCP-NEXT: vpshufb %ymm25, %ymm14, %ymm0
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7]
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,3,5,7,5,7,6,7]
-; AVX512BW-FCP-NEXT: vpermd %ymm12, %ymm5, %ymm12
-; AVX512BW-FCP-NEXT: vpermd %ymm16, %ymm5, %ymm15
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm12[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm15[u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm14[0,1,2],xmm5[3]
-; AVX512BW-FCP-NEXT: vpsrlq $32, %zmm4, %zmm14
-; AVX512BW-FCP-NEXT: vpmovqb %zmm14, %xmm14
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm14[0,1],xmm5[2,3]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5,6],ymm5[7]
-; AVX512BW-FCP-NEXT: vpshufb %ymm31, %ymm11, %ymm14
-; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm13, %ymm9
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm14[5],ymm9[6,7]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5],ymm5[6,7]
-; AVX512BW-FCP-NEXT: vpshufb %xmm20, %xmm12, %xmm9
-; AVX512BW-FCP-NEXT: vpshufb %xmm21, %xmm15, %xmm14
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1,2],xmm9[3]
-; AVX512BW-FCP-NEXT: vpsrlq $40, %zmm4, %zmm14
-; AVX512BW-FCP-NEXT: vpmovqb %zmm14, %xmm14
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7]
-; AVX512BW-FCP-NEXT: vpshufb %ymm22, %ymm10, %ymm9
-; AVX512BW-FCP-NEXT: vpshufb %ymm23, %ymm8, %ymm14
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3,4,5,6],ymm9[7]
-; AVX512BW-FCP-NEXT: vpshufb %ymm24, %ymm11, %ymm14
-; AVX512BW-FCP-NEXT: vpshufb %ymm25, %ymm13, %ymm7
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm14[5],ymm7[6,7]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7]
-; AVX512BW-FCP-NEXT: vpshufb %xmm26, %xmm12, %xmm9
-; AVX512BW-FCP-NEXT: vpshufb %xmm27, %xmm15, %xmm14
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1,2],xmm9[3]
-; AVX512BW-FCP-NEXT: vpsrlq $48, %zmm4, %zmm14
-; AVX512BW-FCP-NEXT: vpmovqb %zmm14, %xmm14
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
-; AVX512BW-FCP-NEXT: vpshufb %ymm28, %ymm10, %ymm9
-; AVX512BW-FCP-NEXT: vpshufb %ymm29, %ymm8, %ymm8
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7]
-; AVX512BW-FCP-NEXT: vpshufb %ymm30, %ymm11, %ymm9
-; AVX512BW-FCP-NEXT: vpshufb %ymm17, %ymm13, %ymm10
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7]
-; AVX512BW-FCP-NEXT: vpshufb %xmm2, %xmm12, %xmm2
-; AVX512BW-FCP-NEXT: vpshufb %xmm1, %xmm15, %xmm1
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3]
-; AVX512BW-FCP-NEXT: vpsrlq $56, %zmm4, %zmm2
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7]
+; AVX512BW-FCP-NEXT: vpshufb %xmm27, %xmm7, %xmm11
+; AVX512BW-FCP-NEXT: vpshufb %xmm26, %xmm10, %xmm15
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm15[0,1,2],xmm11[3]
+; AVX512BW-FCP-NEXT: vpsrlq $48, %zmm2, %zmm15
+; AVX512BW-FCP-NEXT: vpmovqb %zmm15, %xmm15
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm15[0,1],xmm11[2,3]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-FCP-NEXT: vpshufb %ymm28, %ymm12, %ymm11
+; AVX512BW-FCP-NEXT: vpshufb %ymm29, %ymm5, %ymm5
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4,5,6],ymm5[7]
+; AVX512BW-FCP-NEXT: vpshufb %ymm30, %ymm13, %ymm11
+; AVX512BW-FCP-NEXT: vpshufb %ymm8, %ymm14, %ymm8
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3,4],ymm8[5],ymm11[6,7]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7]
+; AVX512BW-FCP-NEXT: vpshufb %xmm1, %xmm7, %xmm1
+; AVX512BW-FCP-NEXT: vpshufb %xmm16, %xmm10, %xmm7
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0,1,2],xmm1[3]
+; AVX512BW-FCP-NEXT: vpsrlq $56, %zmm2, %zmm2
; AVX512BW-FCP-NEXT: vpmovqb %zmm2, %xmm2
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX512BW-FCP-NEXT: vmovaps %ymm2, (%rsi)
-; AVX512BW-FCP-NEXT: vmovdqa64 %ymm18, (%rdx)
-; AVX512BW-FCP-NEXT: vmovdqa %ymm6, (%rcx)
-; AVX512BW-FCP-NEXT: vmovdqa %ymm3, (%r8)
-; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%r9)
+; AVX512BW-FCP-NEXT: vmovdqa64 %ymm17, (%rdx)
+; AVX512BW-FCP-NEXT: vmovdqa %ymm9, (%rcx)
+; AVX512BW-FCP-NEXT: vmovdqa %ymm6, (%r8)
+; AVX512BW-FCP-NEXT: vmovdqa %ymm3, (%r9)
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-FCP-NEXT: vmovdqa %ymm5, (%rax)
+; AVX512BW-FCP-NEXT: vmovdqa %ymm4, (%rax)
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-FCP-NEXT: vmovdqa %ymm7, (%rax)
+; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rax)
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FCP-NEXT: vmovdqa %ymm1, (%rax)
; AVX512BW-FCP-NEXT: vzeroupper
@@ -7526,291 +7576,296 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm16
-; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm0
-; AVX512DQ-BW-NEXT: vpmovqb %zmm0, %xmm0
-; AVX512DQ-BW-NEXT: vmovdqa 240(%rdi), %xmm2
-; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
-; AVX512DQ-BW-NEXT: vpshufb %xmm4, %xmm2, %xmm1
-; AVX512DQ-BW-NEXT: vmovdqa 224(%rdi), %xmm6
-; AVX512DQ-BW-NEXT: vpshufb %xmm4, %xmm6, %xmm3
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-BW-NEXT: vmovdqa 208(%rdi), %xmm7
+; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm0
+; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm3
+; AVX512DQ-BW-NEXT: vmovdqa 208(%rdi), %xmm1
; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
-; AVX512DQ-BW-NEXT: vpshufb %xmm5, %xmm7, %xmm3
-; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %xmm9
-; AVX512DQ-BW-NEXT: vpshufb %xmm5, %xmm9, %xmm8
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm3
-; AVX512DQ-BW-NEXT: vpmovqb %ymm3, %xmm3
+; AVX512DQ-BW-NEXT: vpshufb %xmm5, %xmm1, %xmm4
+; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %xmm2
+; AVX512DQ-BW-NEXT: vpshufb %xmm5, %xmm2, %xmm6
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512DQ-BW-NEXT: vmovdqa 224(%rdi), %xmm6
+; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
+; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm6, %xmm7
+; AVX512DQ-BW-NEXT: vpmovqb %zmm3, %xmm3
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5],ymm3[6,7]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-BW-NEXT: vmovdqa 112(%rdi), %xmm1
-; AVX512DQ-BW-NEXT: vpshufb %xmm4, %xmm1, %xmm8
-; AVX512DQ-BW-NEXT: vmovdqa 96(%rdi), %xmm3
-; AVX512DQ-BW-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3]
-; AVX512DQ-BW-NEXT: vmovdqa 80(%rdi), %xmm4
-; AVX512DQ-BW-NEXT: vpshufb %xmm5, %xmm4, %xmm11
-; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %xmm8
-; AVX512DQ-BW-NEXT: vpshufb %xmm5, %xmm8, %xmm5
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm10[3]
-; AVX512DQ-BW-NEXT: vpmovqb %zmm16, %xmm10
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
+; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm4
+; AVX512DQ-BW-NEXT: vpmovqb %ymm4, %xmm4
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512DQ-BW-NEXT: vmovdqa 160(%rdi), %ymm7
+; AVX512DQ-BW-NEXT: vpmovqb %ymm7, %xmm9
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,0,1]
+; AVX512DQ-BW-NEXT: vpmovqb %ymm7, %xmm7
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3]
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm7[5],ymm4[6,7]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2,3,4,5],ymm3[6,7]
+; AVX512DQ-BW-NEXT: vmovdqa 80(%rdi), %xmm3
+; AVX512DQ-BW-NEXT: vpshufb %xmm5, %xmm3, %xmm7
+; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %xmm4
+; AVX512DQ-BW-NEXT: vpshufb %xmm5, %xmm4, %xmm5
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
+; AVX512DQ-BW-NEXT: vmovdqa 96(%rdi), %xmm7
+; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm7, %xmm8
+; AVX512DQ-BW-NEXT: vpmovqb %zmm0, %xmm0
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3]
+; AVX512DQ-BW-NEXT: vpmovqb %zmm16, %xmm5
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa64 %ymm0, %ymm20
; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %xmm19
-; AVX512DQ-BW-NEXT: vmovdqa 144(%rdi), %xmm11
-; AVX512DQ-BW-NEXT: vmovdqa 160(%rdi), %xmm12
-; AVX512DQ-BW-NEXT: vmovdqa 176(%rdi), %xmm13
+; AVX512DQ-BW-NEXT: vmovdqa 144(%rdi), %xmm9
+; AVX512DQ-BW-NEXT: vmovdqa 160(%rdi), %xmm10
+; AVX512DQ-BW-NEXT: vmovdqa 176(%rdi), %xmm11
+; AVX512DQ-BW-NEXT: vmovdqa 240(%rdi), %xmm12
; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm2, %xmm5
-; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm6, %xmm14
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3]
+; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm12, %xmm5
+; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm6, %xmm13
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3]
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm13 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX512DQ-BW-NEXT: vpshufb %xmm13, %xmm1, %xmm14
+; AVX512DQ-BW-NEXT: vpshufb %xmm13, %xmm2, %xmm15
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5,6],ymm5[7]
; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm14 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX512DQ-BW-NEXT: vpshufb %xmm14, %xmm7, %xmm15
-; AVX512DQ-BW-NEXT: vpshufb %xmm14, %xmm9, %xmm17
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3]
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5,6],ymm5[7]
+; AVX512DQ-BW-NEXT: vpshufb %xmm14, %xmm11, %xmm15
+; AVX512DQ-BW-NEXT: vpshufb %xmm14, %xmm10, %xmm14
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm15 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX512DQ-BW-NEXT: vpshufb %xmm15, %xmm13, %xmm17
-; AVX512DQ-BW-NEXT: vpshufb %xmm15, %xmm12, %xmm15
+; AVX512DQ-BW-NEXT: vpshufb %xmm15, %xmm9, %xmm17
+; AVX512DQ-BW-NEXT: vpshufb %xmm15, %xmm19, %xmm15
; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3]
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm17 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX512DQ-BW-NEXT: vpshufb %xmm17, %xmm11, %xmm18
-; AVX512DQ-BW-NEXT: vpshufb %xmm17, %xmm19, %xmm17
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm17 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3]
-; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm10
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm15[5],ymm10[6,7]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3,4,5],ymm5[6,7]
-; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm1, %xmm10
-; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm3, %xmm0
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3]
-; AVX512DQ-BW-NEXT: vpshufb %xmm14, %xmm4, %xmm10
-; AVX512DQ-BW-NEXT: vpshufb %xmm14, %xmm8, %xmm14
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3]
-; AVX512DQ-BW-NEXT: vpsrlq $8, %zmm16, %zmm10
-; AVX512DQ-BW-NEXT: vpmovqb %zmm10, %xmm10
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5],ymm5[6,7]
+; AVX512DQ-BW-NEXT: vmovdqa 112(%rdi), %xmm14
+; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm14, %xmm15
+; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm7, %xmm0
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3]
+; AVX512DQ-BW-NEXT: vpshufb %xmm13, %xmm3, %xmm15
+; AVX512DQ-BW-NEXT: vpshufb %xmm13, %xmm4, %xmm13
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm13[0,1,2],xmm0[3]
+; AVX512DQ-BW-NEXT: vpsrlq $8, %zmm16, %zmm13
+; AVX512DQ-BW-NEXT: vpmovqb %zmm13, %xmm13
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,3]
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa64 %ymm0, %ymm21
; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm2, %xmm5
-; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm6, %xmm10
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3]
+; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm12, %xmm5
+; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm6, %xmm13
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3]
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX512DQ-BW-NEXT: vpshufb %xmm10, %xmm7, %xmm14
-; AVX512DQ-BW-NEXT: vpshufb %xmm10, %xmm9, %xmm15
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5,6],ymm5[7]
-; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm14 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX512DQ-BW-NEXT: vpshufb %xmm14, %xmm13, %xmm15
-; AVX512DQ-BW-NEXT: vpshufb %xmm14, %xmm12, %xmm14
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm13 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX512DQ-BW-NEXT: vpshufb %xmm13, %xmm1, %xmm15
+; AVX512DQ-BW-NEXT: vpshufb %xmm13, %xmm2, %xmm17
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3]
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5,6],ymm5[7]
; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm15 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
; AVX512DQ-BW-NEXT: vpshufb %xmm15, %xmm11, %xmm17
-; AVX512DQ-BW-NEXT: vpshufb %xmm15, %xmm19, %xmm15
+; AVX512DQ-BW-NEXT: vpshufb %xmm15, %xmm10, %xmm15
; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3]
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5],ymm5[6,7]
-; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm1, %xmm14
-; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm3, %xmm0
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
-; AVX512DQ-BW-NEXT: vpshufb %xmm10, %xmm4, %xmm14
-; AVX512DQ-BW-NEXT: vpshufb %xmm10, %xmm8, %xmm10
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3]
-; AVX512DQ-BW-NEXT: vpsrlq $16, %zmm16, %zmm10
-; AVX512DQ-BW-NEXT: vpmovqb %zmm10, %xmm10
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
+; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm17 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX512DQ-BW-NEXT: vpshufb %xmm17, %xmm9, %xmm18
+; AVX512DQ-BW-NEXT: vpshufb %xmm17, %xmm19, %xmm17
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm17 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3]
+; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm8
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm15[5],ymm8[6,7]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7]
+; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm14, %xmm8
+; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm7, %xmm0
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
+; AVX512DQ-BW-NEXT: vpshufb %xmm13, %xmm3, %xmm8
+; AVX512DQ-BW-NEXT: vpshufb %xmm13, %xmm4, %xmm13
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1,2],xmm0[3]
+; AVX512DQ-BW-NEXT: vpsrlq $16, %zmm16, %zmm8
+; AVX512DQ-BW-NEXT: vpmovqb %zmm8, %xmm8
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1],xmm0[2,3]
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa64 %ymm0, %ymm22
; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm2, %xmm5
-; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm6, %xmm10
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3]
+; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm12, %xmm5
+; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm6, %xmm8
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3]
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX512DQ-BW-NEXT: vpshufb %xmm10, %xmm7, %xmm14
-; AVX512DQ-BW-NEXT: vpshufb %xmm10, %xmm9, %xmm15
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5,6],ymm5[7]
-; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm14 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX512DQ-BW-NEXT: vpshufb %xmm14, %xmm13, %xmm15
-; AVX512DQ-BW-NEXT: vpshufb %xmm14, %xmm12, %xmm14
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3]
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm1, %xmm13
+; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm2, %xmm15
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3]
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5,6],ymm5[7]
+; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm13 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX512DQ-BW-NEXT: vpshufb %xmm13, %xmm11, %xmm15
+; AVX512DQ-BW-NEXT: vpshufb %xmm13, %xmm10, %xmm13
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3]
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm15 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX512DQ-BW-NEXT: vpshufb %xmm15, %xmm11, %xmm17
+; AVX512DQ-BW-NEXT: vpshufb %xmm15, %xmm9, %xmm17
; AVX512DQ-BW-NEXT: vpshufb %xmm15, %xmm19, %xmm15
; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3]
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5],ymm5[6,7]
-; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm1, %xmm14
-; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm3, %xmm0
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
-; AVX512DQ-BW-NEXT: vpshufb %xmm10, %xmm4, %xmm14
-; AVX512DQ-BW-NEXT: vpshufb %xmm10, %xmm8, %xmm10
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3]
-; AVX512DQ-BW-NEXT: vpsrlq $24, %zmm16, %zmm10
-; AVX512DQ-BW-NEXT: vpmovqb %zmm10, %xmm10
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5],ymm15[6,7]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5],ymm5[6,7]
+; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm14, %xmm13
+; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm7, %xmm0
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3]
+; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm3, %xmm13
+; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm4, %xmm8
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1,2],xmm0[3]
+; AVX512DQ-BW-NEXT: vpsrlq $24, %zmm16, %zmm8
+; AVX512DQ-BW-NEXT: vpmovqb %zmm8, %xmm8
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1],xmm0[2,3]
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa64 %ymm0, %ymm23
; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm2, %xmm5
-; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm6, %xmm10
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3]
+; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm12, %xmm5
+; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm6, %xmm8
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3]
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX512DQ-BW-NEXT: vpshufb %xmm10, %xmm7, %xmm14
-; AVX512DQ-BW-NEXT: vpshufb %xmm10, %xmm9, %xmm17
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm17[0],xmm14[0],xmm17[1],xmm14[1],xmm17[2],xmm14[2],xmm17[3],xmm14[3]
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5,6],ymm5[7]
-; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm14 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX512DQ-BW-NEXT: vpshufb %xmm14, %xmm13, %xmm17
-; AVX512DQ-BW-NEXT: vpshufb %xmm14, %xmm12, %xmm14
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm17[0],xmm14[1],xmm17[1],xmm14[2],xmm17[2],xmm14[3],xmm17[3]
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm1, %xmm13
+; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm2, %xmm17
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm17[0],xmm13[0],xmm17[1],xmm13[1],xmm17[2],xmm13[2],xmm17[3],xmm13[3]
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5,6],ymm5[7]
+; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm13 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX512DQ-BW-NEXT: vpshufb %xmm13, %xmm11, %xmm17
+; AVX512DQ-BW-NEXT: vpshufb %xmm13, %xmm10, %xmm13
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm17[0],xmm13[1],xmm17[1],xmm13[2],xmm17[2],xmm13[3],xmm17[3]
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm17 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX512DQ-BW-NEXT: vpshufb %xmm17, %xmm11, %xmm18
+; AVX512DQ-BW-NEXT: vpshufb %xmm17, %xmm9, %xmm18
; AVX512DQ-BW-NEXT: vpshufb %xmm17, %xmm19, %xmm17
; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm17 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3]
; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm15
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5],ymm5[6,7]
-; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm1, %xmm14
-; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm3, %xmm0
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
-; AVX512DQ-BW-NEXT: vpshufb %xmm10, %xmm4, %xmm14
-; AVX512DQ-BW-NEXT: vpshufb %xmm10, %xmm8, %xmm10
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3]
-; AVX512DQ-BW-NEXT: vpsrlq $32, %zmm16, %zmm10
-; AVX512DQ-BW-NEXT: vpmovqb %zmm10, %xmm10
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5],ymm15[6,7]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5],ymm5[6,7]
+; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm14, %xmm13
+; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm7, %xmm0
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3]
+; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm3, %xmm13
+; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm4, %xmm8
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1,2],xmm0[3]
+; AVX512DQ-BW-NEXT: vpsrlq $32, %zmm16, %zmm8
+; AVX512DQ-BW-NEXT: vpmovqb %zmm8, %xmm8
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1],xmm0[2,3]
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa64 %ymm0, %ymm24
; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm2, %xmm5
-; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm6, %xmm10
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3]
+; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm12, %xmm5
+; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm6, %xmm8
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3]
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm10 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX512DQ-BW-NEXT: vpshufb %xmm10, %xmm7, %xmm15
-; AVX512DQ-BW-NEXT: vpshufb %xmm10, %xmm9, %xmm17
+; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm1, %xmm15
+; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm2, %xmm17
; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3]
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5,6],ymm5[7]
; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm15 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX512DQ-BW-NEXT: vpshufb %xmm15, %xmm13, %xmm17
-; AVX512DQ-BW-NEXT: vpshufb %xmm15, %xmm12, %xmm15
+; AVX512DQ-BW-NEXT: vpshufb %xmm15, %xmm11, %xmm17
+; AVX512DQ-BW-NEXT: vpshufb %xmm15, %xmm10, %xmm15
; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3]
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm17 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX512DQ-BW-NEXT: vpshufb %xmm17, %xmm11, %xmm18
+; AVX512DQ-BW-NEXT: vpshufb %xmm17, %xmm9, %xmm18
; AVX512DQ-BW-NEXT: vpshufb %xmm17, %xmm19, %xmm17
; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm17 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3]
-; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm14
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5],ymm14[6,7]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5],ymm5[6,7]
-; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm1, %xmm14
-; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm3, %xmm0
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
-; AVX512DQ-BW-NEXT: vpshufb %xmm10, %xmm4, %xmm14
-; AVX512DQ-BW-NEXT: vpshufb %xmm10, %xmm8, %xmm10
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3]
-; AVX512DQ-BW-NEXT: vpsrlq $40, %zmm16, %zmm10
-; AVX512DQ-BW-NEXT: vpmovqb %zmm10, %xmm10
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
+; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm13
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5],ymm5[6,7]
+; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm14, %xmm13
+; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm7, %xmm0
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3]
+; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm3, %xmm13
+; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm4, %xmm8
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1,2],xmm0[3]
+; AVX512DQ-BW-NEXT: vpsrlq $40, %zmm16, %zmm8
+; AVX512DQ-BW-NEXT: vpmovqb %zmm8, %xmm8
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1],xmm0[2,3]
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa64 %ymm0, %ymm25
; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm2, %xmm10
-; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm6, %xmm14
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3]
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm14 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX512DQ-BW-NEXT: vpshufb %xmm14, %xmm7, %xmm15
-; AVX512DQ-BW-NEXT: vpshufb %xmm14, %xmm9, %xmm17
+; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm12, %xmm8
+; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm6, %xmm13
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3]
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
+; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm13 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX512DQ-BW-NEXT: vpshufb %xmm13, %xmm1, %xmm15
+; AVX512DQ-BW-NEXT: vpshufb %xmm13, %xmm2, %xmm17
; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm17[0],xmm15[0],xmm17[1],xmm15[1],xmm17[2],xmm15[2],xmm17[3],xmm15[3]
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3,4,5,6],ymm10[7]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm15[0,1,2,3,4,5,6],ymm8[7]
; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm15 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX512DQ-BW-NEXT: vpshufb %xmm15, %xmm13, %xmm17
-; AVX512DQ-BW-NEXT: vpshufb %xmm15, %xmm12, %xmm15
+; AVX512DQ-BW-NEXT: vpshufb %xmm15, %xmm11, %xmm17
+; AVX512DQ-BW-NEXT: vpshufb %xmm15, %xmm10, %xmm15
; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3]
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm17 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX512DQ-BW-NEXT: vpshufb %xmm17, %xmm11, %xmm18
+; AVX512DQ-BW-NEXT: vpshufb %xmm17, %xmm9, %xmm18
; AVX512DQ-BW-NEXT: vpshufb %xmm17, %xmm19, %xmm17
; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm17 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3]
; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm5
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm15[5],ymm5[6,7]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm10[6,7]
-; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm1, %xmm10
-; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm3, %xmm0
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3]
-; AVX512DQ-BW-NEXT: vpshufb %xmm14, %xmm4, %xmm10
-; AVX512DQ-BW-NEXT: vpshufb %xmm14, %xmm8, %xmm14
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3]
-; AVX512DQ-BW-NEXT: vpsrlq $48, %zmm16, %zmm10
-; AVX512DQ-BW-NEXT: vpmovqb %zmm10, %xmm10
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6,7]
+; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm14, %xmm8
+; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm7, %xmm0
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
+; AVX512DQ-BW-NEXT: vpshufb %xmm13, %xmm3, %xmm8
+; AVX512DQ-BW-NEXT: vpshufb %xmm13, %xmm4, %xmm13
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1,2],xmm0[3]
+; AVX512DQ-BW-NEXT: vpsrlq $48, %zmm16, %zmm8
+; AVX512DQ-BW-NEXT: vpmovqb %zmm8, %xmm8
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1],xmm0[2,3]
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm5 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX512DQ-BW-NEXT: vpshufb %xmm5, %xmm2, %xmm2
+; AVX512DQ-BW-NEXT: vpshufb %xmm5, %xmm12, %xmm8
; AVX512DQ-BW-NEXT: vpshufb %xmm5, %xmm6, %xmm6
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3]
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm1, %xmm1
+; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm2, %xmm2
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7]
+; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm11, %xmm6
+; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm10, %xmm2
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm6 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX512DQ-BW-NEXT: vpshufb %xmm6, %xmm7, %xmm7
; AVX512DQ-BW-NEXT: vpshufb %xmm6, %xmm9, %xmm9
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3]
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5,6],ymm2[7]
-; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm7 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX512DQ-BW-NEXT: vpshufb %xmm7, %xmm13, %xmm9
-; AVX512DQ-BW-NEXT: vpshufb %xmm7, %xmm12, %xmm7
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3]
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm9 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX512DQ-BW-NEXT: vpshufb %xmm9, %xmm11, %xmm10
-; AVX512DQ-BW-NEXT: vpshufb %xmm9, %xmm19, %xmm9
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5],ymm9[6,7]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-BW-NEXT: vpshufb %xmm5, %xmm1, %xmm1
-; AVX512DQ-BW-NEXT: vpshufb %xmm5, %xmm3, %xmm3
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512DQ-BW-NEXT: vpshufb %xmm6, %xmm4, %xmm3
-; AVX512DQ-BW-NEXT: vpshufb %xmm6, %xmm8, %xmm4
+; AVX512DQ-BW-NEXT: vpshufb %xmm6, %xmm19, %xmm6
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3]
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4],ymm2[5],ymm6[6,7]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQ-BW-NEXT: vpshufb %xmm5, %xmm14, %xmm2
+; AVX512DQ-BW-NEXT: vpshufb %xmm5, %xmm7, %xmm5
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
+; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm3, %xmm3
+; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm4, %xmm4
; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3]
; AVX512DQ-BW-NEXT: vpsrlq $56, %zmm16, %zmm3
; AVX512DQ-BW-NEXT: vpmovqb %zmm3, %xmm3
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa64 %ymm20, (%rsi)
; AVX512DQ-BW-NEXT: vmovdqa64 %ymm21, (%rdx)
; AVX512DQ-BW-NEXT: vmovdqa64 %ymm22, (%rcx)
@@ -7824,167 +7879,172 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;
; AVX512DQ-BW-FCP-LABEL: load_i8_stride8_vf32:
; AVX512DQ-BW-FCP: # %bb.0:
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm4
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm7
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 224(%rdi), %ymm5
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm7
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm8
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm8, %ymm0, %ymm30
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm30[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm11
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm11, %ymm0, %ymm3
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm15
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm15, %ymm0, %ymm2
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5],ymm9[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 96(%rdi), %ymm12
-; AVX512DQ-BW-FCP-NEXT: vpmovqd %ymm12, %xmm1
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm16
-; AVX512DQ-BW-FCP-NEXT: vpmovqd %ymm16, %xmm0
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm0[u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1,2],xmm9[3]
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm4, %xmm14
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm30[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5,6],ymm6[7]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm31 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm31, %ymm3, %ymm9
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm12
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm12, %ymm0, %ymm8
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [8,10,10,11,8,10,12,14]
+; AVX512DQ-BW-FCP-NEXT: vpermd %zmm5, %zmm3, %zmm30
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm30[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm14
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm14, %ymm0, %ymm6
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpermd %zmm4, %zmm3, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4],ymm3[5],ymm9[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm3[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm10
+; AVX512DQ-BW-FCP-NEXT: vpmovqd %ymm10, %xmm1
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm1[u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [8,10,12,14,12,14,14,15]
+; AVX512DQ-BW-FCP-NEXT: vpermd %zmm7, %zmm3, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm31 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm31, %xmm3, %xmm15
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1,2],xmm15[3]
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm2, %xmm15
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm15[0,1],xmm11[2,3]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm30[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm11[7]
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm18 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm18, %ymm6, %ymm11
; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm19 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm2, %ymm10
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm0, %ymm13
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm13[5],ymm11[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7]
; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm20 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm20, %xmm1, %xmm9
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm20, %xmm1, %xmm11
; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm21 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm21, %xmm0, %xmm10
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3]
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $8, %zmm4, %zmm10
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm10, %xmm10
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %ymm6, %ymm18
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm21, %xmm3, %xmm13
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1,2],xmm13[3]
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $8, %zmm2, %zmm13
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm13, %xmm13
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %ymm9, %ymm17
; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm22 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm22, %ymm7, %ymm6
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm22, %ymm8, %ymm9
; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm23 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm23, %ymm30, %ymm10
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4,5,6],ymm6[7]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm23, %ymm30, %ymm13
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm13[7]
; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm24 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm24, %ymm3, %ymm10
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm24, %ymm6, %ymm13
; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm25 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm25, %ymm2, %ymm13
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3,4],ymm10[5],ymm13[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4,5],ymm6[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm25, %ymm0, %ymm15
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3,4,5],ymm9[6,7]
; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm26 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm26, %xmm1, %xmm10
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm26, %xmm1, %xmm13
; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm27 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm27, %xmm0, %xmm13
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm13[0,1,2],xmm10[3]
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $16, %zmm4, %zmm13
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm13, %xmm13
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm13[0,1],xmm10[2,3]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm27, %xmm3, %xmm15
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1,2],xmm15[3]
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $16, %zmm2, %zmm15
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm15, %xmm15
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm28 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm28, %ymm7, %ymm7
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm28, %ymm8, %ymm8
; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm29 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm29, %ymm30, %ymm10
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm7[7]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm29, %ymm30, %ymm13
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm8[0,1,2,3,4,5,6],ymm13[7]
; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm30 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm30, %ymm3, %ymm3
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm17 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm17, %ymm2, %ymm2
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm10[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm10
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm30, %ymm6, %ymm6
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5],ymm6[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm13[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm16 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm16, %xmm1, %xmm13
; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm10[3]
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $24, %zmm4, %zmm10
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm10, %xmm10
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm3[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7]
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm10
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28]
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm8, %ymm0, %ymm8
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5,6],ymm5[7]
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm11, %ymm0, %ymm11
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm15, %ymm0, %ymm13
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm1, %xmm3, %xmm3
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm13[0,1,2],xmm3[3]
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $24, %zmm2, %zmm13
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm13, %xmm13
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm13[0,1],xmm3[2,3]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,3,2,3,1,3,5,7]
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm12, %ymm3, %ymm12
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [9,11,10,11,9,11,13,15]
+; AVX512DQ-BW-FCP-NEXT: vpermd %zmm5, %zmm15, %zmm5
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4,5,6],ymm11[7]
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm14, %ymm3, %ymm13
+; AVX512DQ-BW-FCP-NEXT: vpermd %zmm4, %zmm15, %zmm14
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,20,24,28,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm11[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [9,11,13,15,13,15,14,15]
+; AVX512DQ-BW-FCP-NEXT: vpermd %zmm7, %zmm4, %zmm7
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,3,5,7,0,0,0,0]
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm10, %ymm4, %ymm10
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm31, %xmm7, %xmm4
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm11[0,1,2],xmm4[3]
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $32, %zmm2, %zmm11
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm11, %xmm11
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm11[0,1],xmm4[2,3]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm11[7]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm18, %ymm13, %ymm11
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm14, %ymm15
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm15[5],ymm11[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm21, %xmm7, %xmm11
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm20, %xmm10, %xmm15
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm15[0,1,2],xmm11[3]
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $40, %zmm2, %zmm15
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm15, %xmm15
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm15[0,1],xmm11[2,3]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm4[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm22, %ymm12, %ymm11
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm23, %ymm5, %ymm15
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm15[7]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm24, %ymm13, %ymm15
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm25, %ymm14, %ymm0
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,3,5,7,5,7,6,7]
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm12, %ymm5, %ymm12
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm16, %ymm5, %ymm15
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm12[u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm15[u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm14[0,1,2],xmm5[3]
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $32, %zmm4, %zmm14
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm14, %xmm14
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm14[0,1],xmm5[2,3]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3,4,5,6],ymm5[7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm31, %ymm11, %ymm14
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm13, %ymm9
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm14[5],ymm9[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5],ymm5[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm20, %xmm12, %xmm9
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm21, %xmm15, %xmm14
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1,2],xmm9[3]
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $40, %zmm4, %zmm14
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm14, %xmm14
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm22, %ymm10, %ymm9
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm23, %ymm8, %ymm14
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3,4,5,6],ymm9[7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm24, %ymm11, %ymm14
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm25, %ymm13, %ymm7
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm14[5],ymm7[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm26, %xmm12, %xmm9
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm27, %xmm15, %xmm14
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1,2],xmm9[3]
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %zmm4, %zmm14
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm14, %xmm14
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm28, %ymm10, %ymm9
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm29, %ymm8, %ymm8
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm30, %ymm11, %ymm9
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm17, %ymm13, %ymm10
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm2, %xmm12, %xmm2
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm1, %xmm15, %xmm1
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3]
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $56, %zmm4, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm27, %xmm7, %xmm11
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm26, %xmm10, %xmm15
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm15[0,1,2],xmm11[3]
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %zmm2, %zmm15
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm15, %xmm15
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm15[0,1],xmm11[2,3]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm28, %ymm12, %ymm11
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm29, %ymm5, %ymm5
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4,5,6],ymm5[7]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm30, %ymm13, %ymm11
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm8, %ymm14, %ymm8
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3,4],ymm8[5],ymm11[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm1, %xmm7, %xmm1
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm16, %xmm10, %xmm7
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0,1,2],xmm1[3]
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $56, %zmm2, %zmm2
; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm2, %xmm2
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %ymm2, (%rsi)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %ymm18, (%rdx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm6, (%rcx)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm3, (%r8)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%r9)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %ymm17, (%rdx)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm9, (%rcx)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm6, (%r8)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm3, (%r9)
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm5, (%rax)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm4, (%rax)
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm7, (%rax)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rax)
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm1, (%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
@@ -13041,729 +13101,731 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
;
; AVX512-LABEL: load_i8_stride8_vf64:
; AVX512: # %bb.0:
-; AVX512-NEXT: subq $520, %rsp # imm = 0x208
-; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm17
-; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm0
-; AVX512-NEXT: vpmovqb %zmm0, %xmm2
-; AVX512-NEXT: vmovdqa 496(%rdi), %xmm7
-; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
-; AVX512-NEXT: vpshufb %xmm0, %xmm7, %xmm1
-; AVX512-NEXT: vmovdqa 480(%rdi), %xmm8
-; AVX512-NEXT: vpshufb %xmm0, %xmm8, %xmm3
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3
-; AVX512-NEXT: vmovdqa 464(%rdi), %xmm11
+; AVX512-NEXT: subq $408, %rsp # imm = 0x198
+; AVX512-NEXT: vmovdqa64 (%rdi), %zmm16
+; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm0
+; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm3
+; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm30
+; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm4
+; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm5
+; AVX512-NEXT: vmovdqa 464(%rdi), %xmm10
; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
-; AVX512-NEXT: vpshufb %xmm1, %xmm11, %xmm4
-; AVX512-NEXT: vmovdqa 448(%rdi), %xmm15
-; AVX512-NEXT: vpshufb %xmm1, %xmm15, %xmm5
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
-; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-NEXT: vmovdqa 384(%rdi), %ymm4
-; AVX512-NEXT: vpmovqb %ymm4, %xmm4
-; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
-; AVX512-NEXT: vmovdqa 368(%rdi), %xmm4
-; AVX512-NEXT: vpshufb %xmm0, %xmm4, %xmm3
-; AVX512-NEXT: vmovdqa64 %xmm4, %xmm18
-; AVX512-NEXT: vmovdqa 352(%rdi), %xmm14
-; AVX512-NEXT: vpshufb %xmm0, %xmm14, %xmm4
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512-NEXT: vmovdqa 336(%rdi), %xmm4
-; AVX512-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpshufb %xmm1, %xmm4, %xmm4
-; AVX512-NEXT: vmovdqa 320(%rdi), %xmm6
-; AVX512-NEXT: vpshufb %xmm1, %xmm6, %xmm5
-; AVX512-NEXT: vmovdqa64 %xmm6, %xmm19
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3]
-; AVX512-NEXT: vpmovqb %zmm17, %xmm4
-; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
-; AVX512-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm3
+; AVX512-NEXT: vpshufb %xmm1, %xmm10, %xmm2
+; AVX512-NEXT: vmovdqa 448(%rdi), %xmm11
+; AVX512-NEXT: vpshufb %xmm1, %xmm11, %xmm6
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
+; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm6
+; AVX512-NEXT: vmovdqa 480(%rdi), %xmm8
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
+; AVX512-NEXT: vpshufb %xmm2, %xmm8, %xmm7
+; AVX512-NEXT: vmovdqa64 %xmm8, %xmm19
+; AVX512-NEXT: vpmovqb %zmm5, %xmm5
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3]
+; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7]
+; AVX512-NEXT: vmovdqa 384(%rdi), %ymm6
+; AVX512-NEXT: vpmovqb %ymm6, %xmm6
+; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512-NEXT: vmovdqa 416(%rdi), %ymm7
+; AVX512-NEXT: vpmovqb %ymm7, %xmm8
+; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,0,1]
+; AVX512-NEXT: vpmovqb %ymm7, %xmm7
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
+; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
+; AVX512-NEXT: vmovdqa 336(%rdi), %xmm7
+; AVX512-NEXT: vpshufb %xmm1, %xmm7, %xmm6
+; AVX512-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vmovdqa64 %xmm7, %xmm17
+; AVX512-NEXT: vmovdqa 320(%rdi), %xmm8
+; AVX512-NEXT: vpshufb %xmm1, %xmm8, %xmm7
+; AVX512-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vmovdqa64 %xmm8, %xmm22
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
+; AVX512-NEXT: vmovdqa 352(%rdi), %xmm13
+; AVX512-NEXT: vpshufb %xmm2, %xmm13, %xmm7
+; AVX512-NEXT: vpmovqb %zmm4, %xmm4
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3]
+; AVX512-NEXT: vpmovqb %zmm30, %xmm6
+; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
+; AVX512-NEXT: vinserti32x4 $2, %xmm4, %zmm0, %zmm4
; AVX512-NEXT: movb $-64, %al
; AVX512-NEXT: kmovw %eax, %k1
-; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm3 {%k1}
-; AVX512-NEXT: vmovdqa 240(%rdi), %xmm4
-; AVX512-NEXT: vpshufb %xmm0, %xmm4, %xmm2
-; AVX512-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vmovdqa64 %xmm4, %xmm25
-; AVX512-NEXT: vmovdqa 224(%rdi), %xmm4
-; AVX512-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpshufb %xmm0, %xmm4, %xmm4
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
-; AVX512-NEXT: vmovdqa 208(%rdi), %xmm5
-; AVX512-NEXT: vpshufb %xmm1, %xmm5, %xmm4
-; AVX512-NEXT: vmovdqa64 %xmm5, %xmm29
-; AVX512-NEXT: vmovdqa 192(%rdi), %xmm10
-; AVX512-NEXT: vpshufb %xmm1, %xmm10, %xmm5
-; AVX512-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm5
-; AVX512-NEXT: vpmovqb %zmm5, %xmm5
-; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7]
-; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm4
+; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm4 {%k1}
+; AVX512-NEXT: vmovdqa 208(%rdi), %xmm15
+; AVX512-NEXT: vpshufb %xmm1, %xmm15, %xmm5
+; AVX512-NEXT: vmovdqa 192(%rdi), %xmm6
+; AVX512-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vpshufb %xmm1, %xmm6, %xmm6
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512-NEXT: vmovdqa 224(%rdi), %xmm12
+; AVX512-NEXT: vpshufb %xmm2, %xmm12, %xmm6
+; AVX512-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vpmovqb %zmm3, %xmm3
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
+; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,6],ymm3[7]
; AVX512-NEXT: vmovdqa 128(%rdi), %ymm5
; AVX512-NEXT: vpmovqb %ymm5, %xmm5
; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
-; AVX512-NEXT: vmovdqa 112(%rdi), %xmm4
-; AVX512-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpshufb %xmm0, %xmm4, %xmm4
-; AVX512-NEXT: vmovdqa 96(%rdi), %xmm5
-; AVX512-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpshufb %xmm0, %xmm5, %xmm0
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; AVX512-NEXT: vmovdqa 160(%rdi), %ymm6
+; AVX512-NEXT: vpmovqb %ymm6, %xmm7
+; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,0,1]
+; AVX512-NEXT: vpmovqb %ymm6, %xmm6
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
+; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5],ymm5[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7]
; AVX512-NEXT: vmovdqa 80(%rdi), %xmm5
-; AVX512-NEXT: vpshufb %xmm1, %xmm5, %xmm4
-; AVX512-NEXT: vmovdqa64 %xmm5, %xmm30
-; AVX512-NEXT: vmovdqa 64(%rdi), %xmm5
-; AVX512-NEXT: vpshufb %xmm1, %xmm5, %xmm1
-; AVX512-NEXT: vmovdqa64 %xmm5, %xmm28
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; AVX512-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vpshufb %xmm1, %xmm5, %xmm5
+; AVX512-NEXT: vmovdqa 64(%rdi), %xmm6
+; AVX512-NEXT: vpshufb %xmm1, %xmm6, %xmm1
+; AVX512-NEXT: vmovdqa64 %xmm6, %xmm20
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; AVX512-NEXT: vmovdqa 96(%rdi), %xmm5
+; AVX512-NEXT: vpshufb %xmm2, %xmm5, %xmm2
+; AVX512-NEXT: vmovdqa64 %xmm5, %xmm21
+; AVX512-NEXT: vpmovqb %zmm0, %xmm0
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX512-NEXT: vmovdqa64 (%rdi), %zmm16
; AVX512-NEXT: vpmovqb %zmm16, %xmm1
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm9 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
-; AVX512-NEXT: vpshufb %xmm9, %xmm7, %xmm0
-; AVX512-NEXT: vmovdqa64 %xmm7, %xmm24
-; AVX512-NEXT: vpshufb %xmm9, %xmm8, %xmm2
-; AVX512-NEXT: vmovdqa64 %xmm8, %xmm21
+; AVX512-NEXT: vmovdqa 496(%rdi), %xmm1
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm8 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
+; AVX512-NEXT: vpshufb %xmm8, %xmm1, %xmm0
+; AVX512-NEXT: vmovdqa64 %xmm1, %xmm25
+; AVX512-NEXT: vmovdqa64 %xmm19, %xmm9
+; AVX512-NEXT: vpshufb %xmm8, %xmm9, %xmm2
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm8 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
-; AVX512-NEXT: vpshufb %xmm8, %xmm11, %xmm3
-; AVX512-NEXT: vmovdqa64 %xmm11, %xmm27
-; AVX512-NEXT: vpshufb %xmm8, %xmm15, %xmm4
+; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
+; AVX512-NEXT: vpshufb %xmm1, %xmm10, %xmm3
+; AVX512-NEXT: vmovdqa64 %xmm10, %xmm27
+; AVX512-NEXT: vpshufb %xmm1, %xmm11, %xmm4
+; AVX512-NEXT: vmovdqa %xmm11, %xmm10
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512-NEXT: vmovdqa 416(%rdi), %xmm0
-; AVX512-NEXT: vmovdqa 432(%rdi), %xmm11
+; AVX512-NEXT: vmovdqa 416(%rdi), %xmm14
+; AVX512-NEXT: vmovdqa 432(%rdi), %xmm5
; AVX512-NEXT: vmovd {{.*#+}} xmm3 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-NEXT: vpshufb %xmm3, %xmm11, %xmm2
-; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm5
-; AVX512-NEXT: vmovdqa64 %xmm0, %xmm22
+; AVX512-NEXT: vpshufb %xmm3, %xmm5, %xmm2
+; AVX512-NEXT: vmovdqa64 %xmm5, %xmm23
+; AVX512-NEXT: vpshufb %xmm3, %xmm14, %xmm5
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
; AVX512-NEXT: vmovdqa 384(%rdi), %xmm0
-; AVX512-NEXT: vmovdqa 400(%rdi), %xmm12
+; AVX512-NEXT: vmovdqa 400(%rdi), %xmm11
; AVX512-NEXT: vmovd {{.*#+}} xmm2 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-NEXT: vpshufb %xmm2, %xmm12, %xmm6
+; AVX512-NEXT: vpshufb %xmm2, %xmm11, %xmm6
; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm7
-; AVX512-NEXT: vmovdqa64 %xmm0, %xmm31
+; AVX512-NEXT: vmovdqa64 %xmm0, %xmm26
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
-; AVX512-NEXT: vmovdqa64 %xmm18, %xmm0
-; AVX512-NEXT: vpshufb %xmm9, %xmm0, %xmm5
-; AVX512-NEXT: vmovdqa64 %xmm18, %xmm23
-; AVX512-NEXT: vpshufb %xmm9, %xmm14, %xmm6
-; AVX512-NEXT: vmovdqa64 %xmm14, %xmm26
+; AVX512-NEXT: vmovdqa 368(%rdi), %xmm0
+; AVX512-NEXT: vpshufb %xmm8, %xmm0, %xmm5
+; AVX512-NEXT: vmovdqa64 %xmm0, %xmm24
+; AVX512-NEXT: vpshufb %xmm8, %xmm13, %xmm6
+; AVX512-NEXT: vmovdqa64 %xmm13, %xmm28
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-NEXT: vpshufb %xmm8, %xmm0, %xmm6
-; AVX512-NEXT: vmovdqa64 %xmm19, %xmm14
-; AVX512-NEXT: vpshufb %xmm8, %xmm14, %xmm7
+; AVX512-NEXT: vmovdqa64 %xmm17, %xmm0
+; AVX512-NEXT: vpshufb %xmm1, %xmm0, %xmm6
+; AVX512-NEXT: vmovdqa64 %xmm22, %xmm0
+; AVX512-NEXT: vpshufb %xmm1, %xmm0, %xmm7
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3]
-; AVX512-NEXT: vpsrlq $8, %zmm17, %zmm6
+; AVX512-NEXT: vpsrlq $8, %zmm30, %zmm6
; AVX512-NEXT: vpmovqb %zmm6, %xmm6
; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
; AVX512-NEXT: vinserti32x4 $2, %xmm5, %zmm0, %zmm19
; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm19 {%k1}
-; AVX512-NEXT: vmovdqa64 %xmm25, %xmm0
-; AVX512-NEXT: vpshufb %xmm9, %xmm0, %xmm4
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-NEXT: vpshufb %xmm9, %xmm0, %xmm5
+; AVX512-NEXT: vmovdqa 240(%rdi), %xmm13
+; AVX512-NEXT: vpshufb %xmm8, %xmm13, %xmm4
+; AVX512-NEXT: vpshufb %xmm8, %xmm12, %xmm5
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512-NEXT: vmovdqa64 %xmm29, %xmm0
-; AVX512-NEXT: vpshufb %xmm8, %xmm0, %xmm5
-; AVX512-NEXT: vpshufb %xmm8, %xmm10, %xmm6
+; AVX512-NEXT: vpshufb %xmm1, %xmm15, %xmm5
+; AVX512-NEXT: vmovdqa64 %xmm15, %xmm31
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-NEXT: vpshufb %xmm1, %xmm0, %xmm6
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm4[7]
-; AVX512-NEXT: vmovdqa 160(%rdi), %xmm0
+; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7]
+; AVX512-NEXT: vmovdqa 160(%rdi), %xmm12
; AVX512-NEXT: vmovdqa 176(%rdi), %xmm5
-; AVX512-NEXT: vpshufb %xmm3, %xmm5, %xmm4
-; AVX512-NEXT: vmovdqa64 %xmm5, %xmm25
-; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm3
-; AVX512-NEXT: vmovdqa64 %xmm0, %xmm20
-; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX512-NEXT: vmovdqa 128(%rdi), %xmm4
-; AVX512-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vpshufb %xmm3, %xmm5, %xmm5
+; AVX512-NEXT: vpshufb %xmm3, %xmm12, %xmm3
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; AVX512-NEXT: vmovdqa 128(%rdi), %xmm5
; AVX512-NEXT: vmovdqa 144(%rdi), %xmm0
; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpshufb %xmm2, %xmm4, %xmm2
+; AVX512-NEXT: vpshufb %xmm2, %xmm5, %xmm2
+; AVX512-NEXT: vmovdqa64 %xmm5, %xmm29
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm2
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-NEXT: vpshufb %xmm9, %xmm1, %xmm1
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX512-NEXT: vpshufb %xmm9, %xmm2, %xmm2
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512-NEXT: vmovdqa64 %xmm30, %xmm7
-; AVX512-NEXT: vpshufb %xmm8, %xmm7, %xmm2
-; AVX512-NEXT: vmovdqa64 %xmm28, %xmm3
-; AVX512-NEXT: vpshufb %xmm8, %xmm3, %xmm3
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
+; AVX512-NEXT: vmovdqa 112(%rdi), %xmm2
+; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vpshufb %xmm8, %xmm2, %xmm2
+; AVX512-NEXT: vmovdqa64 %xmm21, %xmm7
+; AVX512-NEXT: vpshufb %xmm8, %xmm7, %xmm3
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX512-NEXT: vpshufb %xmm1, %xmm3, %xmm3
+; AVX512-NEXT: vmovdqa64 %xmm20, %xmm6
+; AVX512-NEXT: vpshufb %xmm1, %xmm6, %xmm1
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3]
; AVX512-NEXT: vpsrlq $8, %zmm16, %zmm2
+; AVX512-NEXT: vmovdqa64 %zmm16, %zmm22
; AVX512-NEXT: vpmovqb %zmm2, %xmm2
; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
-; AVX512-NEXT: vmovdqa64 %xmm24, %xmm13
-; AVX512-NEXT: vmovdqa64 %xmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpshufb %xmm4, %xmm13, %xmm1
-; AVX512-NEXT: vmovdqa64 %xmm21, %xmm10
-; AVX512-NEXT: vmovdqa64 %xmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpshufb %xmm4, %xmm10, %xmm2
+; AVX512-NEXT: vmovdqa64 %xmm25, %xmm5
+; AVX512-NEXT: vpshufb %xmm4, %xmm5, %xmm1
+; AVX512-NEXT: vpshufb %xmm4, %xmm9, %xmm2
+; AVX512-NEXT: vmovdqa64 %xmm9, %xmm25
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
-; AVX512-NEXT: vmovdqa64 %xmm27, %xmm0
-; AVX512-NEXT: vpshufb %xmm1, %xmm0, %xmm3
-; AVX512-NEXT: vpshufb %xmm1, %xmm15, %xmm8
-; AVX512-NEXT: vmovdqa64 %xmm15, %xmm18
+; AVX512-NEXT: vmovdqa64 %xmm27, %xmm9
+; AVX512-NEXT: vpshufb %xmm1, %xmm9, %xmm3
+; AVX512-NEXT: vpshufb %xmm1, %xmm10, %xmm8
+; AVX512-NEXT: vmovdqa64 %xmm10, %xmm19
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1,2,3,4,5,6],ymm2[7]
; AVX512-NEXT: vmovd {{.*#+}} xmm3 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-NEXT: vpshufb %xmm3, %xmm11, %xmm2
-; AVX512-NEXT: vmovdqa64 %xmm11, %xmm24
-; AVX512-NEXT: vmovdqa64 %xmm22, %xmm11
-; AVX512-NEXT: vpshufb %xmm3, %xmm11, %xmm9
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3]
+; AVX512-NEXT: vmovdqa64 %xmm23, %xmm10
+; AVX512-NEXT: vmovdqa64 %xmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vpshufb %xmm3, %xmm10, %xmm2
+; AVX512-NEXT: vpshufb %xmm3, %xmm14, %xmm15
+; AVX512-NEXT: vmovdqa64 %xmm14, %xmm20
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1],xmm15[2],xmm2[2],xmm15[3],xmm2[3]
; AVX512-NEXT: vmovd {{.*#+}} xmm2 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-NEXT: vpshufb %xmm2, %xmm12, %xmm0
-; AVX512-NEXT: vmovdqa64 %xmm12, %xmm30
-; AVX512-NEXT: vmovdqa64 %xmm31, %xmm5
-; AVX512-NEXT: vpshufb %xmm2, %xmm5, %xmm15
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
-; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512-NEXT: vpshufb %xmm2, %xmm11, %xmm0
+; AVX512-NEXT: vmovdqa64 %xmm11, %xmm17
+; AVX512-NEXT: vmovdqa64 %xmm26, %xmm11
+; AVX512-NEXT: vpshufb %xmm2, %xmm11, %xmm14
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3]
+; AVX512-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm14
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm9[5],ymm0[6,7]
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm8[6,7]
-; AVX512-NEXT: vmovdqa64 %xmm23, %xmm12
-; AVX512-NEXT: vmovdqa64 %xmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpshufb %xmm4, %xmm12, %xmm8
-; AVX512-NEXT: vmovdqa64 %xmm26, %xmm5
-; AVX512-NEXT: vpshufb %xmm4, %xmm5, %xmm9
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX512-NEXT: vpshufb %xmm1, %xmm5, %xmm9
-; AVX512-NEXT: vpshufb %xmm1, %xmm14, %xmm15
-; AVX512-NEXT: vmovdqa64 %xmm14, %xmm21
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
-; AVX512-NEXT: vpsrlq $16, %zmm17, %zmm9
-; AVX512-NEXT: vpmovqb %zmm9, %xmm9
-; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
-; AVX512-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm8
-; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm8 {%k1}
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa64 %xmm24, %xmm11
+; AVX512-NEXT: vpshufb %xmm4, %xmm11, %xmm8
+; AVX512-NEXT: vmovdqa64 %xmm28, %xmm14
+; AVX512-NEXT: vpshufb %xmm4, %xmm14, %xmm14
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm14[0],xmm8[0],xmm14[1],xmm8[1],xmm14[2],xmm8[2],xmm14[3],xmm8[3]
; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; AVX512-NEXT: vpshufb %xmm4, %xmm14, %xmm9
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3]
-; AVX512-NEXT: vmovdqa64 %xmm29, %xmm6
-; AVX512-NEXT: vpshufb %xmm1, %xmm6, %xmm9
+; AVX512-NEXT: vpshufb %xmm1, %xmm14, %xmm14
; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX512-NEXT: vpshufb %xmm1, %xmm15, %xmm15
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm14[0,1,2],xmm8[3]
+; AVX512-NEXT: vmovdqa64 %zmm30, %zmm23
+; AVX512-NEXT: vpsrlq $16, %zmm30, %zmm14
+; AVX512-NEXT: vpmovqb %zmm14, %xmm14
+; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3]
+; AVX512-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm8
+; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm8 {%k1}
+; AVX512-NEXT: vpshufb %xmm4, %xmm13, %xmm0
+; AVX512-NEXT: vmovdqa64 %xmm13, %xmm30
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX512-NEXT: vpshufb %xmm4, %xmm13, %xmm14
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3]
+; AVX512-NEXT: vmovdqa64 %xmm31, %xmm13
+; AVX512-NEXT: vpshufb %xmm1, %xmm13, %xmm14
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX512-NEXT: vpshufb %xmm1, %xmm13, %xmm15
+; AVX512-NEXT: vmovdqa64 %xmm13, %xmm16
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5,6],ymm0[7]
-; AVX512-NEXT: vmovdqa64 %xmm25, %xmm9
-; AVX512-NEXT: vpshufb %xmm3, %xmm9, %xmm9
-; AVX512-NEXT: vmovdqa64 %xmm20, %xmm15
-; AVX512-NEXT: vpshufb %xmm3, %xmm15, %xmm3
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3]
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX512-NEXT: vpshufb %xmm2, %xmm9, %xmm9
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; AVX512-NEXT: vpshufb %xmm2, %xmm15, %xmm2
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3]
+; AVX512-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,6],ymm0[7]
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX512-NEXT: vpshufb %xmm3, %xmm13, %xmm14
+; AVX512-NEXT: vpshufb %xmm3, %xmm12, %xmm3
+; AVX512-NEXT: vmovdqa64 %xmm12, %xmm24
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3]
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; AVX512-NEXT: vpshufb %xmm2, %xmm12, %xmm14
+; AVX512-NEXT: vmovdqa64 %xmm29, %xmm12
+; AVX512-NEXT: vpshufb %xmm2, %xmm12, %xmm2
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX512-NEXT: vpshufb %xmm4, %xmm7, %xmm3
+; AVX512-NEXT: vmovdqa64 %xmm21, %xmm27
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX512-NEXT: vpshufb %xmm1, %xmm7, %xmm3
-; AVX512-NEXT: vmovdqa64 %xmm7, %xmm29
-; AVX512-NEXT: vmovdqa64 %xmm28, %xmm7
-; AVX512-NEXT: vmovdqa64 %xmm28, (%rsp) # 16-byte Spill
-; AVX512-NEXT: vpshufb %xmm1, %xmm7, %xmm1
+; AVX512-NEXT: vpshufb %xmm1, %xmm6, %xmm1
+; AVX512-NEXT: vmovdqa64 %xmm6, %xmm21
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3]
-; AVX512-NEXT: vpsrlq $16, %zmm16, %zmm2
+; AVX512-NEXT: vpsrlq $16, %zmm22, %zmm2
; AVX512-NEXT: vpmovqb %zmm2, %xmm2
; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
-; AVX512-NEXT: vpshufb %xmm0, %xmm13, %xmm1
-; AVX512-NEXT: vpshufb %xmm0, %xmm10, %xmm2
+; AVX512-NEXT: vpshufb %xmm0, %xmm5, %xmm1
+; AVX512-NEXT: vmovdqa64 %xmm5, %xmm18
+; AVX512-NEXT: vmovdqa64 %xmm25, %xmm6
+; AVX512-NEXT: vmovdqa64 %xmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vpshufb %xmm0, %xmm6, %xmm2
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
-; AVX512-NEXT: vmovdqa64 %xmm27, %xmm10
-; AVX512-NEXT: vmovdqa64 %xmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpshufb %xmm1, %xmm10, %xmm3
-; AVX512-NEXT: vmovdqa64 %xmm18, %xmm4
-; AVX512-NEXT: vpshufb %xmm1, %xmm4, %xmm4
+; AVX512-NEXT: vpshufb %xmm1, %xmm9, %xmm3
+; AVX512-NEXT: vmovdqa64 %xmm9, %xmm25
+; AVX512-NEXT: vmovdqa64 %xmm19, %xmm9
+; AVX512-NEXT: vpshufb %xmm1, %xmm9, %xmm4
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
; AVX512-NEXT: vmovd {{.*#+}} xmm2 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-NEXT: vmovdqa64 %xmm24, %xmm3
-; AVX512-NEXT: vpshufb %xmm2, %xmm3, %xmm3
-; AVX512-NEXT: vpshufb %xmm2, %xmm11, %xmm8
-; AVX512-NEXT: vmovdqa64 %xmm22, %xmm28
+; AVX512-NEXT: vpshufb %xmm2, %xmm10, %xmm3
+; AVX512-NEXT: vmovdqa64 %xmm20, %xmm5
+; AVX512-NEXT: vpshufb %xmm2, %xmm5, %xmm8
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
; AVX512-NEXT: vmovd {{.*#+}} xmm3 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-NEXT: vmovdqa64 %xmm30, %xmm9
-; AVX512-NEXT: vpshufb %xmm3, %xmm9, %xmm9
-; AVX512-NEXT: vmovdqa64 %xmm31, %xmm11
-; AVX512-NEXT: vpshufb %xmm3, %xmm11, %xmm15
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
+; AVX512-NEXT: vmovdqa64 %xmm17, %xmm13
+; AVX512-NEXT: vmovdqa64 %xmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vpshufb %xmm3, %xmm13, %xmm14
+; AVX512-NEXT: vmovdqa64 %xmm26, %xmm12
+; AVX512-NEXT: vmovdqa64 %xmm26, (%rsp) # 16-byte Spill
+; AVX512-NEXT: vpshufb %xmm3, %xmm12, %xmm15
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7]
+; AVX512-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3,4],ymm8[5],ymm14[6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7]
-; AVX512-NEXT: vpshufb %xmm0, %xmm12, %xmm8
-; AVX512-NEXT: vmovdqa64 %xmm26, %xmm13
-; AVX512-NEXT: vmovdqa64 %xmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpshufb %xmm0, %xmm13, %xmm9
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
-; AVX512-NEXT: vpshufb %xmm1, %xmm5, %xmm9
-; AVX512-NEXT: vmovdqa64 %xmm21, %xmm5
+; AVX512-NEXT: vpshufb %xmm0, %xmm11, %xmm8
+; AVX512-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vmovdqa64 %xmm28, %xmm10
+; AVX512-NEXT: vpshufb %xmm0, %xmm10, %xmm14
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm14[0],xmm8[0],xmm14[1],xmm8[1],xmm14[2],xmm8[2],xmm14[3],xmm8[3]
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX512-NEXT: vpshufb %xmm1, %xmm5, %xmm14
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX512-NEXT: vpshufb %xmm1, %xmm5, %xmm15
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
-; AVX512-NEXT: vpsrlq $24, %zmm17, %zmm9
-; AVX512-NEXT: vpmovqb %zmm9, %xmm9
-; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm14[0,1,2],xmm8[3]
+; AVX512-NEXT: vpsrlq $24, %zmm23, %zmm14
+; AVX512-NEXT: vpmovqb %zmm14, %xmm14
+; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3]
; AVX512-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm8
; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm8 {%k1}
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX512-NEXT: vmovdqa64 %xmm30, %xmm4
; AVX512-NEXT: vpshufb %xmm0, %xmm4, %xmm4
-; AVX512-NEXT: vpshufb %xmm0, %xmm14, %xmm9
-; AVX512-NEXT: vmovdqa64 %xmm14, %xmm22
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
-; AVX512-NEXT: vpshufb %xmm1, %xmm6, %xmm9
-; AVX512-NEXT: vmovdqa64 %xmm6, %xmm26
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512-NEXT: vpshufb %xmm1, %xmm6, %xmm15
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX512-NEXT: vpshufb %xmm0, %xmm5, %xmm14
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3]
+; AVX512-NEXT: vmovdqa64 %xmm31, %xmm5
+; AVX512-NEXT: vpshufb %xmm1, %xmm5, %xmm14
+; AVX512-NEXT: vmovdqa64 %xmm16, %xmm5
+; AVX512-NEXT: vpshufb %xmm1, %xmm5, %xmm15
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,6],ymm4[7]
-; AVX512-NEXT: vmovdqa64 %xmm25, %xmm14
-; AVX512-NEXT: vpshufb %xmm2, %xmm14, %xmm9
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512-NEXT: vpshufb %xmm2, %xmm6, %xmm2
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3]
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512-NEXT: vpshufb %xmm3, %xmm6, %xmm9
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512-NEXT: vpshufb %xmm3, %xmm6, %xmm3
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3]
+; AVX512-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5,6],ymm4[7]
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX512-NEXT: vpshufb %xmm2, %xmm5, %xmm14
+; AVX512-NEXT: vmovdqa64 %xmm24, %xmm5
+; AVX512-NEXT: vpshufb %xmm2, %xmm5, %xmm2
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX512-NEXT: vpshufb %xmm3, %xmm5, %xmm14
+; AVX512-NEXT: vmovdqa64 %xmm29, %xmm5
+; AVX512-NEXT: vpshufb %xmm3, %xmm5, %xmm3
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3]
; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX512-NEXT: vpshufb %xmm0, %xmm3, %xmm3
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512-NEXT: vpshufb %xmm0, %xmm6, %xmm0
+; AVX512-NEXT: vmovdqa64 %xmm27, %xmm4
+; AVX512-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; AVX512-NEXT: vmovdqa64 %xmm29, %xmm3
-; AVX512-NEXT: vpshufb %xmm1, %xmm3, %xmm3
+; AVX512-NEXT: vpshufb %xmm1, %xmm7, %xmm3
+; AVX512-NEXT: vmovdqa64 %xmm21, %xmm7
; AVX512-NEXT: vpshufb %xmm1, %xmm7, %xmm1
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX512-NEXT: vpsrlq $24, %zmm16, %zmm1
+; AVX512-NEXT: vpsrlq $24, %zmm22, %zmm1
; AVX512-NEXT: vpmovqb %zmm1, %xmm1
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-NEXT: vmovdqa64 %xmm18, %xmm1
; AVX512-NEXT: vpshufb %xmm0, %xmm1, %xmm1
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX512-NEXT: vpshufb %xmm0, %xmm2, %xmm2
+; AVX512-NEXT: vpshufb %xmm0, %xmm6, %xmm2
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
-; AVX512-NEXT: vpshufb %xmm1, %xmm10, %xmm3
-; AVX512-NEXT: vmovdqa64 %xmm18, %xmm10
-; AVX512-NEXT: vmovdqa64 %xmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpshufb %xmm1, %xmm10, %xmm4
+; AVX512-NEXT: vmovdqa64 %xmm25, %xmm3
+; AVX512-NEXT: vpshufb %xmm1, %xmm3, %xmm3
+; AVX512-NEXT: vpshufb %xmm1, %xmm9, %xmm4
+; AVX512-NEXT: vmovdqa64 %xmm19, %xmm28
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
; AVX512-NEXT: vmovd {{.*#+}} xmm2 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-NEXT: vmovdqa64 %xmm24, %xmm11
-; AVX512-NEXT: vpshufb %xmm2, %xmm11, %xmm3
-; AVX512-NEXT: vmovdqa64 %xmm28, %xmm7
-; AVX512-NEXT: vpshufb %xmm2, %xmm7, %xmm8
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX512-NEXT: vpshufb %xmm2, %xmm3, %xmm3
+; AVX512-NEXT: vmovdqa64 %xmm20, %xmm9
+; AVX512-NEXT: vpshufb %xmm2, %xmm9, %xmm8
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
; AVX512-NEXT: vmovd {{.*#+}} xmm3 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-NEXT: vmovdqa64 %xmm30, %xmm7
-; AVX512-NEXT: vpshufb %xmm3, %xmm7, %xmm9
-; AVX512-NEXT: vmovdqa64 %xmm31, %xmm12
+; AVX512-NEXT: vpshufb %xmm3, %xmm13, %xmm14
; AVX512-NEXT: vpshufb %xmm3, %xmm12, %xmm15
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7]
+; AVX512-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3,4],ymm8[5],ymm14[6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7]
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX512-NEXT: vpshufb %xmm0, %xmm7, %xmm8
-; AVX512-NEXT: vpshufb %xmm0, %xmm13, %xmm9
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX512-NEXT: vpshufb %xmm1, %xmm7, %xmm9
-; AVX512-NEXT: vpshufb %xmm1, %xmm5, %xmm15
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
-; AVX512-NEXT: vpsrlq $32, %zmm17, %zmm9
-; AVX512-NEXT: vpmovqb %zmm9, %xmm9
-; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
+; AVX512-NEXT: vpshufb %xmm0, %xmm11, %xmm8
+; AVX512-NEXT: vmovdqa %xmm10, %xmm5
+; AVX512-NEXT: vpshufb %xmm0, %xmm10, %xmm14
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm14[0],xmm8[0],xmm14[1],xmm8[1],xmm14[2],xmm8[2],xmm14[3],xmm8[3]
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX512-NEXT: vpshufb %xmm1, %xmm10, %xmm14
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; AVX512-NEXT: vpshufb %xmm1, %xmm11, %xmm15
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm14[0,1,2],xmm8[3]
+; AVX512-NEXT: vpsrlq $32, %zmm23, %zmm14
+; AVX512-NEXT: vpmovqb %zmm14, %xmm14
+; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3]
; AVX512-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm8
; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm8 {%k1}
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX512-NEXT: vpshufb %xmm0, %xmm13, %xmm4
-; AVX512-NEXT: vmovdqa64 %xmm22, %xmm5
-; AVX512-NEXT: vpshufb %xmm0, %xmm5, %xmm9
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
-; AVX512-NEXT: vmovdqa64 %xmm26, %xmm5
-; AVX512-NEXT: vpshufb %xmm1, %xmm5, %xmm9
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX512-NEXT: vpshufb %xmm1, %xmm5, %xmm15
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
+; AVX512-NEXT: vmovdqa64 %xmm30, %xmm4
+; AVX512-NEXT: vpshufb %xmm0, %xmm4, %xmm4
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX512-NEXT: vpshufb %xmm0, %xmm6, %xmm14
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3]
+; AVX512-NEXT: vmovdqa64 %xmm31, %xmm12
+; AVX512-NEXT: vpshufb %xmm1, %xmm12, %xmm14
+; AVX512-NEXT: vmovdqa64 %xmm16, %xmm13
+; AVX512-NEXT: vpshufb %xmm1, %xmm13, %xmm15
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,6],ymm4[7]
-; AVX512-NEXT: vpshufb %xmm2, %xmm14, %xmm9
-; AVX512-NEXT: vmovdqa64 %xmm25, %xmm20
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX512-NEXT: vpshufb %xmm2, %xmm5, %xmm2
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3]
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX512-NEXT: vpshufb %xmm3, %xmm5, %xmm9
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; AVX512-NEXT: vpshufb %xmm3, %xmm14, %xmm3
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3]
+; AVX512-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5,6],ymm4[7]
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX512-NEXT: vpshufb %xmm2, %xmm13, %xmm14
+; AVX512-NEXT: vmovdqa64 %xmm24, %xmm13
+; AVX512-NEXT: vpshufb %xmm2, %xmm13, %xmm2
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX512-NEXT: vpshufb %xmm3, %xmm13, %xmm14
+; AVX512-NEXT: vmovdqa64 %xmm29, %xmm13
+; AVX512-NEXT: vpshufb %xmm3, %xmm13, %xmm3
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3]
; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX512-NEXT: vpshufb %xmm0, %xmm7, %xmm3
-; AVX512-NEXT: vpshufb %xmm0, %xmm6, %xmm0
-; AVX512-NEXT: vmovdqa64 %xmm6, %xmm25
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX512-NEXT: vpshufb %xmm0, %xmm3, %xmm3
+; AVX512-NEXT: vmovdqa64 %xmm27, %xmm4
+; AVX512-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; AVX512-NEXT: vmovdqa64 %xmm29, %xmm3
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX512-NEXT: vpshufb %xmm1, %xmm3, %xmm3
-; AVX512-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload
-; AVX512-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX512-NEXT: vpshufb %xmm1, %xmm7, %xmm1
+; AVX512-NEXT: vmovdqa64 %xmm21, %xmm17
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX512-NEXT: vpsrlq $32, %zmm16, %zmm1
-; AVX512-NEXT: vmovdqa64 %zmm16, %zmm18
+; AVX512-NEXT: vpsrlq $32, %zmm22, %zmm1
; AVX512-NEXT: vpmovqb %zmm1, %xmm1
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-NEXT: vpshufb %xmm0, %xmm1, %xmm1
+; AVX512-NEXT: vmovdqa64 %xmm18, %xmm7
+; AVX512-NEXT: vpshufb %xmm0, %xmm7, %xmm1
; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX512-NEXT: vpshufb %xmm0, %xmm2, %xmm2
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX512-NEXT: vmovdqa64 %xmm25, %xmm3
; AVX512-NEXT: vpshufb %xmm1, %xmm3, %xmm3
-; AVX512-NEXT: vpshufb %xmm1, %xmm10, %xmm4
+; AVX512-NEXT: vmovdqa64 %xmm19, %xmm4
+; AVX512-NEXT: vpshufb %xmm1, %xmm4, %xmm4
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
; AVX512-NEXT: vmovd {{.*#+}} xmm2 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-NEXT: vpshufb %xmm2, %xmm11, %xmm3
-; AVX512-NEXT: vmovdqa64 %xmm24, %xmm27
-; AVX512-NEXT: vmovdqa64 %xmm28, %xmm5
-; AVX512-NEXT: vpshufb %xmm2, %xmm5, %xmm8
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX512-NEXT: vpshufb %xmm2, %xmm3, %xmm3
+; AVX512-NEXT: vpshufb %xmm2, %xmm9, %xmm8
+; AVX512-NEXT: vmovdqa64 %xmm20, %xmm31
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
; AVX512-NEXT: vmovd {{.*#+}} xmm3 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-NEXT: vmovdqa64 %xmm30, %xmm10
-; AVX512-NEXT: vpshufb %xmm3, %xmm10, %xmm9
-; AVX512-NEXT: vpshufb %xmm3, %xmm12, %xmm15
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; AVX512-NEXT: vpshufb %xmm3, %xmm9, %xmm14
+; AVX512-NEXT: vmovdqa (%rsp), %xmm9 # 16-byte Reload
+; AVX512-NEXT: vpshufb %xmm3, %xmm9, %xmm15
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7]
+; AVX512-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3,4],ymm8[5],ymm14[6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7]
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512-NEXT: vpshufb %xmm0, %xmm6, %xmm8
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512-NEXT: vpshufb %xmm0, %xmm6, %xmm9
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512-NEXT: vpshufb %xmm1, %xmm6, %xmm9
-; AVX512-NEXT: vmovdqa64 %xmm21, %xmm11
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX512-NEXT: vpshufb %xmm0, %xmm8, %xmm8
+; AVX512-NEXT: vpshufb %xmm0, %xmm5, %xmm14
+; AVX512-NEXT: vmovdqa64 %xmm5, %xmm29
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm14[0],xmm8[0],xmm14[1],xmm8[1],xmm14[2],xmm8[2],xmm14[3],xmm8[3]
+; AVX512-NEXT: vpshufb %xmm1, %xmm10, %xmm14
+; AVX512-NEXT: vmovdqa64 %xmm10, %xmm18
; AVX512-NEXT: vpshufb %xmm1, %xmm11, %xmm15
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
-; AVX512-NEXT: vpsrlq $40, %zmm17, %zmm9
-; AVX512-NEXT: vmovdqa64 %zmm17, %zmm23
-; AVX512-NEXT: vpmovqb %zmm9, %xmm9
-; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
+; AVX512-NEXT: vmovdqa64 %xmm11, %xmm21
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm14[0,1,2],xmm8[3]
+; AVX512-NEXT: vpsrlq $40, %zmm23, %zmm14
+; AVX512-NEXT: vpmovqb %zmm14, %xmm14
+; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3]
; AVX512-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm8
; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm8 {%k1}
-; AVX512-NEXT: vpshufb %xmm0, %xmm13, %xmm4
-; AVX512-NEXT: vmovdqa64 %xmm13, %xmm30
-; AVX512-NEXT: vmovdqa64 %xmm22, %xmm6
-; AVX512-NEXT: vpshufb %xmm0, %xmm6, %xmm9
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
-; AVX512-NEXT: vmovdqa64 %xmm26, %xmm13
-; AVX512-NEXT: vpshufb %xmm1, %xmm13, %xmm9
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512-NEXT: vpshufb %xmm1, %xmm6, %xmm15
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
+; AVX512-NEXT: vmovdqa64 %xmm30, %xmm11
+; AVX512-NEXT: vpshufb %xmm0, %xmm11, %xmm4
+; AVX512-NEXT: vpshufb %xmm0, %xmm6, %xmm14
+; AVX512-NEXT: vmovdqa64 %xmm6, %xmm30
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3]
+; AVX512-NEXT: vpshufb %xmm1, %xmm12, %xmm14
+; AVX512-NEXT: vmovdqa64 %xmm12, %xmm26
+; AVX512-NEXT: vmovdqa64 %xmm16, %xmm5
+; AVX512-NEXT: vpshufb %xmm1, %xmm5, %xmm15
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,6],ymm4[7]
-; AVX512-NEXT: vmovdqa64 %xmm20, %xmm9
-; AVX512-NEXT: vpshufb %xmm2, %xmm9, %xmm9
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; AVX512-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5,6],ymm4[7]
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX512-NEXT: vpshufb %xmm2, %xmm6, %xmm14
+; AVX512-NEXT: vmovdqa64 %xmm24, %xmm12
; AVX512-NEXT: vpshufb %xmm2, %xmm12, %xmm2
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3]
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; AVX512-NEXT: vpshufb %xmm3, %xmm15, %xmm9
-; AVX512-NEXT: vmovdqa64 %xmm15, %xmm28
-; AVX512-NEXT: vpshufb %xmm3, %xmm14, %xmm3
-; AVX512-NEXT: vmovdqa64 %xmm14, %xmm16
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; AVX512-NEXT: vpshufb %xmm3, %xmm9, %xmm14
+; AVX512-NEXT: vpshufb %xmm3, %xmm13, %xmm3
+; AVX512-NEXT: vmovdqa64 %xmm13, %xmm20
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3]
; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
-; AVX512-NEXT: vpshufb %xmm0, %xmm7, %xmm3
-; AVX512-NEXT: vmovdqa64 %xmm7, %xmm24
-; AVX512-NEXT: vmovdqa64 %xmm25, %xmm4
-; AVX512-NEXT: vpshufb %xmm0, %xmm4, %xmm0
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX512-NEXT: vpshufb %xmm0, %xmm13, %xmm3
+; AVX512-NEXT: vmovdqa64 %xmm27, %xmm10
+; AVX512-NEXT: vpshufb %xmm0, %xmm10, %xmm0
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; AVX512-NEXT: vmovdqa64 %xmm29, %xmm7
-; AVX512-NEXT: vpshufb %xmm1, %xmm7, %xmm3
-; AVX512-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX512-NEXT: vpshufb %xmm1, %xmm3, %xmm3
+; AVX512-NEXT: vmovdqa64 %xmm17, %xmm4
; AVX512-NEXT: vpshufb %xmm1, %xmm4, %xmm1
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX512-NEXT: vpsrlq $40, %zmm18, %zmm1
-; AVX512-NEXT: vmovdqa64 %zmm18, %zmm26
+; AVX512-NEXT: vpsrlq $40, %zmm22, %zmm1
; AVX512-NEXT: vpmovqb %zmm1, %xmm1
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0
-; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm27
; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX512-NEXT: vpshufb %xmm0, %xmm2, %xmm1
-; AVX512-NEXT: vmovdqa64 %xmm2, %xmm29
+; AVX512-NEXT: vpshufb %xmm0, %xmm7, %xmm1
+; AVX512-NEXT: vmovdqa64 %xmm7, %xmm19
; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX512-NEXT: vpshufb %xmm0, %xmm2, %xmm2
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX512-NEXT: vpshufb %xmm1, %xmm3, %xmm3
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX512-NEXT: vmovdqa64 %xmm25, %xmm7
+; AVX512-NEXT: vpshufb %xmm1, %xmm7, %xmm3
+; AVX512-NEXT: vmovdqa64 %xmm28, %xmm4
; AVX512-NEXT: vpshufb %xmm1, %xmm4, %xmm4
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
; AVX512-NEXT: vmovd {{.*#+}} xmm2 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-NEXT: vmovdqa64 %xmm27, %xmm3
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX512-NEXT: vpshufb %xmm2, %xmm3, %xmm3
-; AVX512-NEXT: vpshufb %xmm2, %xmm5, %xmm8
-; AVX512-NEXT: vmovdqa64 %xmm5, %xmm21
+; AVX512-NEXT: vmovdqa64 %xmm31, %xmm8
+; AVX512-NEXT: vpshufb %xmm2, %xmm8, %xmm8
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
; AVX512-NEXT: vmovd {{.*#+}} xmm3 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-NEXT: vpshufb %xmm3, %xmm10, %xmm9
-; AVX512-NEXT: vmovdqa64 %xmm10, %xmm18
-; AVX512-NEXT: vmovdqa64 %xmm31, %xmm5
-; AVX512-NEXT: vpshufb %xmm3, %xmm5, %xmm15
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; AVX512-NEXT: vpshufb %xmm3, %xmm14, %xmm14
+; AVX512-NEXT: vmovdqa (%rsp), %xmm15 # 16-byte Reload
+; AVX512-NEXT: vpshufb %xmm3, %xmm15, %xmm15
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7]
+; AVX512-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3,4],ymm8[5],ymm14[6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7]
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX512-NEXT: vpshufb %xmm0, %xmm5, %xmm8
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX512-NEXT: vpshufb %xmm0, %xmm9, %xmm9
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; AVX512-NEXT: vpshufb %xmm1, %xmm10, %xmm9
-; AVX512-NEXT: vpshufb %xmm1, %xmm11, %xmm15
-; AVX512-NEXT: vmovdqa64 %xmm11, %xmm22
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
-; AVX512-NEXT: vpsrlq $48, %zmm17, %zmm9
-; AVX512-NEXT: vpmovqb %zmm9, %xmm9
-; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX512-NEXT: vpshufb %xmm0, %xmm8, %xmm8
+; AVX512-NEXT: vmovdqa64 %xmm29, %xmm14
+; AVX512-NEXT: vpshufb %xmm0, %xmm14, %xmm14
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm14[0],xmm8[0],xmm14[1],xmm8[1],xmm14[2],xmm8[2],xmm14[3],xmm8[3]
+; AVX512-NEXT: vmovdqa64 %xmm18, %xmm14
+; AVX512-NEXT: vpshufb %xmm1, %xmm14, %xmm14
+; AVX512-NEXT: vmovdqa64 %xmm21, %xmm15
+; AVX512-NEXT: vpshufb %xmm1, %xmm15, %xmm15
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm14[0,1,2],xmm8[3]
+; AVX512-NEXT: vpsrlq $48, %zmm23, %zmm14
+; AVX512-NEXT: vpmovqb %zmm14, %xmm14
+; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3]
; AVX512-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm8
; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm8 {%k1}
-; AVX512-NEXT: vmovdqa64 %xmm30, %xmm4
-; AVX512-NEXT: vpshufb %xmm0, %xmm4, %xmm4
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX512-NEXT: vpshufb %xmm0, %xmm9, %xmm9
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
-; AVX512-NEXT: vpshufb %xmm1, %xmm13, %xmm9
-; AVX512-NEXT: vmovdqa64 %xmm13, %xmm19
-; AVX512-NEXT: vpshufb %xmm1, %xmm6, %xmm15
-; AVX512-NEXT: vmovdqa64 %xmm6, %xmm17
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
+; AVX512-NEXT: vpshufb %xmm0, %xmm11, %xmm4
+; AVX512-NEXT: vmovdqa64 %xmm11, %xmm25
+; AVX512-NEXT: vmovdqa64 %xmm30, %xmm11
+; AVX512-NEXT: vpshufb %xmm0, %xmm11, %xmm14
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3]
+; AVX512-NEXT: vmovdqa64 %xmm26, %xmm14
+; AVX512-NEXT: vpshufb %xmm1, %xmm14, %xmm14
+; AVX512-NEXT: vpshufb %xmm1, %xmm5, %xmm15
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,6],ymm4[7]
-; AVX512-NEXT: vmovdqa64 %xmm20, %xmm6
-; AVX512-NEXT: vpshufb %xmm2, %xmm6, %xmm9
+; AVX512-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5,6],ymm4[7]
+; AVX512-NEXT: vpshufb %xmm2, %xmm6, %xmm14
+; AVX512-NEXT: vmovdqa64 %xmm6, %xmm30
; AVX512-NEXT: vpshufb %xmm2, %xmm12, %xmm2
-; AVX512-NEXT: vmovdqa64 %xmm12, %xmm20
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3]
-; AVX512-NEXT: vmovdqa64 %xmm28, %xmm14
-; AVX512-NEXT: vpshufb %xmm3, %xmm14, %xmm9
-; AVX512-NEXT: vmovdqa64 %xmm16, %xmm13
-; AVX512-NEXT: vpshufb %xmm3, %xmm13, %xmm3
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3]
+; AVX512-NEXT: vmovdqa64 %xmm24, %xmm6
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
+; AVX512-NEXT: vpshufb %xmm3, %xmm9, %xmm14
+; AVX512-NEXT: vmovdqa64 %xmm9, %xmm24
+; AVX512-NEXT: vmovdqa64 %xmm20, %xmm5
+; AVX512-NEXT: vpshufb %xmm3, %xmm5, %xmm3
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3]
; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
-; AVX512-NEXT: vmovdqa64 %xmm24, %xmm15
-; AVX512-NEXT: vpshufb %xmm0, %xmm15, %xmm3
-; AVX512-NEXT: vmovdqa64 %xmm25, %xmm12
-; AVX512-NEXT: vpshufb %xmm0, %xmm12, %xmm0
+; AVX512-NEXT: vpshufb %xmm0, %xmm13, %xmm3
+; AVX512-NEXT: vmovdqa64 %xmm13, %xmm20
+; AVX512-NEXT: vpshufb %xmm0, %xmm10, %xmm0
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; AVX512-NEXT: vpshufb %xmm1, %xmm7, %xmm3
-; AVX512-NEXT: vmovdqa64 %xmm7, %xmm16
-; AVX512-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload
-; AVX512-NEXT: vpshufb %xmm1, %xmm7, %xmm1
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; AVX512-NEXT: vpshufb %xmm1, %xmm15, %xmm3
+; AVX512-NEXT: vmovdqa64 %xmm17, %xmm9
+; AVX512-NEXT: vpshufb %xmm1, %xmm9, %xmm1
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX512-NEXT: vpsrlq $48, %zmm26, %zmm1
+; AVX512-NEXT: vpsrlq $48, %zmm22, %zmm1
; AVX512-NEXT: vpmovqb %zmm1, %xmm1
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm24
+; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm17
; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
-; AVX512-NEXT: vmovdqa64 %xmm29, %xmm1
+; AVX512-NEXT: vmovdqa64 %xmm19, %xmm1
; AVX512-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX512-NEXT: vpshufb %xmm0, %xmm2, %xmm2
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX512-NEXT: vpshufb %xmm1, %xmm3, %xmm3
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX512-NEXT: vpshufb %xmm1, %xmm7, %xmm3
+; AVX512-NEXT: vmovdqa64 %xmm28, %xmm4
; AVX512-NEXT: vpshufb %xmm1, %xmm4, %xmm4
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
; AVX512-NEXT: vmovd {{.*#+}} xmm2 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-NEXT: vmovdqa64 %xmm27, %xmm3
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX512-NEXT: vpshufb %xmm2, %xmm3, %xmm3
-; AVX512-NEXT: vmovdqa64 %xmm21, %xmm8
-; AVX512-NEXT: vpshufb %xmm2, %xmm8, %xmm8
+; AVX512-NEXT: vmovdqa64 %xmm31, %xmm7
+; AVX512-NEXT: vpshufb %xmm2, %xmm7, %xmm8
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
; AVX512-NEXT: vmovd {{.*#+}} xmm3 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512-NEXT: vmovdqa64 %xmm18, %xmm9
-; AVX512-NEXT: vpshufb %xmm3, %xmm9, %xmm9
-; AVX512-NEXT: vmovdqa64 %xmm31, %xmm11
-; AVX512-NEXT: vpshufb %xmm3, %xmm11, %xmm11
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX512-NEXT: vpshufb %xmm3, %xmm7, %xmm13
+; AVX512-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload
+; AVX512-NEXT: vpshufb %xmm3, %xmm7, %xmm14
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3]
; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7]
+; AVX512-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3,4],ymm8[5],ymm13[6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7]
-; AVX512-NEXT: vpshufb %xmm0, %xmm5, %xmm8
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX512-NEXT: vpshufb %xmm0, %xmm5, %xmm9
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
-; AVX512-NEXT: vpshufb %xmm1, %xmm10, %xmm9
-; AVX512-NEXT: vmovdqa64 %xmm22, %xmm5
-; AVX512-NEXT: vpshufb %xmm1, %xmm5, %xmm11
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
-; AVX512-NEXT: vpsrlq $56, %zmm23, %zmm9
-; AVX512-NEXT: vpmovqb %zmm9, %xmm9
-; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX512-NEXT: vpshufb %xmm0, %xmm7, %xmm8
+; AVX512-NEXT: vmovdqa64 %xmm29, %xmm7
+; AVX512-NEXT: vpshufb %xmm0, %xmm7, %xmm12
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3]
+; AVX512-NEXT: vmovdqa64 %xmm18, %xmm7
+; AVX512-NEXT: vpshufb %xmm1, %xmm7, %xmm12
+; AVX512-NEXT: vmovdqa64 %xmm21, %xmm7
+; AVX512-NEXT: vpshufb %xmm1, %xmm7, %xmm13
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm12[0,1,2],xmm8[3]
+; AVX512-NEXT: vpsrlq $56, %zmm23, %zmm12
+; AVX512-NEXT: vpmovqb %zmm12, %xmm12
+; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm12[0,1],xmm8[2,3]
; AVX512-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm8
; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm8 {%k1}
-; AVX512-NEXT: vmovdqa64 %xmm30, %xmm4
+; AVX512-NEXT: vmovdqa64 %xmm25, %xmm4
; AVX512-NEXT: vpshufb %xmm0, %xmm4, %xmm4
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX512-NEXT: vpshufb %xmm0, %xmm5, %xmm9
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
-; AVX512-NEXT: vmovdqa64 %xmm19, %xmm5
-; AVX512-NEXT: vpshufb %xmm1, %xmm5, %xmm9
-; AVX512-NEXT: vmovdqa64 %xmm17, %xmm5
-; AVX512-NEXT: vpshufb %xmm1, %xmm5, %xmm11
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
+; AVX512-NEXT: vpshufb %xmm0, %xmm11, %xmm11
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3]
+; AVX512-NEXT: vmovdqa64 %xmm26, %xmm7
+; AVX512-NEXT: vpshufb %xmm1, %xmm7, %xmm11
+; AVX512-NEXT: vmovdqa64 %xmm16, %xmm7
+; AVX512-NEXT: vpshufb %xmm1, %xmm7, %xmm12
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,6],ymm4[7]
-; AVX512-NEXT: vpshufb %xmm2, %xmm6, %xmm6
-; AVX512-NEXT: vmovdqa64 %xmm20, %xmm5
-; AVX512-NEXT: vpshufb %xmm2, %xmm5, %xmm2
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
-; AVX512-NEXT: vpshufb %xmm3, %xmm14, %xmm5
-; AVX512-NEXT: vpshufb %xmm3, %xmm13, %xmm3
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3,4,5,6],ymm4[7]
+; AVX512-NEXT: vmovdqa64 %xmm30, %xmm7
+; AVX512-NEXT: vpshufb %xmm2, %xmm7, %xmm7
+; AVX512-NEXT: vpshufb %xmm2, %xmm6, %xmm2
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3]
+; AVX512-NEXT: vmovdqa64 %xmm24, %xmm6
+; AVX512-NEXT: vpshufb %xmm3, %xmm6, %xmm6
+; AVX512-NEXT: vpshufb %xmm3, %xmm5, %xmm3
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3]
; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
-; AVX512-NEXT: vpshufb %xmm0, %xmm15, %xmm3
-; AVX512-NEXT: vpshufb %xmm0, %xmm12, %xmm0
+; AVX512-NEXT: vmovdqa64 %xmm20, %xmm3
+; AVX512-NEXT: vpshufb %xmm0, %xmm3, %xmm3
+; AVX512-NEXT: vpshufb %xmm0, %xmm10, %xmm0
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; AVX512-NEXT: vmovdqa64 %xmm16, %xmm3
-; AVX512-NEXT: vpshufb %xmm1, %xmm3, %xmm3
-; AVX512-NEXT: vpshufb %xmm1, %xmm7, %xmm1
+; AVX512-NEXT: vpshufb %xmm1, %xmm15, %xmm3
+; AVX512-NEXT: vpshufb %xmm1, %xmm9, %xmm1
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX512-NEXT: vpsrlq $56, %zmm26, %zmm1
+; AVX512-NEXT: vpsrlq $56, %zmm22, %zmm1
; AVX512-NEXT: vpmovqb %zmm1, %xmm1
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
@@ -13779,1168 +13841,1179 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512-NEXT: vmovaps %zmm1, (%r9)
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-NEXT: vmovaps %zmm1, (%rax)
+; AVX512-NEXT: vmovdqa64 %zmm27, (%rax)
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: vmovdqa64 %zmm24, (%rax)
+; AVX512-NEXT: vmovdqa64 %zmm17, (%rax)
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: vmovdqa64 %zmm0, (%rax)
-; AVX512-NEXT: addq $520, %rsp # imm = 0x208
+; AVX512-NEXT: addq $408, %rsp # imm = 0x198
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: load_i8_stride8_vf64:
; AVX512-FCP: # %bb.0:
-; AVX512-FCP-NEXT: subq $232, %rsp
-; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm29
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512-FCP-NEXT: subq $584, %rsp # imm = 0x248
+; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm8
+; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm13
+; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm28
+; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm12
+; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm30
+; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm31
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6]
-; AVX512-FCP-NEXT: vmovdqa 480(%rdi), %ymm1
-; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm3
-; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm1
-; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm23
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512-FCP-NEXT: vmovdqa 448(%rdi), %ymm2
-; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm5
-; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm5, %ymm2
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %ymm22
+; AVX512-FCP-NEXT: vpermd %ymm22, %ymm0, %ymm3
+; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm1
+; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm21
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [8,10,10,11,8,10,12,14]
+; AVX512-FCP-NEXT: vpermd %zmm31, %zmm2, %zmm5
+; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm5, %ymm3
+; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm15
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7]
; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512-FCP-NEXT: vmovdqa 416(%rdi), %ymm2
-; AVX512-FCP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill
-; AVX512-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm4
-; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm2
-; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm22
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512-FCP-NEXT: vmovdqa 384(%rdi), %ymm3
-; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpermd %ymm3, %ymm0, %ymm7
-; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm7, %ymm3
-; AVX512-FCP-NEXT: vmovdqa64 %ymm7, %ymm19
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-FCP-NEXT: vmovdqa 352(%rdi), %ymm2
-; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpmovqd %ymm2, %xmm4
+; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %ymm23
+; AVX512-FCP-NEXT: vpermd %ymm23, %ymm0, %ymm5
+; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm3
+; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm19
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512-FCP-NEXT: vpermd %zmm30, %zmm2, %zmm4
+; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm4
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
+; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %ymm29
+; AVX512-FCP-NEXT: vpmovqd %ymm29, %xmm5
; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm4, %xmm2
-; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm17
-; AVX512-FCP-NEXT: vmovdqa 320(%rdi), %ymm3
-; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpmovqd %ymm3, %xmm12
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm12, %xmm3
-; AVX512-FCP-NEXT: vmovdqa64 %xmm12, %xmm21
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3]
-; AVX512-FCP-NEXT: vpmovqb %zmm29, %xmm3
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm12
+; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm5, %xmm3
+; AVX512-FCP-NEXT: vmovdqa64 %xmm5, %xmm17
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [8,10,12,14,12,14,14,15]
+; AVX512-FCP-NEXT: vpermd %zmm12, %zmm20, %zmm4
+; AVX512-FCP-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3]
+; AVX512-FCP-NEXT: vpmovqb %zmm28, %xmm4
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm12
; AVX512-FCP-NEXT: movb $-64, %al
; AVX512-FCP-NEXT: kmovw %eax, %k1
; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm12 {%k1}
-; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %ymm1
-; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm2
-; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm1
-; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm16
-; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm31
-; AVX512-FCP-NEXT: vpermd %ymm31, %ymm0, %ymm3
-; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm3, %ymm2
-; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm20
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
-; AVX512-FCP-NEXT: vmovdqa64 160(%rdi), %ymm25
-; AVX512-FCP-NEXT: vpermd %ymm25, %ymm0, %ymm10
+; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm24
+; AVX512-FCP-NEXT: vpermd %ymm24, %ymm0, %ymm3
+; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm1
+; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm16
+; AVX512-FCP-NEXT: vpermd %zmm13, %zmm2, %zmm3
+; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm3
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7]
; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %ymm26
-; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm10, %ymm14
-; AVX512-FCP-NEXT: vpermd %ymm26, %ymm0, %ymm9
-; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm9, %ymm0
+; AVX512-FCP-NEXT: vpermd %ymm26, %ymm0, %ymm3
+; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm0
+; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm18
+; AVX512-FCP-NEXT: vpermd %zmm8, %zmm2, %zmm2
+; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm14
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-FCP-NEXT: vmovdqa64 96(%rdi), %ymm27
-; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm28
-; AVX512-FCP-NEXT: vpmovqd %ymm27, %xmm8
-; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm8, %xmm15
-; AVX512-FCP-NEXT: vpmovqd %ymm28, %xmm1
-; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm7
-; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm18
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1,2],xmm15[3]
-; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm30
-; AVX512-FCP-NEXT: vpmovqb %zmm30, %xmm15
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm15[0,1],xmm7[2,3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm14[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm7
-; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm6
-; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm6, %ymm7
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm5, %ymm12
-; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm23
-; AVX512-FCP-NEXT: vmovdqa64 %ymm14, %ymm24
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5,6],ymm7[7]
+; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm27
+; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm25
+; AVX512-FCP-NEXT: vpmovqd %ymm25, %xmm13
+; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm13, %xmm10
+; AVX512-FCP-NEXT: vpermd %zmm27, %zmm20, %zmm1
+; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm6
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm10[0,1,2],xmm6[3]
+; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm20
+; AVX512-FCP-NEXT: vpmovqb %zmm20, %xmm10
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm10[0,1],xmm6[2,3]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm14[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm12, %zmm6
+; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm0 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm4
+; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm6
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm11
+; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm11, %ymm10
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm10[7]
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm12 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm8
+; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm8, %ymm10
+; AVX512-FCP-NEXT: vmovdqa64 %ymm12, %ymm19
; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm3
-; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm12
-; AVX512-FCP-NEXT: vmovdqa64 %ymm14, %ymm22
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm5
-; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm5, %ymm14
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5],ymm7[6,7]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm11 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm17, %xmm2
-; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm12
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm21, %xmm4
-; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm4, %xmm14
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3]
-; AVX512-FCP-NEXT: vpsrlq $8, %zmm29, %zmm14
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm9, %ymm12
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5],ymm10[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4,5],ymm6[6,7]
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm17, %xmm5
+; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm10
+; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm17
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm15 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm7 # 64-byte Reload
+; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm7, %xmm12
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1,2],xmm12[3]
+; AVX512-FCP-NEXT: vpsrlq $8, %zmm28, %zmm12
+; AVX512-FCP-NEXT: vpmovqb %zmm12, %xmm12
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3]
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm0, %zmm10
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm10 {%k1}
+; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm2
+; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm0
+; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm6
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm12
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm12[7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm2
+; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm3
+; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm12
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm14
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm14[5],ymm12[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3,4,5],ymm6[6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm17, %xmm12
+; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm13, %xmm12
+; AVX512-FCP-NEXT: vmovdqa64 %xmm13, %xmm17
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm13, %xmm14
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1,2],xmm14[3]
+; AVX512-FCP-NEXT: vpsrlq $8, %zmm20, %zmm14
; AVX512-FCP-NEXT: vpmovqb %zmm14, %xmm14
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm12
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm12 {%k1}
-; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm0
-; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm7
-; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm1
-; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm14
-; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm1, %ymm14
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5,6],ymm7[7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm14
-; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm10, %ymm14
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3],ymm6[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm6
+; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm6
+; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm19
; AVX512-FCP-NEXT: vmovdqa64 %ymm10, %ymm16
-; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm9, %ymm15
-; AVX512-FCP-NEXT: vmovdqa64 %ymm9, %ymm17
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5],ymm7[6,7]
-; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm14
-; AVX512-FCP-NEXT: vmovdqa %xmm8, %xmm11
-; AVX512-FCP-NEXT: vmovdqa64 %xmm18, %xmm10
-; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm10, %xmm15
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3]
-; AVX512-FCP-NEXT: vpsrlq $8, %zmm30, %zmm15
-; AVX512-FCP-NEXT: vpmovqb %zmm15, %xmm15
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3],ymm7[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm7
-; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm13 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm6, %ymm7
-; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm18
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm4 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm11, %ymm10
+; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm11
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm10[7]
; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm6
-; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm6, %ymm12
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5,6],ymm7[7]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm12
-; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm22
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm5, %ymm14
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5],ymm7[6,7]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm12
-; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm20
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm4, %xmm14
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3]
-; AVX512-FCP-NEXT: vpsrlq $16, %zmm29, %zmm14
+; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm8, %ymm10
+; AVX512-FCP-NEXT: vmovdqa64 %ymm8, %ymm21
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm4 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm12
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5],ymm10[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4,5],ymm6[6,7]
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm8 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm10
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm14 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm7, %xmm12
+; AVX512-FCP-NEXT: vmovdqa %xmm14, %xmm7
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1,2],xmm12[3]
+; AVX512-FCP-NEXT: vpsrlq $16, %zmm28, %zmm12
+; AVX512-FCP-NEXT: vpmovqb %zmm12, %xmm12
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3]
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm0, %zmm10
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm10 {%k1}
+; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm6
+; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm6
+; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm16
+; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm1, %ymm12
+; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm11
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm12[7]
+; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm3, %ymm12
+; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm15
+; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm14
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm14[5],ymm12[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3,4,5],ymm6[6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm17, %xmm1
+; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm12
+; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm13, %xmm14
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1,2],xmm14[3]
+; AVX512-FCP-NEXT: vpsrlq $16, %zmm20, %zmm14
; AVX512-FCP-NEXT: vpmovqb %zmm14, %xmm14
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm12
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm12 {%k1}
-; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm0, %ymm7
-; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm13
-; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm1, %ymm14
-; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm23
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5,6],ymm7[7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm2
-; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm14
-; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm1
-; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm1, %ymm15
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5],ymm7[6,7]
-; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm11, %xmm14
-; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm10, %xmm15
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3]
-; AVX512-FCP-NEXT: vpsrlq $16, %zmm30, %zmm15
-; AVX512-FCP-NEXT: vpmovqb %zmm15, %xmm15
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3],ymm7[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm21
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm3
-; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm7
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3],ymm6[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm18
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm2
+; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm6
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm4 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm8
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7]
; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm12 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm6, %ymm9
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5,6],ymm7[7]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm3
-; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm8
-; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm3
+; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm2
+; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm7
; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm6
-; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm6, %ymm9
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm6
-; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm6, %xmm8
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm8
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5],ymm7[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7]
; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm6
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm8[3]
-; AVX512-FCP-NEXT: vpsrlq $24, %zmm29, %zmm8
+; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm7
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm5 # 64-byte Reload
+; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm8
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3]
+; AVX512-FCP-NEXT: vpsrlq $24, %zmm28, %zmm8
; AVX512-FCP-NEXT: vpmovqb %zmm8, %xmm8
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3]
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm6
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm6 {%k1}
-; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm13, %ymm5
-; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm4
-; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm4, %ymm4
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7]
-; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm3
-; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm1, %ymm2
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm0, %zmm7
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm7 {%k1}
+; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm2
+; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm5
+; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm11, %ymm4
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7]
+; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm15, %ymm3
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
-; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm11, %xmm1
-; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm10, %xmm0
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
-; AVX512-FCP-NEXT: vpsrlq $24, %zmm30, %zmm1
+; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm1
+; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm13, %xmm0
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
+; AVX512-FCP-NEXT: vpsrlq $24, %zmm20, %zmm1
; AVX512-FCP-NEXT: vpmovqb %zmm1, %xmm1
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm18
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7]
-; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm1
-; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm16
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,3,2,3,1,3,5,7]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [9,11,10,11,9,11,13,15]
+; AVX512-FCP-NEXT: vpermd %ymm22, %ymm6, %ymm15
; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm10, %ymm2
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
-; AVX512-FCP-NEXT: vpermd (%rsp), %ymm0, %ymm3 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm2
-; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm19
-; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm12 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm6, %ymm3
-; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm22
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [1,3,5,7,5,7,6,7]
-; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm16, %ymm8 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm15 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm8, %xmm2
-; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm16, %ymm7 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm15, %ymm2
+; AVX512-FCP-NEXT: vpermd %zmm31, %zmm1, %zmm4
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm0 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm3
+; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm11
+; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
+; AVX512-FCP-NEXT: vpermd %ymm23, %ymm6, %ymm4
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm3
+; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm21
+; AVX512-FCP-NEXT: vpermd %zmm30, %zmm1, %zmm7
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm30 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm4
+; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm7, %ymm4
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [9,11,13,15,13,15,14,15]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm23 = [1,3,5,7,5,7,6,7]
+; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm3 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm12 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm3
+; AVX512-FCP-NEXT: vpermd %ymm29, %ymm23, %ymm10
; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm7, %xmm3
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3]
-; AVX512-FCP-NEXT: vpsrlq $32, %zmm29, %zmm3
-; AVX512-FCP-NEXT: vpmovqb %zmm3, %xmm3
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm17
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm17 {%k1}
-; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm1
-; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm23
-; AVX512-FCP-NEXT: vpermd %ymm31, %ymm0, %ymm4
+; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm10, %xmm4
+; AVX512-FCP-NEXT: vmovdqa64 %ymm10, %ymm22
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3]
+; AVX512-FCP-NEXT: vpsrlq $32, %zmm28, %zmm4
+; AVX512-FCP-NEXT: vpmovqb %zmm4, %xmm4
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm19
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm19 {%k1}
+; AVX512-FCP-NEXT: vpermd %ymm24, %ymm6, %ymm4
; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm2
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm1[7]
-; AVX512-FCP-NEXT: vpermd %ymm25, %ymm0, %ymm3
-; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm1
-; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm24
-; AVX512-FCP-NEXT: vpermd %ymm26, %ymm0, %ymm6
-; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm6, %ymm0
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7]
-; AVX512-FCP-NEXT: vpermd %ymm27, %ymm16, %ymm13
-; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm13, %xmm0
-; AVX512-FCP-NEXT: vpermd %ymm28, %ymm16, %ymm12
-; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm15
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3]
-; AVX512-FCP-NEXT: vpsrlq $32, %zmm30, %zmm15
-; AVX512-FCP-NEXT: vpmovqb %zmm15, %xmm15
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm16
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm9, %ymm0
-; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm5
+; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm5 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm3
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm3[7]
+; AVX512-FCP-NEXT: vpermd %ymm26, %ymm6, %ymm2
+; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm6
+; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm24
+; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm13 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm1
+; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm13, %ymm1
+; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4],ymm1[5],ymm6[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX512-FCP-NEXT: vpermd %zmm27, %zmm17, %zmm0
+; AVX512-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm0, %xmm0
+; AVX512-FCP-NEXT: vpermd %ymm25, %ymm23, %ymm1
+; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm1, %xmm14
+; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm23
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm14[0,1,2],xmm0[3]
+; AVX512-FCP-NEXT: vpsrlq $32, %zmm20, %zmm14
+; AVX512-FCP-NEXT: vpmovqb %zmm14, %xmm14
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm14[0,1],xmm0[2,3]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm12 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm15, %ymm0
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm11, %ymm6
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7]
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm3
+; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm6
; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm2
-; AVX512-FCP-NEXT: vmovdqa64 %ymm10, %ymm25
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm20 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm2
-; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm10
-; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm10, %ymm2
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm17 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm15
-; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm1
-; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm1, %ymm15
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3,4],ymm2[5],ymm15[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm19 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm2
-; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm8, %xmm2
+; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm7, %ymm14
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm14[5],ymm6[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7]
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm17 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm17, %xmm6
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm6
; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm7, %xmm15
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm15[0,1,2],xmm2[3]
-; AVX512-FCP-NEXT: vpsrlq $40, %zmm29, %zmm15
-; AVX512-FCP-NEXT: vpmovqb %zmm15, %xmm15
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm15[0,1],xmm2[2,3]
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 {%k1}
-; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm3
-; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm0
-; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm15
+; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm1
+; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm14
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm14[0,1,2],xmm6[3]
+; AVX512-FCP-NEXT: vpsrlq $40, %zmm28, %zmm14
+; AVX512-FCP-NEXT: vpmovqb %zmm14, %xmm14
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm14[0,1],xmm6[2,3]
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm6
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm6 {%k1}
+; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm4, %ymm0
; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm22
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm11
+; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm5, %ymm14
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm14[7]
; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm4
-; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm15
-; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm11
-; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm6, %ymm14
-; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm23
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5],ymm14[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm11
-; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm13, %xmm14
-; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm12, %xmm15
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3]
-; AVX512-FCP-NEXT: vpsrlq $40, %zmm30, %zmm15
-; AVX512-FCP-NEXT: vpmovqb %zmm15, %xmm15
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm19
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm5, %ymm0
-; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm26
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm5
-; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm5, %ymm2
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm2
-; AVX512-FCP-NEXT: vmovdqa64 %ymm10, %ymm24
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm17 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm14
-; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm1, %ymm14
-; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm25
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4],ymm2[5],ymm14[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm20 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm2
-; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm8, %xmm2
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm7, %xmm14
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3]
-; AVX512-FCP-NEXT: vpsrlq $48, %zmm29, %zmm14
+; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm14
+; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm13, %ymm12
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm17, %xmm8
+; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
+; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm13, %xmm12
+; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm8
+; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm8, %xmm14
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3]
+; AVX512-FCP-NEXT: vpsrlq $40, %zmm20, %zmm14
; AVX512-FCP-NEXT: vpmovqb %zmm14, %xmm14
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3]
-; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 {%k1}
-; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm3, %ymm0
-; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm6
-; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm6, %ymm14
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,6],ymm0[7]
-; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm14
-; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm9
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm21
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm15, %ymm0
+; AVX512-FCP-NEXT: vmovdqa64 %ymm15, %ymm25
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm15, %ymm6
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7]
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm3, %ymm6
+; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm24
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm7, %ymm12
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm12[5],ymm6[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7]
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm17 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm17, %xmm6
+; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm6
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm12
+; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm26
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm12[0,1,2],xmm6[3]
+; AVX512-FCP-NEXT: vpsrlq $48, %zmm28, %zmm12
+; AVX512-FCP-NEXT: vpmovqb %zmm12, %xmm12
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm12[0,1],xmm6[2,3]
+; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm6
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm6 {%k1}
+; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm3
+; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm0
+; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm5, %ymm12
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm12[7]
+; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm12
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm2, %ymm14
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm14[5],ymm12[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6,7]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm17, %xmm8
+; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm13, %xmm12
; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm1
-; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm15
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm9
-; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm13, %xmm14
-; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm12, %xmm15
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3]
-; AVX512-FCP-NEXT: vpsrlq $48, %zmm30, %zmm15
-; AVX512-FCP-NEXT: vpmovqb %zmm15, %xmm15
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
-; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm17
+; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm14
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3]
+; AVX512-FCP-NEXT: vpsrlq $48, %zmm20, %zmm14
+; AVX512-FCP-NEXT: vpmovqb %zmm14, %xmm14
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm17
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm12 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm0
+; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm6
; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm0
-; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm0, %ymm2
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm11
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3,4,5,6],ymm2[7]
+; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm15, %ymm10
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm10[7]
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm0
+; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm9
; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm5
-; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm5, %ymm10
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm5
-; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm5, %ymm9
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5],ymm9[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm2[6,7]
+; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm7, %ymm8
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7]
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm7, %xmm7
; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm8
; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm8, %xmm8
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm7, %xmm7
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3]
-; AVX512-FCP-NEXT: vpsrlq $56, %zmm29, %zmm8
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3]
+; AVX512-FCP-NEXT: vpsrlq $56, %zmm28, %zmm8
; AVX512-FCP-NEXT: vpmovqb %zmm8, %xmm8
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm0, %zmm7
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm7 {%k1}
-; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm2
-; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm6, %ymm6
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,6],ymm2[7]
-; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm5
-; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm1, %ymm4
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
-; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm13, %xmm3
-; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm12, %xmm1
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3]
-; AVX512-FCP-NEXT: vpsrlq $56, %zmm30, %zmm3
-; AVX512-FCP-NEXT: vpmovqb %zmm3, %xmm3
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
-; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm7 {%k1}
+; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm3, %ymm6
+; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm5, %ymm5
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7]
+; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm4
+; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm2, %ymm3
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
+; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm13, %xmm2
+; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm1
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3]
+; AVX512-FCP-NEXT: vpsrlq $56, %zmm20, %zmm2
+; AVX512-FCP-NEXT: vpmovqb %zmm2, %xmm2
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1
; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512-FCP-NEXT: vmovaps %zmm2, (%rsi)
; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512-FCP-NEXT: vmovaps %zmm2, (%rdx)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm21, (%rcx)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm18, (%r8)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm16, (%r9)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm18, (%rcx)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm16, (%r8)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm19, (%r9)
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT: vmovdqa64 %zmm19, (%rax)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm21, (%rax)
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT: vmovdqa64 %zmm17, (%rax)
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%rax)
-; AVX512-FCP-NEXT: addq $232, %rsp
+; AVX512-FCP-NEXT: addq $584, %rsp # imm = 0x248
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: load_i8_stride8_vf64:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: subq $520, %rsp # imm = 0x208
-; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm17
-; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm0
-; AVX512DQ-NEXT: vpmovqb %zmm0, %xmm2
-; AVX512DQ-NEXT: vmovdqa 496(%rdi), %xmm7
-; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
-; AVX512DQ-NEXT: vpshufb %xmm0, %xmm7, %xmm1
-; AVX512DQ-NEXT: vmovdqa 480(%rdi), %xmm8
-; AVX512DQ-NEXT: vpshufb %xmm0, %xmm8, %xmm3
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3
-; AVX512DQ-NEXT: vmovdqa 464(%rdi), %xmm11
+; AVX512DQ-NEXT: subq $408, %rsp # imm = 0x198
+; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm16
+; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm0
+; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm3
+; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm30
+; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm4
+; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm5
+; AVX512DQ-NEXT: vmovdqa 464(%rdi), %xmm10
; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0]
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm11, %xmm4
-; AVX512DQ-NEXT: vmovdqa 448(%rdi), %xmm15
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm15, %xmm5
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-NEXT: vmovdqa 384(%rdi), %ymm4
-; AVX512DQ-NEXT: vpmovqb %ymm4, %xmm4
-; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
-; AVX512DQ-NEXT: vmovdqa 368(%rdi), %xmm4
-; AVX512DQ-NEXT: vpshufb %xmm0, %xmm4, %xmm3
-; AVX512DQ-NEXT: vmovdqa64 %xmm4, %xmm18
-; AVX512DQ-NEXT: vmovdqa 352(%rdi), %xmm14
-; AVX512DQ-NEXT: vpshufb %xmm0, %xmm14, %xmm4
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512DQ-NEXT: vmovdqa 336(%rdi), %xmm4
-; AVX512DQ-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm4, %xmm4
-; AVX512DQ-NEXT: vmovdqa 320(%rdi), %xmm6
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm6, %xmm5
-; AVX512DQ-NEXT: vmovdqa64 %xmm6, %xmm19
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3]
-; AVX512DQ-NEXT: vpmovqb %zmm17, %xmm4
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
-; AVX512DQ-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm3
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm10, %xmm2
+; AVX512DQ-NEXT: vmovdqa 448(%rdi), %xmm11
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm11, %xmm6
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm6
+; AVX512DQ-NEXT: vmovdqa 480(%rdi), %xmm8
+; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8]
+; AVX512DQ-NEXT: vpshufb %xmm2, %xmm8, %xmm7
+; AVX512DQ-NEXT: vmovdqa64 %xmm8, %xmm19
+; AVX512DQ-NEXT: vpmovqb %zmm5, %xmm5
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7]
+; AVX512DQ-NEXT: vmovdqa 384(%rdi), %ymm6
+; AVX512DQ-NEXT: vpmovqb %ymm6, %xmm6
+; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512DQ-NEXT: vmovdqa 416(%rdi), %ymm7
+; AVX512DQ-NEXT: vpmovqb %ymm7, %xmm8
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,0,1]
+; AVX512DQ-NEXT: vpmovqb %ymm7, %xmm7
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
+; AVX512DQ-NEXT: vmovdqa 336(%rdi), %xmm7
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm7, %xmm6
+; AVX512DQ-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT: vmovdqa64 %xmm7, %xmm17
+; AVX512DQ-NEXT: vmovdqa 320(%rdi), %xmm8
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm8, %xmm7
+; AVX512DQ-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT: vmovdqa64 %xmm8, %xmm22
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
+; AVX512DQ-NEXT: vmovdqa 352(%rdi), %xmm13
+; AVX512DQ-NEXT: vpshufb %xmm2, %xmm13, %xmm7
+; AVX512DQ-NEXT: vpmovqb %zmm4, %xmm4
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3]
+; AVX512DQ-NEXT: vpmovqb %zmm30, %xmm6
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3]
+; AVX512DQ-NEXT: vinserti32x4 $2, %xmm4, %zmm0, %zmm4
; AVX512DQ-NEXT: movb $-64, %al
; AVX512DQ-NEXT: kmovw %eax, %k1
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm3 {%k1}
-; AVX512DQ-NEXT: vmovdqa 240(%rdi), %xmm4
-; AVX512DQ-NEXT: vpshufb %xmm0, %xmm4, %xmm2
-; AVX512DQ-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT: vmovdqa64 %xmm4, %xmm25
-; AVX512DQ-NEXT: vmovdqa 224(%rdi), %xmm4
-; AVX512DQ-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT: vpshufb %xmm0, %xmm4, %xmm4
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
-; AVX512DQ-NEXT: vmovdqa 208(%rdi), %xmm5
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm5, %xmm4
-; AVX512DQ-NEXT: vmovdqa64 %xmm5, %xmm29
-; AVX512DQ-NEXT: vmovdqa 192(%rdi), %xmm10
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm10, %xmm5
-; AVX512DQ-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm5
-; AVX512DQ-NEXT: vpmovqb %zmm5, %xmm5
-; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm4
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm4 {%k1}
+; AVX512DQ-NEXT: vmovdqa 208(%rdi), %xmm15
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm15, %xmm5
+; AVX512DQ-NEXT: vmovdqa 192(%rdi), %xmm6
+; AVX512DQ-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm6, %xmm6
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512DQ-NEXT: vmovdqa 224(%rdi), %xmm12
+; AVX512DQ-NEXT: vpshufb %xmm2, %xmm12, %xmm6
+; AVX512DQ-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT: vpmovqb %zmm3, %xmm3
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,6],ymm3[7]
; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm5
; AVX512DQ-NEXT: vpmovqb %ymm5, %xmm5
; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm4
-; AVX512DQ-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT: vpshufb %xmm0, %xmm4, %xmm4
-; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm5
-; AVX512DQ-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT: vpshufb %xmm0, %xmm5, %xmm0
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm6
+; AVX512DQ-NEXT: vpmovqb %ymm6, %xmm7
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,0,1]
+; AVX512DQ-NEXT: vpmovqb %ymm6, %xmm6
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5],ymm5[6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7]
; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm5
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm5, %xmm4
-; AVX512DQ-NEXT: vmovdqa64 %xmm5, %xmm30
-; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm5
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm5, %xmm1
-; AVX512DQ-NEXT: vmovdqa64 %xmm5, %xmm28
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; AVX512DQ-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm5, %xmm5
+; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm6
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm6, %xmm1
+; AVX512DQ-NEXT: vmovdqa64 %xmm6, %xmm20
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm5
+; AVX512DQ-NEXT: vpshufb %xmm2, %xmm5, %xmm2
+; AVX512DQ-NEXT: vmovdqa64 %xmm5, %xmm21
+; AVX512DQ-NEXT: vpmovqb %zmm0, %xmm0
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm16
; AVX512DQ-NEXT: vpmovqb %zmm16, %xmm1
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm9 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
-; AVX512DQ-NEXT: vpshufb %xmm9, %xmm7, %xmm0
-; AVX512DQ-NEXT: vmovdqa64 %xmm7, %xmm24
-; AVX512DQ-NEXT: vpshufb %xmm9, %xmm8, %xmm2
-; AVX512DQ-NEXT: vmovdqa64 %xmm8, %xmm21
+; AVX512DQ-NEXT: vmovdqa 496(%rdi), %xmm1
+; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm8 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9]
+; AVX512DQ-NEXT: vpshufb %xmm8, %xmm1, %xmm0
+; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm25
+; AVX512DQ-NEXT: vmovdqa64 %xmm19, %xmm9
+; AVX512DQ-NEXT: vpshufb %xmm8, %xmm9, %xmm2
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm8 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
-; AVX512DQ-NEXT: vpshufb %xmm8, %xmm11, %xmm3
-; AVX512DQ-NEXT: vmovdqa64 %xmm11, %xmm27
-; AVX512DQ-NEXT: vpshufb %xmm8, %xmm15, %xmm4
+; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0]
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm10, %xmm3
+; AVX512DQ-NEXT: vmovdqa64 %xmm10, %xmm27
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm11, %xmm4
+; AVX512DQ-NEXT: vmovdqa %xmm11, %xmm10
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512DQ-NEXT: vmovdqa 416(%rdi), %xmm0
-; AVX512DQ-NEXT: vmovdqa 432(%rdi), %xmm11
+; AVX512DQ-NEXT: vmovdqa 416(%rdi), %xmm14
+; AVX512DQ-NEXT: vmovdqa 432(%rdi), %xmm5
; AVX512DQ-NEXT: vmovd {{.*#+}} xmm3 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT: vpshufb %xmm3, %xmm11, %xmm2
-; AVX512DQ-NEXT: vpshufb %xmm3, %xmm0, %xmm5
-; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm22
+; AVX512DQ-NEXT: vpshufb %xmm3, %xmm5, %xmm2
+; AVX512DQ-NEXT: vmovdqa64 %xmm5, %xmm23
+; AVX512DQ-NEXT: vpshufb %xmm3, %xmm14, %xmm5
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
; AVX512DQ-NEXT: vmovdqa 384(%rdi), %xmm0
-; AVX512DQ-NEXT: vmovdqa 400(%rdi), %xmm12
+; AVX512DQ-NEXT: vmovdqa 400(%rdi), %xmm11
; AVX512DQ-NEXT: vmovd {{.*#+}} xmm2 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT: vpshufb %xmm2, %xmm12, %xmm6
+; AVX512DQ-NEXT: vpshufb %xmm2, %xmm11, %xmm6
; AVX512DQ-NEXT: vpshufb %xmm2, %xmm0, %xmm7
-; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm31
+; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm26
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
-; AVX512DQ-NEXT: vmovdqa64 %xmm18, %xmm0
-; AVX512DQ-NEXT: vpshufb %xmm9, %xmm0, %xmm5
-; AVX512DQ-NEXT: vmovdqa64 %xmm18, %xmm23
-; AVX512DQ-NEXT: vpshufb %xmm9, %xmm14, %xmm6
-; AVX512DQ-NEXT: vmovdqa64 %xmm14, %xmm26
+; AVX512DQ-NEXT: vmovdqa 368(%rdi), %xmm0
+; AVX512DQ-NEXT: vpshufb %xmm8, %xmm0, %xmm5
+; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm24
+; AVX512DQ-NEXT: vpshufb %xmm8, %xmm13, %xmm6
+; AVX512DQ-NEXT: vmovdqa64 %xmm13, %xmm28
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
-; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-NEXT: vpshufb %xmm8, %xmm0, %xmm6
-; AVX512DQ-NEXT: vmovdqa64 %xmm19, %xmm14
-; AVX512DQ-NEXT: vpshufb %xmm8, %xmm14, %xmm7
+; AVX512DQ-NEXT: vmovdqa64 %xmm17, %xmm0
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm0, %xmm6
+; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm0
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm0, %xmm7
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3]
-; AVX512DQ-NEXT: vpsrlq $8, %zmm17, %zmm6
+; AVX512DQ-NEXT: vpsrlq $8, %zmm30, %zmm6
; AVX512DQ-NEXT: vpmovqb %zmm6, %xmm6
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
; AVX512DQ-NEXT: vinserti32x4 $2, %xmm5, %zmm0, %zmm19
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm19 {%k1}
-; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm0
-; AVX512DQ-NEXT: vpshufb %xmm9, %xmm0, %xmm4
-; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-NEXT: vpshufb %xmm9, %xmm0, %xmm5
+; AVX512DQ-NEXT: vmovdqa 240(%rdi), %xmm13
+; AVX512DQ-NEXT: vpshufb %xmm8, %xmm13, %xmm4
+; AVX512DQ-NEXT: vpshufb %xmm8, %xmm12, %xmm5
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512DQ-NEXT: vmovdqa64 %xmm29, %xmm0
-; AVX512DQ-NEXT: vpshufb %xmm8, %xmm0, %xmm5
-; AVX512DQ-NEXT: vpshufb %xmm8, %xmm10, %xmm6
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm15, %xmm5
+; AVX512DQ-NEXT: vmovdqa64 %xmm15, %xmm31
+; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm0, %xmm6
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm4[7]
-; AVX512DQ-NEXT: vmovdqa 160(%rdi), %xmm0
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7]
+; AVX512DQ-NEXT: vmovdqa 160(%rdi), %xmm12
; AVX512DQ-NEXT: vmovdqa 176(%rdi), %xmm5
-; AVX512DQ-NEXT: vpshufb %xmm3, %xmm5, %xmm4
-; AVX512DQ-NEXT: vmovdqa64 %xmm5, %xmm25
-; AVX512DQ-NEXT: vpshufb %xmm3, %xmm0, %xmm3
-; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm20
-; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX512DQ-NEXT: vmovdqa 128(%rdi), %xmm4
-; AVX512DQ-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT: vpshufb %xmm3, %xmm5, %xmm5
+; AVX512DQ-NEXT: vpshufb %xmm3, %xmm12, %xmm3
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; AVX512DQ-NEXT: vmovdqa 128(%rdi), %xmm5
; AVX512DQ-NEXT: vmovdqa 144(%rdi), %xmm0
; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpshufb %xmm2, %xmm4, %xmm2
+; AVX512DQ-NEXT: vpshufb %xmm2, %xmm5, %xmm2
+; AVX512DQ-NEXT: vmovdqa64 %xmm5, %xmm29
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm2
; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512DQ-NEXT: vpshufb %xmm9, %xmm1, %xmm1
-; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX512DQ-NEXT: vpshufb %xmm9, %xmm2, %xmm2
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512DQ-NEXT: vmovdqa64 %xmm30, %xmm7
-; AVX512DQ-NEXT: vpshufb %xmm8, %xmm7, %xmm2
-; AVX512DQ-NEXT: vmovdqa64 %xmm28, %xmm3
-; AVX512DQ-NEXT: vpshufb %xmm8, %xmm3, %xmm3
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm2
+; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT: vpshufb %xmm8, %xmm2, %xmm2
+; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm7
+; AVX512DQ-NEXT: vpshufb %xmm8, %xmm7, %xmm3
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
+; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm3, %xmm3
+; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm6
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm6, %xmm1
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3]
; AVX512DQ-NEXT: vpsrlq $8, %zmm16, %zmm2
+; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm22
; AVX512DQ-NEXT: vpmovqb %zmm2, %xmm2
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10]
-; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm13
-; AVX512DQ-NEXT: vmovdqa64 %xmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT: vpshufb %xmm4, %xmm13, %xmm1
-; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm10
-; AVX512DQ-NEXT: vmovdqa64 %xmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT: vpshufb %xmm4, %xmm10, %xmm2
+; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm5
+; AVX512DQ-NEXT: vpshufb %xmm4, %xmm5, %xmm1
+; AVX512DQ-NEXT: vpshufb %xmm4, %xmm9, %xmm2
+; AVX512DQ-NEXT: vmovdqa64 %xmm9, %xmm25
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0]
-; AVX512DQ-NEXT: vmovdqa64 %xmm27, %xmm0
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm0, %xmm3
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm15, %xmm8
-; AVX512DQ-NEXT: vmovdqa64 %xmm15, %xmm18
+; AVX512DQ-NEXT: vmovdqa64 %xmm27, %xmm9
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm9, %xmm3
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm10, %xmm8
+; AVX512DQ-NEXT: vmovdqa64 %xmm10, %xmm19
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1,2,3,4,5,6],ymm2[7]
; AVX512DQ-NEXT: vmovd {{.*#+}} xmm3 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT: vpshufb %xmm3, %xmm11, %xmm2
-; AVX512DQ-NEXT: vmovdqa64 %xmm11, %xmm24
-; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm11
-; AVX512DQ-NEXT: vpshufb %xmm3, %xmm11, %xmm9
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3]
+; AVX512DQ-NEXT: vmovdqa64 %xmm23, %xmm10
+; AVX512DQ-NEXT: vmovdqa64 %xmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT: vpshufb %xmm3, %xmm10, %xmm2
+; AVX512DQ-NEXT: vpshufb %xmm3, %xmm14, %xmm15
+; AVX512DQ-NEXT: vmovdqa64 %xmm14, %xmm20
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1],xmm15[2],xmm2[2],xmm15[3],xmm2[3]
; AVX512DQ-NEXT: vmovd {{.*#+}} xmm2 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT: vpshufb %xmm2, %xmm12, %xmm0
-; AVX512DQ-NEXT: vmovdqa64 %xmm12, %xmm30
-; AVX512DQ-NEXT: vmovdqa64 %xmm31, %xmm5
-; AVX512DQ-NEXT: vpshufb %xmm2, %xmm5, %xmm15
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512DQ-NEXT: vpshufb %xmm2, %xmm11, %xmm0
+; AVX512DQ-NEXT: vmovdqa64 %xmm11, %xmm17
+; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm11
+; AVX512DQ-NEXT: vpshufb %xmm2, %xmm11, %xmm14
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm14
; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm9[5],ymm0[6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm8[6,7]
-; AVX512DQ-NEXT: vmovdqa64 %xmm23, %xmm12
-; AVX512DQ-NEXT: vmovdqa64 %xmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT: vpshufb %xmm4, %xmm12, %xmm8
-; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm5
-; AVX512DQ-NEXT: vpshufb %xmm4, %xmm5, %xmm9
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
-; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm5, %xmm9
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm14, %xmm15
-; AVX512DQ-NEXT: vmovdqa64 %xmm14, %xmm21
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
-; AVX512DQ-NEXT: vpsrlq $16, %zmm17, %zmm9
-; AVX512DQ-NEXT: vpmovqb %zmm9, %xmm9
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
-; AVX512DQ-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm8
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm8 {%k1}
-; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm11
+; AVX512DQ-NEXT: vpshufb %xmm4, %xmm11, %xmm8
+; AVX512DQ-NEXT: vmovdqa64 %xmm28, %xmm14
+; AVX512DQ-NEXT: vpshufb %xmm4, %xmm14, %xmm14
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm14[0],xmm8[0],xmm14[1],xmm8[1],xmm14[2],xmm8[2],xmm14[3],xmm8[3]
; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; AVX512DQ-NEXT: vpshufb %xmm4, %xmm14, %xmm9
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3]
-; AVX512DQ-NEXT: vmovdqa64 %xmm29, %xmm6
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm6, %xmm9
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm14, %xmm14
; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX512DQ-NEXT: vpshufb %xmm1, %xmm15, %xmm15
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm8 = xmm14[0,1,2],xmm8[3]
+; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm23
+; AVX512DQ-NEXT: vpsrlq $16, %zmm30, %zmm14
+; AVX512DQ-NEXT: vpmovqb %zmm14, %xmm14
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3]
+; AVX512DQ-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm8
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm8 {%k1}
+; AVX512DQ-NEXT: vpshufb %xmm4, %xmm13, %xmm0
+; AVX512DQ-NEXT: vmovdqa64 %xmm13, %xmm30
+; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX512DQ-NEXT: vpshufb %xmm4, %xmm13, %xmm14
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3]
+; AVX512DQ-NEXT: vmovdqa64 %xmm31, %xmm13
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm13, %xmm14
+; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm13, %xmm15
+; AVX512DQ-NEXT: vmovdqa64 %xmm13, %xmm16
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm9
-; AVX512DQ-NEXT: vpshufb %xmm3, %xmm9, %xmm9
-; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm15
-; AVX512DQ-NEXT: vpshufb %xmm3, %xmm15, %xmm3
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3]
-; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX512DQ-NEXT: vpshufb %xmm2, %xmm9, %xmm9
-; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; AVX512DQ-NEXT: vpshufb %xmm2, %xmm15, %xmm2
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,6],ymm0[7]
+; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX512DQ-NEXT: vpshufb %xmm3, %xmm13, %xmm14
+; AVX512DQ-NEXT: vpshufb %xmm3, %xmm12, %xmm3
+; AVX512DQ-NEXT: vmovdqa64 %xmm12, %xmm24
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3]
+; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; AVX512DQ-NEXT: vpshufb %xmm2, %xmm12, %xmm14
+; AVX512DQ-NEXT: vmovdqa64 %xmm29, %xmm12
+; AVX512DQ-NEXT: vpshufb %xmm2, %xmm12, %xmm2
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX512DQ-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX512DQ-NEXT: vpshufb %xmm4, %xmm3, %xmm3
+; AVX512DQ-NEXT: vpshufb %xmm4, %xmm7, %xmm3
+; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm27
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX512DQ-NEXT: vpshufb %xmm1, %xmm7, %xmm3
-; AVX512DQ-NEXT: vmovdqa64 %xmm7, %xmm29
-; AVX512DQ-NEXT: vmovdqa64 %xmm28, %xmm7
-; AVX512DQ-NEXT: vmovdqa64 %xmm28, (%rsp) # 16-byte Spill
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm7, %xmm1
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm6, %xmm1
+; AVX512DQ-NEXT: vmovdqa64 %xmm6, %xmm21
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3]
-; AVX512DQ-NEXT: vpsrlq $16, %zmm16, %zmm2
+; AVX512DQ-NEXT: vpsrlq $16, %zmm22, %zmm2
; AVX512DQ-NEXT: vpmovqb %zmm2, %xmm2
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11]
-; AVX512DQ-NEXT: vpshufb %xmm0, %xmm13, %xmm1
-; AVX512DQ-NEXT: vpshufb %xmm0, %xmm10, %xmm2
+; AVX512DQ-NEXT: vpshufb %xmm0, %xmm5, %xmm1
+; AVX512DQ-NEXT: vmovdqa64 %xmm5, %xmm18
+; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm6
+; AVX512DQ-NEXT: vmovdqa64 %xmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT: vpshufb %xmm0, %xmm6, %xmm2
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0]
-; AVX512DQ-NEXT: vmovdqa64 %xmm27, %xmm10
-; AVX512DQ-NEXT: vmovdqa64 %xmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm10, %xmm3
-; AVX512DQ-NEXT: vmovdqa64 %xmm18, %xmm4
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm4, %xmm4
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm9, %xmm3
+; AVX512DQ-NEXT: vmovdqa64 %xmm9, %xmm25
+; AVX512DQ-NEXT: vmovdqa64 %xmm19, %xmm9
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm9, %xmm4
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
; AVX512DQ-NEXT: vmovd {{.*#+}} xmm2 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm3
-; AVX512DQ-NEXT: vpshufb %xmm2, %xmm3, %xmm3
-; AVX512DQ-NEXT: vpshufb %xmm2, %xmm11, %xmm8
-; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm28
+; AVX512DQ-NEXT: vpshufb %xmm2, %xmm10, %xmm3
+; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm5
+; AVX512DQ-NEXT: vpshufb %xmm2, %xmm5, %xmm8
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
; AVX512DQ-NEXT: vmovd {{.*#+}} xmm3 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT: vmovdqa64 %xmm30, %xmm9
-; AVX512DQ-NEXT: vpshufb %xmm3, %xmm9, %xmm9
-; AVX512DQ-NEXT: vmovdqa64 %xmm31, %xmm11
-; AVX512DQ-NEXT: vpshufb %xmm3, %xmm11, %xmm15
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
+; AVX512DQ-NEXT: vmovdqa64 %xmm17, %xmm13
+; AVX512DQ-NEXT: vmovdqa64 %xmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT: vpshufb %xmm3, %xmm13, %xmm14
+; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm12
+; AVX512DQ-NEXT: vmovdqa64 %xmm26, (%rsp) # 16-byte Spill
+; AVX512DQ-NEXT: vpshufb %xmm3, %xmm12, %xmm15
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3,4],ymm8[5],ymm14[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7]
-; AVX512DQ-NEXT: vpshufb %xmm0, %xmm12, %xmm8
-; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm13
-; AVX512DQ-NEXT: vmovdqa64 %xmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT: vpshufb %xmm0, %xmm13, %xmm9
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm5, %xmm9
-; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm5
+; AVX512DQ-NEXT: vpshufb %xmm0, %xmm11, %xmm8
+; AVX512DQ-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT: vmovdqa64 %xmm28, %xmm10
+; AVX512DQ-NEXT: vpshufb %xmm0, %xmm10, %xmm14
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm14[0],xmm8[0],xmm14[1],xmm8[1],xmm14[2],xmm8[2],xmm14[3],xmm8[3]
+; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm5, %xmm14
+; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX512DQ-NEXT: vpshufb %xmm1, %xmm5, %xmm15
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
-; AVX512DQ-NEXT: vpsrlq $24, %zmm17, %zmm9
-; AVX512DQ-NEXT: vpmovqb %zmm9, %xmm9
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm8 = xmm14[0,1,2],xmm8[3]
+; AVX512DQ-NEXT: vpsrlq $24, %zmm23, %zmm14
+; AVX512DQ-NEXT: vpmovqb %zmm14, %xmm14
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3]
; AVX512DQ-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm8
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm8 {%k1}
-; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX512DQ-NEXT: vmovdqa64 %xmm30, %xmm4
; AVX512DQ-NEXT: vpshufb %xmm0, %xmm4, %xmm4
-; AVX512DQ-NEXT: vpshufb %xmm0, %xmm14, %xmm9
-; AVX512DQ-NEXT: vmovdqa64 %xmm14, %xmm22
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm6, %xmm9
-; AVX512DQ-NEXT: vmovdqa64 %xmm6, %xmm26
-; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm6, %xmm15
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
+; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX512DQ-NEXT: vpshufb %xmm0, %xmm5, %xmm14
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3]
+; AVX512DQ-NEXT: vmovdqa64 %xmm31, %xmm5
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm5, %xmm14
+; AVX512DQ-NEXT: vmovdqa64 %xmm16, %xmm5
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm5, %xmm15
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,6],ymm4[7]
-; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm14
-; AVX512DQ-NEXT: vpshufb %xmm2, %xmm14, %xmm9
-; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512DQ-NEXT: vpshufb %xmm2, %xmm6, %xmm2
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3]
-; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512DQ-NEXT: vpshufb %xmm3, %xmm6, %xmm9
-; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512DQ-NEXT: vpshufb %xmm3, %xmm6, %xmm3
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5,6],ymm4[7]
+; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX512DQ-NEXT: vpshufb %xmm2, %xmm5, %xmm14
+; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm5
+; AVX512DQ-NEXT: vpshufb %xmm2, %xmm5, %xmm2
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
+; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX512DQ-NEXT: vpshufb %xmm3, %xmm5, %xmm14
+; AVX512DQ-NEXT: vmovdqa64 %xmm29, %xmm5
+; AVX512DQ-NEXT: vpshufb %xmm3, %xmm5, %xmm3
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3]
; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX512DQ-NEXT: vpshufb %xmm0, %xmm3, %xmm3
-; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512DQ-NEXT: vpshufb %xmm0, %xmm6, %xmm0
+; AVX512DQ-NEXT: vmovdqa64 %xmm27, %xmm4
+; AVX512DQ-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; AVX512DQ-NEXT: vmovdqa64 %xmm29, %xmm3
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm3, %xmm3
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm7, %xmm3
+; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm7
; AVX512DQ-NEXT: vpshufb %xmm1, %xmm7, %xmm1
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX512DQ-NEXT: vpsrlq $24, %zmm16, %zmm1
+; AVX512DQ-NEXT: vpsrlq $24, %zmm22, %zmm1
; AVX512DQ-NEXT: vpmovqb %zmm1, %xmm1
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12]
-; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512DQ-NEXT: vmovdqa64 %xmm18, %xmm1
; AVX512DQ-NEXT: vpshufb %xmm0, %xmm1, %xmm1
-; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX512DQ-NEXT: vpshufb %xmm0, %xmm2, %xmm2
+; AVX512DQ-NEXT: vpshufb %xmm0, %xmm6, %xmm2
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0]
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm10, %xmm3
-; AVX512DQ-NEXT: vmovdqa64 %xmm18, %xmm10
-; AVX512DQ-NEXT: vmovdqa64 %xmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm10, %xmm4
+; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm3
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm3, %xmm3
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm9, %xmm4
+; AVX512DQ-NEXT: vmovdqa64 %xmm19, %xmm28
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
; AVX512DQ-NEXT: vmovd {{.*#+}} xmm2 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm11
-; AVX512DQ-NEXT: vpshufb %xmm2, %xmm11, %xmm3
-; AVX512DQ-NEXT: vmovdqa64 %xmm28, %xmm7
-; AVX512DQ-NEXT: vpshufb %xmm2, %xmm7, %xmm8
+; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX512DQ-NEXT: vpshufb %xmm2, %xmm3, %xmm3
+; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm9
+; AVX512DQ-NEXT: vpshufb %xmm2, %xmm9, %xmm8
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
; AVX512DQ-NEXT: vmovd {{.*#+}} xmm3 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT: vmovdqa64 %xmm30, %xmm7
-; AVX512DQ-NEXT: vpshufb %xmm3, %xmm7, %xmm9
-; AVX512DQ-NEXT: vmovdqa64 %xmm31, %xmm12
+; AVX512DQ-NEXT: vpshufb %xmm3, %xmm13, %xmm14
; AVX512DQ-NEXT: vpshufb %xmm3, %xmm12, %xmm15
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3,4],ymm8[5],ymm14[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7]
-; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX512DQ-NEXT: vpshufb %xmm0, %xmm7, %xmm8
-; AVX512DQ-NEXT: vpshufb %xmm0, %xmm13, %xmm9
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
-; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm7, %xmm9
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm5, %xmm15
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
-; AVX512DQ-NEXT: vpsrlq $32, %zmm17, %zmm9
-; AVX512DQ-NEXT: vpmovqb %zmm9, %xmm9
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
+; AVX512DQ-NEXT: vpshufb %xmm0, %xmm11, %xmm8
+; AVX512DQ-NEXT: vmovdqa %xmm10, %xmm5
+; AVX512DQ-NEXT: vpshufb %xmm0, %xmm10, %xmm14
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm14[0],xmm8[0],xmm14[1],xmm8[1],xmm14[2],xmm8[2],xmm14[3],xmm8[3]
+; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm10, %xmm14
+; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm11, %xmm15
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm8 = xmm14[0,1,2],xmm8[3]
+; AVX512DQ-NEXT: vpsrlq $32, %zmm23, %zmm14
+; AVX512DQ-NEXT: vpmovqb %zmm14, %xmm14
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3]
; AVX512DQ-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm8
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm8 {%k1}
-; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX512DQ-NEXT: vpshufb %xmm0, %xmm13, %xmm4
-; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm5
-; AVX512DQ-NEXT: vpshufb %xmm0, %xmm5, %xmm9
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
-; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm5
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm5, %xmm9
-; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm5, %xmm15
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
+; AVX512DQ-NEXT: vmovdqa64 %xmm30, %xmm4
+; AVX512DQ-NEXT: vpshufb %xmm0, %xmm4, %xmm4
+; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX512DQ-NEXT: vpshufb %xmm0, %xmm6, %xmm14
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3]
+; AVX512DQ-NEXT: vmovdqa64 %xmm31, %xmm12
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm12, %xmm14
+; AVX512DQ-NEXT: vmovdqa64 %xmm16, %xmm13
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm13, %xmm15
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,6],ymm4[7]
-; AVX512DQ-NEXT: vpshufb %xmm2, %xmm14, %xmm9
-; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm20
-; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX512DQ-NEXT: vpshufb %xmm2, %xmm5, %xmm2
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3]
-; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX512DQ-NEXT: vpshufb %xmm3, %xmm5, %xmm9
-; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; AVX512DQ-NEXT: vpshufb %xmm3, %xmm14, %xmm3
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5,6],ymm4[7]
+; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX512DQ-NEXT: vpshufb %xmm2, %xmm13, %xmm14
+; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm13
+; AVX512DQ-NEXT: vpshufb %xmm2, %xmm13, %xmm2
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
+; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX512DQ-NEXT: vpshufb %xmm3, %xmm13, %xmm14
+; AVX512DQ-NEXT: vmovdqa64 %xmm29, %xmm13
+; AVX512DQ-NEXT: vpshufb %xmm3, %xmm13, %xmm3
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3]
; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
-; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX512DQ-NEXT: vpshufb %xmm0, %xmm7, %xmm3
-; AVX512DQ-NEXT: vpshufb %xmm0, %xmm6, %xmm0
-; AVX512DQ-NEXT: vmovdqa64 %xmm6, %xmm25
+; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX512DQ-NEXT: vpshufb %xmm0, %xmm3, %xmm3
+; AVX512DQ-NEXT: vmovdqa64 %xmm27, %xmm4
+; AVX512DQ-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; AVX512DQ-NEXT: vmovdqa64 %xmm29, %xmm3
+; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX512DQ-NEXT: vpshufb %xmm1, %xmm3, %xmm3
-; AVX512DQ-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm7, %xmm1
+; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm17
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX512DQ-NEXT: vpsrlq $32, %zmm16, %zmm1
-; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm18
+; AVX512DQ-NEXT: vpsrlq $32, %zmm22, %zmm1
; AVX512DQ-NEXT: vpmovqb %zmm1, %xmm1
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13]
-; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512DQ-NEXT: vpshufb %xmm0, %xmm1, %xmm1
+; AVX512DQ-NEXT: vmovdqa64 %xmm18, %xmm7
+; AVX512DQ-NEXT: vpshufb %xmm0, %xmm7, %xmm1
; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX512DQ-NEXT: vpshufb %xmm0, %xmm2, %xmm2
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm1 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0]
-; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm3
; AVX512DQ-NEXT: vpshufb %xmm1, %xmm3, %xmm3
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm10, %xmm4
+; AVX512DQ-NEXT: vmovdqa64 %xmm19, %xmm4
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm4, %xmm4
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
; AVX512DQ-NEXT: vmovd {{.*#+}} xmm2 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT: vpshufb %xmm2, %xmm11, %xmm3
-; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm27
-; AVX512DQ-NEXT: vmovdqa64 %xmm28, %xmm5
-; AVX512DQ-NEXT: vpshufb %xmm2, %xmm5, %xmm8
+; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX512DQ-NEXT: vpshufb %xmm2, %xmm3, %xmm3
+; AVX512DQ-NEXT: vpshufb %xmm2, %xmm9, %xmm8
+; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm31
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
; AVX512DQ-NEXT: vmovd {{.*#+}} xmm3 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT: vmovdqa64 %xmm30, %xmm10
-; AVX512DQ-NEXT: vpshufb %xmm3, %xmm10, %xmm9
-; AVX512DQ-NEXT: vpshufb %xmm3, %xmm12, %xmm15
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
+; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; AVX512DQ-NEXT: vpshufb %xmm3, %xmm9, %xmm14
+; AVX512DQ-NEXT: vmovdqa (%rsp), %xmm9 # 16-byte Reload
+; AVX512DQ-NEXT: vpshufb %xmm3, %xmm9, %xmm15
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3,4],ymm8[5],ymm14[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7]
-; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512DQ-NEXT: vpshufb %xmm0, %xmm6, %xmm8
-; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512DQ-NEXT: vpshufb %xmm0, %xmm6, %xmm9
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
-; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm6, %xmm9
-; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm11
+; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX512DQ-NEXT: vpshufb %xmm0, %xmm8, %xmm8
+; AVX512DQ-NEXT: vpshufb %xmm0, %xmm5, %xmm14
+; AVX512DQ-NEXT: vmovdqa64 %xmm5, %xmm29
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm14[0],xmm8[0],xmm14[1],xmm8[1],xmm14[2],xmm8[2],xmm14[3],xmm8[3]
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm10, %xmm14
+; AVX512DQ-NEXT: vmovdqa64 %xmm10, %xmm18
; AVX512DQ-NEXT: vpshufb %xmm1, %xmm11, %xmm15
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
-; AVX512DQ-NEXT: vpsrlq $40, %zmm17, %zmm9
-; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm23
-; AVX512DQ-NEXT: vpmovqb %zmm9, %xmm9
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
+; AVX512DQ-NEXT: vmovdqa64 %xmm11, %xmm21
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm8 = xmm14[0,1,2],xmm8[3]
+; AVX512DQ-NEXT: vpsrlq $40, %zmm23, %zmm14
+; AVX512DQ-NEXT: vpmovqb %zmm14, %xmm14
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3]
; AVX512DQ-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm8
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm8 {%k1}
-; AVX512DQ-NEXT: vpshufb %xmm0, %xmm13, %xmm4
-; AVX512DQ-NEXT: vmovdqa64 %xmm13, %xmm30
-; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm6
-; AVX512DQ-NEXT: vpshufb %xmm0, %xmm6, %xmm9
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
-; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm13
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm13, %xmm9
-; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm6, %xmm15
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
+; AVX512DQ-NEXT: vmovdqa64 %xmm30, %xmm11
+; AVX512DQ-NEXT: vpshufb %xmm0, %xmm11, %xmm4
+; AVX512DQ-NEXT: vpshufb %xmm0, %xmm6, %xmm14
+; AVX512DQ-NEXT: vmovdqa64 %xmm6, %xmm30
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3]
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm12, %xmm14
+; AVX512DQ-NEXT: vmovdqa64 %xmm12, %xmm26
+; AVX512DQ-NEXT: vmovdqa64 %xmm16, %xmm5
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm5, %xmm15
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,6],ymm4[7]
-; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm9
-; AVX512DQ-NEXT: vpshufb %xmm2, %xmm9, %xmm9
-; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; AVX512DQ-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5,6],ymm4[7]
+; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX512DQ-NEXT: vpshufb %xmm2, %xmm6, %xmm14
+; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm12
; AVX512DQ-NEXT: vpshufb %xmm2, %xmm12, %xmm2
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3]
-; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; AVX512DQ-NEXT: vpshufb %xmm3, %xmm15, %xmm9
-; AVX512DQ-NEXT: vmovdqa64 %xmm15, %xmm28
-; AVX512DQ-NEXT: vpshufb %xmm3, %xmm14, %xmm3
-; AVX512DQ-NEXT: vmovdqa64 %xmm14, %xmm16
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
+; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; AVX512DQ-NEXT: vpshufb %xmm3, %xmm9, %xmm14
+; AVX512DQ-NEXT: vpshufb %xmm3, %xmm13, %xmm3
+; AVX512DQ-NEXT: vmovdqa64 %xmm13, %xmm20
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3]
; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
-; AVX512DQ-NEXT: vpshufb %xmm0, %xmm7, %xmm3
-; AVX512DQ-NEXT: vmovdqa64 %xmm7, %xmm24
-; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm4
-; AVX512DQ-NEXT: vpshufb %xmm0, %xmm4, %xmm0
+; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX512DQ-NEXT: vpshufb %xmm0, %xmm13, %xmm3
+; AVX512DQ-NEXT: vmovdqa64 %xmm27, %xmm10
+; AVX512DQ-NEXT: vpshufb %xmm0, %xmm10, %xmm0
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; AVX512DQ-NEXT: vmovdqa64 %xmm29, %xmm7
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm7, %xmm3
-; AVX512DQ-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload
+; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm3, %xmm3
+; AVX512DQ-NEXT: vmovdqa64 %xmm17, %xmm4
; AVX512DQ-NEXT: vpshufb %xmm1, %xmm4, %xmm1
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX512DQ-NEXT: vpsrlq $40, %zmm18, %zmm1
-; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm26
+; AVX512DQ-NEXT: vpsrlq $40, %zmm22, %zmm1
; AVX512DQ-NEXT: vpmovqb %zmm1, %xmm1
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm0
-; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm27
; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14]
-; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX512DQ-NEXT: vpshufb %xmm0, %xmm2, %xmm1
-; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm29
+; AVX512DQ-NEXT: vpshufb %xmm0, %xmm7, %xmm1
+; AVX512DQ-NEXT: vmovdqa64 %xmm7, %xmm19
; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX512DQ-NEXT: vpshufb %xmm0, %xmm2, %xmm2
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm1 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0]
-; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm3, %xmm3
-; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm7
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm7, %xmm3
+; AVX512DQ-NEXT: vmovdqa64 %xmm28, %xmm4
; AVX512DQ-NEXT: vpshufb %xmm1, %xmm4, %xmm4
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
; AVX512DQ-NEXT: vmovd {{.*#+}} xmm2 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT: vmovdqa64 %xmm27, %xmm3
+; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX512DQ-NEXT: vpshufb %xmm2, %xmm3, %xmm3
-; AVX512DQ-NEXT: vpshufb %xmm2, %xmm5, %xmm8
-; AVX512DQ-NEXT: vmovdqa64 %xmm5, %xmm21
+; AVX512DQ-NEXT: vmovdqa64 %xmm31, %xmm8
+; AVX512DQ-NEXT: vpshufb %xmm2, %xmm8, %xmm8
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
; AVX512DQ-NEXT: vmovd {{.*#+}} xmm3 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT: vpshufb %xmm3, %xmm10, %xmm9
-; AVX512DQ-NEXT: vmovdqa64 %xmm10, %xmm18
-; AVX512DQ-NEXT: vmovdqa64 %xmm31, %xmm5
-; AVX512DQ-NEXT: vpshufb %xmm3, %xmm5, %xmm15
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
+; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; AVX512DQ-NEXT: vpshufb %xmm3, %xmm14, %xmm14
+; AVX512DQ-NEXT: vmovdqa (%rsp), %xmm15 # 16-byte Reload
+; AVX512DQ-NEXT: vpshufb %xmm3, %xmm15, %xmm15
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3,4],ymm8[5],ymm14[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7]
-; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX512DQ-NEXT: vpshufb %xmm0, %xmm5, %xmm8
-; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX512DQ-NEXT: vpshufb %xmm0, %xmm9, %xmm9
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
-; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm10, %xmm9
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm11, %xmm15
-; AVX512DQ-NEXT: vmovdqa64 %xmm11, %xmm22
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
-; AVX512DQ-NEXT: vpsrlq $48, %zmm17, %zmm9
-; AVX512DQ-NEXT: vpmovqb %zmm9, %xmm9
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
+; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX512DQ-NEXT: vpshufb %xmm0, %xmm8, %xmm8
+; AVX512DQ-NEXT: vmovdqa64 %xmm29, %xmm14
+; AVX512DQ-NEXT: vpshufb %xmm0, %xmm14, %xmm14
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm14[0],xmm8[0],xmm14[1],xmm8[1],xmm14[2],xmm8[2],xmm14[3],xmm8[3]
+; AVX512DQ-NEXT: vmovdqa64 %xmm18, %xmm14
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm14, %xmm14
+; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm15
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm15, %xmm15
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm8 = xmm14[0,1,2],xmm8[3]
+; AVX512DQ-NEXT: vpsrlq $48, %zmm23, %zmm14
+; AVX512DQ-NEXT: vpmovqb %zmm14, %xmm14
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3]
; AVX512DQ-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm8
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm8 {%k1}
-; AVX512DQ-NEXT: vmovdqa64 %xmm30, %xmm4
-; AVX512DQ-NEXT: vpshufb %xmm0, %xmm4, %xmm4
-; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX512DQ-NEXT: vpshufb %xmm0, %xmm9, %xmm9
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm13, %xmm9
-; AVX512DQ-NEXT: vmovdqa64 %xmm13, %xmm19
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm6, %xmm15
-; AVX512DQ-NEXT: vmovdqa64 %xmm6, %xmm17
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
+; AVX512DQ-NEXT: vpshufb %xmm0, %xmm11, %xmm4
+; AVX512DQ-NEXT: vmovdqa64 %xmm11, %xmm25
+; AVX512DQ-NEXT: vmovdqa64 %xmm30, %xmm11
+; AVX512DQ-NEXT: vpshufb %xmm0, %xmm11, %xmm14
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm14[0],xmm4[0],xmm14[1],xmm4[1],xmm14[2],xmm4[2],xmm14[3],xmm4[3]
+; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm14
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm14, %xmm14
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm5, %xmm15
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,6],ymm4[7]
-; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm6
-; AVX512DQ-NEXT: vpshufb %xmm2, %xmm6, %xmm9
+; AVX512DQ-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5,6],ymm4[7]
+; AVX512DQ-NEXT: vpshufb %xmm2, %xmm6, %xmm14
+; AVX512DQ-NEXT: vmovdqa64 %xmm6, %xmm30
; AVX512DQ-NEXT: vpshufb %xmm2, %xmm12, %xmm2
-; AVX512DQ-NEXT: vmovdqa64 %xmm12, %xmm20
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3]
-; AVX512DQ-NEXT: vmovdqa64 %xmm28, %xmm14
-; AVX512DQ-NEXT: vpshufb %xmm3, %xmm14, %xmm9
-; AVX512DQ-NEXT: vmovdqa64 %xmm16, %xmm13
-; AVX512DQ-NEXT: vpshufb %xmm3, %xmm13, %xmm3
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3]
+; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm6
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
+; AVX512DQ-NEXT: vpshufb %xmm3, %xmm9, %xmm14
+; AVX512DQ-NEXT: vmovdqa64 %xmm9, %xmm24
+; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm5
+; AVX512DQ-NEXT: vpshufb %xmm3, %xmm5, %xmm3
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3]
; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
-; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm15
-; AVX512DQ-NEXT: vpshufb %xmm0, %xmm15, %xmm3
-; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm12
-; AVX512DQ-NEXT: vpshufb %xmm0, %xmm12, %xmm0
+; AVX512DQ-NEXT: vpshufb %xmm0, %xmm13, %xmm3
+; AVX512DQ-NEXT: vmovdqa64 %xmm13, %xmm20
+; AVX512DQ-NEXT: vpshufb %xmm0, %xmm10, %xmm0
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm7, %xmm3
-; AVX512DQ-NEXT: vmovdqa64 %xmm7, %xmm16
-; AVX512DQ-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm7, %xmm1
+; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm15, %xmm3
+; AVX512DQ-NEXT: vmovdqa64 %xmm17, %xmm9
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm9, %xmm1
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX512DQ-NEXT: vpsrlq $48, %zmm26, %zmm1
+; AVX512DQ-NEXT: vpsrlq $48, %zmm22, %zmm1
; AVX512DQ-NEXT: vpmovqb %zmm1, %xmm1
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm24
+; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm8, %zmm17
; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15]
-; AVX512DQ-NEXT: vmovdqa64 %xmm29, %xmm1
+; AVX512DQ-NEXT: vmovdqa64 %xmm19, %xmm1
; AVX512DQ-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX512DQ-NEXT: vpshufb %xmm0, %xmm2, %xmm2
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX512DQ-NEXT: vpbroadcastd {{.*#+}} xmm1 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0]
-; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm3, %xmm3
-; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm7, %xmm3
+; AVX512DQ-NEXT: vmovdqa64 %xmm28, %xmm4
; AVX512DQ-NEXT: vpshufb %xmm1, %xmm4, %xmm4
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
; AVX512DQ-NEXT: vmovd {{.*#+}} xmm2 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT: vmovdqa64 %xmm27, %xmm3
+; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX512DQ-NEXT: vpshufb %xmm2, %xmm3, %xmm3
-; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm8
-; AVX512DQ-NEXT: vpshufb %xmm2, %xmm8, %xmm8
+; AVX512DQ-NEXT: vmovdqa64 %xmm31, %xmm7
+; AVX512DQ-NEXT: vpshufb %xmm2, %xmm7, %xmm8
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
; AVX512DQ-NEXT: vmovd {{.*#+}} xmm3 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT: vmovdqa64 %xmm18, %xmm9
-; AVX512DQ-NEXT: vpshufb %xmm3, %xmm9, %xmm9
-; AVX512DQ-NEXT: vmovdqa64 %xmm31, %xmm11
-; AVX512DQ-NEXT: vpshufb %xmm3, %xmm11, %xmm11
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
+; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX512DQ-NEXT: vpshufb %xmm3, %xmm7, %xmm13
+; AVX512DQ-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload
+; AVX512DQ-NEXT: vpshufb %xmm3, %xmm7, %xmm14
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3]
; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
-; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3,4],ymm8[5],ymm13[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7]
-; AVX512DQ-NEXT: vpshufb %xmm0, %xmm5, %xmm8
-; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX512DQ-NEXT: vpshufb %xmm0, %xmm5, %xmm9
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm10, %xmm9
-; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm5
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm5, %xmm11
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3]
-; AVX512DQ-NEXT: vpsrlq $56, %zmm23, %zmm9
-; AVX512DQ-NEXT: vpmovqb %zmm9, %xmm9
-; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
+; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX512DQ-NEXT: vpshufb %xmm0, %xmm7, %xmm8
+; AVX512DQ-NEXT: vmovdqa64 %xmm29, %xmm7
+; AVX512DQ-NEXT: vpshufb %xmm0, %xmm7, %xmm12
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3]
+; AVX512DQ-NEXT: vmovdqa64 %xmm18, %xmm7
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm7, %xmm12
+; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm7
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm7, %xmm13
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm8 = xmm12[0,1,2],xmm8[3]
+; AVX512DQ-NEXT: vpsrlq $56, %zmm23, %zmm12
+; AVX512DQ-NEXT: vpmovqb %zmm12, %xmm12
+; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm8 = xmm12[0,1],xmm8[2,3]
; AVX512DQ-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm8
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm8 {%k1}
-; AVX512DQ-NEXT: vmovdqa64 %xmm30, %xmm4
+; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm4
; AVX512DQ-NEXT: vpshufb %xmm0, %xmm4, %xmm4
-; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX512DQ-NEXT: vpshufb %xmm0, %xmm5, %xmm9
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3]
-; AVX512DQ-NEXT: vmovdqa64 %xmm19, %xmm5
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm5, %xmm9
-; AVX512DQ-NEXT: vmovdqa64 %xmm17, %xmm5
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm5, %xmm11
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
+; AVX512DQ-NEXT: vpshufb %xmm0, %xmm11, %xmm11
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3]
+; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm7
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm7, %xmm11
+; AVX512DQ-NEXT: vmovdqa64 %xmm16, %xmm7
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm7, %xmm12
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,6],ymm4[7]
-; AVX512DQ-NEXT: vpshufb %xmm2, %xmm6, %xmm6
-; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm5
-; AVX512DQ-NEXT: vpshufb %xmm2, %xmm5, %xmm2
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
-; AVX512DQ-NEXT: vpshufb %xmm3, %xmm14, %xmm5
-; AVX512DQ-NEXT: vpshufb %xmm3, %xmm13, %xmm3
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3,4,5,6],ymm4[7]
+; AVX512DQ-NEXT: vmovdqa64 %xmm30, %xmm7
+; AVX512DQ-NEXT: vpshufb %xmm2, %xmm7, %xmm7
+; AVX512DQ-NEXT: vpshufb %xmm2, %xmm6, %xmm2
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3]
+; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm6
+; AVX512DQ-NEXT: vpshufb %xmm3, %xmm6, %xmm6
+; AVX512DQ-NEXT: vpshufb %xmm3, %xmm5, %xmm3
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3]
; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
-; AVX512DQ-NEXT: vpshufb %xmm0, %xmm15, %xmm3
-; AVX512DQ-NEXT: vpshufb %xmm0, %xmm12, %xmm0
+; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm3
+; AVX512DQ-NEXT: vpshufb %xmm0, %xmm3, %xmm3
+; AVX512DQ-NEXT: vpshufb %xmm0, %xmm10, %xmm0
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; AVX512DQ-NEXT: vmovdqa64 %xmm16, %xmm3
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm3, %xmm3
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm7, %xmm1
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm15, %xmm3
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm9, %xmm1
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX512DQ-NEXT: vpsrlq $56, %zmm26, %zmm1
+; AVX512DQ-NEXT: vpsrlq $56, %zmm22, %zmm1
; AVX512DQ-NEXT: vpmovqb %zmm1, %xmm1
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
@@ -14956,1095 +15029,1107 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-NEXT: vmovaps %zmm1, (%r9)
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-NEXT: vmovaps %zmm1, (%rax)
+; AVX512DQ-NEXT: vmovdqa64 %zmm27, (%rax)
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-NEXT: vmovdqa64 %zmm24, (%rax)
+; AVX512DQ-NEXT: vmovdqa64 %zmm17, (%rax)
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rax)
-; AVX512DQ-NEXT: addq $520, %rsp # imm = 0x208
+; AVX512DQ-NEXT: addq $408, %rsp # imm = 0x198
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: load_i8_stride8_vf64:
; AVX512DQ-FCP: # %bb.0:
-; AVX512DQ-FCP-NEXT: subq $232, %rsp
-; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm29
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-FCP-NEXT: subq $584, %rsp # imm = 0x248
+; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm8
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm13
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm28
+; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm12
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm30
+; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm31
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6]
-; AVX512DQ-FCP-NEXT: vmovdqa 480(%rdi), %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm3
-; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm23
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-FCP-NEXT: vmovdqa 448(%rdi), %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm5
-; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm5, %ymm2
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %ymm22
+; AVX512DQ-FCP-NEXT: vpermd %ymm22, %ymm0, %ymm3
+; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm21
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [8,10,10,11,8,10,12,14]
+; AVX512DQ-FCP-NEXT: vpermd %zmm31, %zmm2, %zmm5
+; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm5, %ymm3
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm15
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7]
; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-FCP-NEXT: vmovdqa 416(%rdi), %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm4
-; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm22
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-FCP-NEXT: vmovdqa 384(%rdi), %ymm3
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm0, %ymm7
-; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm7, %ymm3
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm7, %ymm19
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa 352(%rdi), %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpmovqd %ymm2, %xmm4
+; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %ymm23
+; AVX512DQ-FCP-NEXT: vpermd %ymm23, %ymm0, %ymm5
+; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm3
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm19
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-FCP-NEXT: vpermd %zmm30, %zmm2, %zmm4
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm4
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %ymm29
+; AVX512DQ-FCP-NEXT: vpmovqd %ymm29, %xmm5
; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm4, %xmm2
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm17
-; AVX512DQ-FCP-NEXT: vmovdqa 320(%rdi), %ymm3
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpmovqd %ymm3, %xmm12
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm12, %xmm3
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm12, %xmm21
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3]
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm29, %xmm3
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm12
+; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm5, %xmm3
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm5, %xmm17
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [8,10,12,14,12,14,14,15]
+; AVX512DQ-FCP-NEXT: vpermd %zmm12, %zmm20, %zmm4
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3]
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm28, %xmm4
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm12
; AVX512DQ-FCP-NEXT: movb $-64, %al
; AVX512DQ-FCP-NEXT: kmovw %eax, %k1
; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm12 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm2
-; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm16
-; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm31
-; AVX512DQ-FCP-NEXT: vpermd %ymm31, %ymm0, %ymm3
-; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm3, %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm20
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 160(%rdi), %ymm25
-; AVX512DQ-FCP-NEXT: vpermd %ymm25, %ymm0, %ymm10
+; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm24
+; AVX512DQ-FCP-NEXT: vpermd %ymm24, %ymm0, %ymm3
+; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm16
+; AVX512DQ-FCP-NEXT: vpermd %zmm13, %zmm2, %zmm3
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm3
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7]
; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %ymm26
-; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm10, %ymm14
-; AVX512DQ-FCP-NEXT: vpermd %ymm26, %ymm0, %ymm9
-; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm9, %ymm0
+; AVX512DQ-FCP-NEXT: vpermd %ymm26, %ymm0, %ymm3
+; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm18
+; AVX512DQ-FCP-NEXT: vpermd %zmm8, %zmm2, %zmm2
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm14
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 96(%rdi), %ymm27
-; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm28
-; AVX512DQ-FCP-NEXT: vpmovqd %ymm27, %xmm8
-; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm8, %xmm15
-; AVX512DQ-FCP-NEXT: vpmovqd %ymm28, %xmm1
-; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm7
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm18
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1,2],xmm15[3]
-; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm30
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm30, %xmm15
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm15[0,1],xmm7[2,3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm14[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm7
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm6
-; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm6, %ymm7
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm5, %ymm12
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm23
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm14, %ymm24
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5,6],ymm7[7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm27
+; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm25
+; AVX512DQ-FCP-NEXT: vpmovqd %ymm25, %xmm13
+; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm13, %xmm10
+; AVX512DQ-FCP-NEXT: vpermd %zmm27, %zmm20, %zmm1
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm6
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm10[0,1,2],xmm6[3]
+; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm20
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm20, %xmm10
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm10[0,1],xmm6[2,3]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm14[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm12, %zmm6
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm0 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm4
+; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm6
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm11
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm11, %ymm10
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm10[7]
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm12 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm8
+; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm8, %ymm10
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm12, %ymm19
; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm3
-; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm12
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm14, %ymm22
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm5
-; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm5, %ymm14
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5],ymm7[6,7]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm11 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm17, %xmm2
-; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm12
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm21, %xmm4
-; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm4, %xmm14
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3]
-; AVX512DQ-FCP-NEXT: vpsrlq $8, %zmm29, %zmm14
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm9, %ymm12
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5],ymm10[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4,5],ymm6[6,7]
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm17, %xmm5
+; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm10
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm17
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm15 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm7 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm7, %xmm12
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1,2],xmm12[3]
+; AVX512DQ-FCP-NEXT: vpsrlq $8, %zmm28, %zmm12
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm12, %xmm12
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3]
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm0, %zmm10
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm10 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm2
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm6
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm12
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm12[7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm2
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm3
+; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm12
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm14
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm14[5],ymm12[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3,4,5],ymm6[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm17, %xmm12
+; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm13, %xmm12
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm13, %xmm17
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm13, %xmm14
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1,2],xmm14[3]
+; AVX512DQ-FCP-NEXT: vpsrlq $8, %zmm20, %zmm14
; AVX512DQ-FCP-NEXT: vpmovqb %zmm14, %xmm14
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm12
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm12 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm7
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm14
-; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm1, %ymm14
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5,6],ymm7[7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm14
-; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm10, %ymm14
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm6
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm6
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm19
; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm10, %ymm16
-; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm9, %ymm15
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm9, %ymm17
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5],ymm7[6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm8, %xmm14
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm8, %xmm11
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm18, %xmm10
-; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm10, %xmm15
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3]
-; AVX512DQ-FCP-NEXT: vpsrlq $8, %zmm30, %zmm15
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm15, %xmm15
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3],ymm7[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm7
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm13 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm6, %ymm7
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm18
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm4 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm11, %ymm10
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm11
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm10[7]
; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm6
-; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm6, %ymm12
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5,6],ymm7[7]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm12
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm22
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm5, %ymm14
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1,2,3,4,5],ymm7[6,7]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm5 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm12
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm20
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm4, %xmm14
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3]
-; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm29, %zmm14
+; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm8, %ymm10
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm8, %ymm21
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm4 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm12
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5],ymm10[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4,5],ymm6[6,7]
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm8 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm10
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm14 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm7, %xmm12
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm14, %xmm7
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1,2],xmm12[3]
+; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm28, %zmm12
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm12, %xmm12
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3]
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm0, %zmm10
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm10 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm6
+; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm6
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm16
+; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm1, %ymm12
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm11
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm12[7]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm3, %ymm12
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm15
+; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm14
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm14[5],ymm12[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3,4,5],ymm6[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm17, %xmm1
+; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm12
+; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm13, %xmm14
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1,2],xmm14[3]
+; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm20, %zmm14
; AVX512DQ-FCP-NEXT: vpmovqb %zmm14, %xmm14
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm12
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm12 {%k1}
-; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm0, %ymm7
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm13
-; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm1, %ymm14
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm23
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5,6],ymm7[7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm2
-; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm14
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm1
-; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm1, %ymm15
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3,4,5],ymm7[6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm11, %xmm14
-; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm10, %xmm15
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3]
-; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm30, %zmm15
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm15, %xmm15
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0,1,2,3],ymm7[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm12, %zmm21
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm3
-; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm7
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm10, %zmm18
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm2
+; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm6
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm4 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm8
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7]
; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm12 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm6, %ymm9
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5,6],ymm7[7]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm3
-; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm8
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm3
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm2
+; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm7
; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm6
-; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm6, %ymm9
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm6
-; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm6, %xmm8
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm8
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5],ymm7[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7]
; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm6
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm8[3]
-; AVX512DQ-FCP-NEXT: vpsrlq $24, %zmm29, %zmm8
+; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm7
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm5 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm8
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3]
+; AVX512DQ-FCP-NEXT: vpsrlq $24, %zmm28, %zmm8
; AVX512DQ-FCP-NEXT: vpmovqb %zmm8, %xmm8
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm6
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm6 {%k1}
-; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm13, %ymm5
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm4
-; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm4, %ymm4
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm3
-; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm1, %ymm2
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm0, %zmm7
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm7 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm2
+; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm5
+; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm11, %ymm4
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm15, %ymm3
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm11, %xmm1
-; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm10, %xmm0
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
-; AVX512DQ-FCP-NEXT: vpsrlq $24, %zmm30, %zmm1
+; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm13, %xmm0
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
+; AVX512DQ-FCP-NEXT: vpsrlq $24, %zmm20, %zmm1
; AVX512DQ-FCP-NEXT: vpmovqb %zmm1, %xmm1
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm18
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7]
-; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm1
-; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm16
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,3,2,3,1,3,5,7]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [9,11,10,11,9,11,13,15]
+; AVX512DQ-FCP-NEXT: vpermd %ymm22, %ymm6, %ymm15
; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm10, %ymm2
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-FCP-NEXT: vpermd (%rsp), %ymm0, %ymm3 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm19
-; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm12 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm6, %ymm3
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm22
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [1,3,5,7,5,7,6,7]
-; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm16, %ymm8 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm15 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm8, %xmm2
-; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm16, %ymm7 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm15, %ymm2
+; AVX512DQ-FCP-NEXT: vpermd %zmm31, %zmm1, %zmm4
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm0 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm3
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm11
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
+; AVX512DQ-FCP-NEXT: vpermd %ymm23, %ymm6, %ymm4
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm3
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm21
+; AVX512DQ-FCP-NEXT: vpermd %zmm30, %zmm1, %zmm7
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm30 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm4
+; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm7, %ymm4
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [9,11,13,15,13,15,14,15]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm23 = [1,3,5,7,5,7,6,7]
+; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm3 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm12 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm3
+; AVX512DQ-FCP-NEXT: vpermd %ymm29, %ymm23, %ymm10
; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm7, %xmm3
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3]
-; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm29, %zmm3
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm3, %xmm3
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm17
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm17 {%k1}
-; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm23
-; AVX512DQ-FCP-NEXT: vpermd %ymm31, %ymm0, %ymm4
+; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm10, %xmm4
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm10, %ymm22
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3]
+; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm28, %zmm4
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm4, %xmm4
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm19
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm19 {%k1}
+; AVX512DQ-FCP-NEXT: vpermd %ymm24, %ymm6, %ymm4
; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm2
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-FCP-NEXT: vpermd %ymm25, %ymm0, %ymm3
-; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm24
-; AVX512DQ-FCP-NEXT: vpermd %ymm26, %ymm0, %ymm6
-; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm6, %ymm0
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-FCP-NEXT: vpermd %ymm27, %ymm16, %ymm13
-; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm13, %xmm0
-; AVX512DQ-FCP-NEXT: vpermd %ymm28, %ymm16, %ymm12
-; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm12, %xmm15
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3]
-; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm30, %zmm15
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm15, %xmm15
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm16
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm9, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm5
+; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm5 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm3
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm3[7]
+; AVX512DQ-FCP-NEXT: vpermd %ymm26, %ymm6, %ymm2
+; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm6
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm24
+; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm13 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm1
+; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm13, %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4],ymm1[5],ymm6[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-FCP-NEXT: vpermd %zmm27, %zmm17, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT: vpermd %ymm25, %ymm23, %ymm1
+; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm1, %xmm14
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm23
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm14[0,1,2],xmm0[3]
+; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm20, %zmm14
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm14, %xmm14
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm14[0,1],xmm0[2,3]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm19
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm12 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm15, %ymm0
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm11, %ymm6
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7]
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm3
+; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm6
; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm10, %ymm25
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm20 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm10
-; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm10, %ymm2
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm17 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm15
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm1
-; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm1, %ymm15
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3,4],ymm2[5],ymm15[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm19 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm2
-; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm8, %xmm2
+; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm7, %ymm14
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm14[5],ymm6[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm17 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm17, %xmm6
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm6
; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm7, %xmm15
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm15[0,1,2],xmm2[3]
-; AVX512DQ-FCP-NEXT: vpsrlq $40, %zmm29, %zmm15
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm15, %xmm15
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm15[0,1],xmm2[2,3]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm3
-; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm15
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm1
+; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm14
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm14[0,1,2],xmm6[3]
+; AVX512DQ-FCP-NEXT: vpsrlq $40, %zmm28, %zmm14
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm14, %xmm14
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm14[0,1],xmm6[2,3]
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm6
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm6 {%k1}
+; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm4, %ymm0
; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm22
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm11
+; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm5, %ymm14
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm14[7]
; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm4
-; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm15
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm11
-; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm6, %ymm14
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm23
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5],ymm14[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm11
-; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm13, %xmm14
-; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm12, %xmm15
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3]
-; AVX512DQ-FCP-NEXT: vpsrlq $40, %zmm30, %zmm15
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm15, %xmm15
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm19
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm5, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm26
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm5
-; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm5, %ymm2
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm10, %ymm24
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm17 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm14
-; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm1, %ymm14
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm25
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3,4],ymm2[5],ymm14[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm20 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm2
-; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm8, %xmm2
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm7, %xmm14
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3]
-; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm29, %zmm14
+; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm14
+; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm13, %ymm12
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm17, %xmm8
+; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm13 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm13, %xmm12
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm8
+; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm8, %xmm14
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3]
+; AVX512DQ-FCP-NEXT: vpsrlq $40, %zmm20, %zmm14
; AVX512DQ-FCP-NEXT: vpmovqb %zmm14, %xmm14
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3]
-; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 {%k1}
-; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm3, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm6
-; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm6, %ymm14
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm14
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm9
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm21
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm15, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm15, %ymm25
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm15, %ymm6
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7]
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm3, %ymm6
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm24
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm7, %ymm12
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm12[5],ymm6[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm17 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm17, %xmm6
+; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm6
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm12
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm26
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm12[0,1,2],xmm6[3]
+; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm28, %zmm12
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm12, %xmm12
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm12[0,1],xmm6[2,3]
+; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm6
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm6 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm3
+; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm5, %ymm12
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm12[7]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm12
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm2, %ymm14
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm14[5],ymm12[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm17, %xmm8
+; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm13, %xmm12
; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm1
-; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm15
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm9
-; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm13, %xmm14
-; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm12, %xmm15
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3]
-; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm30, %zmm15
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm15, %xmm15
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm17
+; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm14
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3]
+; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm20, %zmm14
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm14, %xmm14
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm17
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm12 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm6
; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm0, %ymm2
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm11
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3,4,5,6],ymm2[7]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm15, %ymm10
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm10[7]
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm9
; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm5
-; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm5, %ymm10
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm5
-; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm5, %ymm9
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5],ymm9[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm2[6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm7, %ymm8
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7]
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm0 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm7, %xmm7
; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm9 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm8
; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm8, %xmm8
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} xmm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm7, %xmm7
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3]
-; AVX512DQ-FCP-NEXT: vpsrlq $56, %zmm29, %zmm8
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1,2],xmm7[3]
+; AVX512DQ-FCP-NEXT: vpsrlq $56, %zmm28, %zmm8
; AVX512DQ-FCP-NEXT: vpmovqb %zmm8, %xmm8
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm7, %zmm0, %zmm7
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm7 {%k1}
-; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm3, %ymm2
-; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm6, %ymm6
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,6],ymm2[7]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm5
-; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm1, %ymm4
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm13, %xmm3
-; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm12, %xmm1
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3]
-; AVX512DQ-FCP-NEXT: vpsrlq $56, %zmm30, %zmm3
-; AVX512DQ-FCP-NEXT: vpmovqb %zmm3, %xmm3
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
-; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm7 {%k1}
+; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm3, %ymm6
+; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm5, %ymm5
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm4
+; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm2, %ymm3
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm13, %xmm2
+; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3]
+; AVX512DQ-FCP-NEXT: vpsrlq $56, %zmm20, %zmm2
+; AVX512DQ-FCP-NEXT: vpmovqb %zmm2, %xmm2
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
+; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm7, %zmm1
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm2, (%rsi)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm2, (%rdx)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, (%rcx)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, (%r8)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, (%r9)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, (%rcx)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, (%r8)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, (%r9)
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, (%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, (%rax)
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, (%rax)
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%rax)
-; AVX512DQ-FCP-NEXT: addq $232, %rsp
+; AVX512DQ-FCP-NEXT: addq $584, %rsp # imm = 0x248
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
; AVX512BW-LABEL: load_i8_stride8_vf64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: subq $744, %rsp # imm = 0x2E8
-; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm5
-; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm0
-; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0
-; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm1
-; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm2
-; AVX512BW-NEXT: vpmovqb %zmm2, %xmm2
-; AVX512BW-NEXT: vmovdqa64 496(%rdi), %xmm24
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm12 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
-; AVX512BW-NEXT: vpshufb %xmm12, %xmm24, %xmm3
-; AVX512BW-NEXT: vmovdqa64 480(%rdi), %xmm25
-; AVX512BW-NEXT: vpshufb %xmm12, %xmm25, %xmm4
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512BW-NEXT: vmovdqa64 464(%rdi), %xmm26
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm19 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
-; AVX512BW-NEXT: vpshufb %xmm19, %xmm26, %xmm4
-; AVX512BW-NEXT: vmovdqa64 448(%rdi), %xmm30
-; AVX512BW-NEXT: vpshufb %xmm19, %xmm30, %xmm6
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
-; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
-; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512BW-NEXT: vmovdqa 384(%rdi), %ymm4
-; AVX512BW-NEXT: vpmovqb %ymm4, %xmm4
+; AVX512BW-NEXT: subq $760, %rsp # imm = 0x2F8
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm3
+; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm19
+; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm9
+; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm29
+; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm6
+; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm4
+; AVX512BW-NEXT: vmovdqa 464(%rdi), %xmm2
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm14 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
+; AVX512BW-NEXT: vpshufb %xmm14, %xmm2, %xmm5
+; AVX512BW-NEXT: vmovdqa 448(%rdi), %xmm1
+; AVX512BW-NEXT: vpshufb %xmm14, %xmm1, %xmm7
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3]
+; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512BW-NEXT: vmovdqa 480(%rdi), %xmm10
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm15 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
+; AVX512BW-NEXT: vpshufb %xmm15, %xmm10, %xmm7
+; AVX512BW-NEXT: vpmovqb %zmm4, %xmm4
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3]
; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7]
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3,4,5],ymm3[6,7]
-; AVX512BW-NEXT: vmovdqa64 368(%rdi), %xmm31
-; AVX512BW-NEXT: vpshufb %xmm12, %xmm31, %xmm4
-; AVX512BW-NEXT: vmovdqa64 %xmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-NEXT: vmovdqa64 352(%rdi), %xmm27
-; AVX512BW-NEXT: vpshufb %xmm12, %xmm27, %xmm6
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
-; AVX512BW-NEXT: vmovdqa64 336(%rdi), %xmm22
-; AVX512BW-NEXT: vpshufb %xmm19, %xmm22, %xmm11
-; AVX512BW-NEXT: vmovdqa64 %xmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-NEXT: vmovdqa 320(%rdi), %xmm9
-; AVX512BW-NEXT: vpshufb %xmm19, %xmm9, %xmm15
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3]
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3]
-; AVX512BW-NEXT: vpmovqb %zmm1, %xmm11
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm15
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3]
-; AVX512BW-NEXT: vinserti32x4 $2, %xmm10, %zmm0, %zmm20
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7]
+; AVX512BW-NEXT: vmovdqa 384(%rdi), %ymm5
+; AVX512BW-NEXT: vpmovqb %ymm5, %xmm5
+; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512BW-NEXT: vmovdqa 416(%rdi), %ymm7
+; AVX512BW-NEXT: vpmovqb %ymm7, %xmm8
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,0,1]
+; AVX512BW-NEXT: vpmovqb %ymm7, %xmm7
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
+; AVX512BW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm7[5],ymm5[6,7]
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0,1,2,3,4,5],ymm4[6,7]
+; AVX512BW-NEXT: vmovdqa64 336(%rdi), %xmm31
+; AVX512BW-NEXT: vpshufb %xmm14, %xmm31, %xmm8
+; AVX512BW-NEXT: vmovdqa64 %xmm31, (%rsp) # 16-byte Spill
+; AVX512BW-NEXT: vmovdqa64 320(%rdi), %xmm23
+; AVX512BW-NEXT: vpshufb %xmm14, %xmm23, %xmm11
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3]
+; AVX512BW-NEXT: vmovdqa 352(%rdi), %xmm4
+; AVX512BW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-NEXT: vpshufb %xmm15, %xmm4, %xmm13
+; AVX512BW-NEXT: vpmovqb %zmm6, %xmm6
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm6 = xmm11[0,1,2],xmm6[3]
+; AVX512BW-NEXT: vpmovqb %zmm29, %xmm11
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm6 = xmm11[0,1],xmm6[2,3]
+; AVX512BW-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm16
; AVX512BW-NEXT: movb $-64, %al
; AVX512BW-NEXT: kmovd %eax, %k1
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm20 {%k1}
-; AVX512BW-NEXT: vmovdqa64 240(%rdi), %xmm28
-; AVX512BW-NEXT: vpshufb %xmm12, %xmm28, %xmm7
-; AVX512BW-NEXT: vmovdqa64 224(%rdi), %xmm18
-; AVX512BW-NEXT: vpshufb %xmm12, %xmm18, %xmm10
-; AVX512BW-NEXT: vmovdqa64 %xmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3]
-; AVX512BW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512BW-NEXT: vmovdqa64 208(%rdi), %xmm17
-; AVX512BW-NEXT: vpshufb %xmm19, %xmm17, %xmm10
-; AVX512BW-NEXT: vmovdqa64 %xmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-NEXT: vmovdqa 192(%rdi), %xmm8
-; AVX512BW-NEXT: vpshufb %xmm19, %xmm8, %xmm16
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm16 {%k1}
+; AVX512BW-NEXT: vmovdqa 208(%rdi), %xmm0
+; AVX512BW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-NEXT: vpshufb %xmm14, %xmm0, %xmm11
+; AVX512BW-NEXT: vmovdqa 192(%rdi), %xmm0
+; AVX512BW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-NEXT: vpshufb %xmm14, %xmm0, %xmm13
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3]
+; AVX512BW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512BW-NEXT: vmovdqa64 224(%rdi), %xmm30
+; AVX512BW-NEXT: vpshufb %xmm15, %xmm30, %xmm17
+; AVX512BW-NEXT: vpmovqb %zmm9, %xmm9
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm17[0],xmm9[0],xmm17[1],xmm9[1],xmm17[2],xmm9[2],xmm17[3],xmm9[3]
+; AVX512BW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7]
+; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm11
+; AVX512BW-NEXT: vpmovqb %ymm11, %xmm11
+; AVX512BW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512BW-NEXT: vmovdqa64 160(%rdi), %ymm17
+; AVX512BW-NEXT: vpmovqb %ymm17, %xmm18
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm17 = ymm17[2,3,0,1]
+; AVX512BW-NEXT: vpmovqb %ymm17, %xmm17
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm17 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3]
+; AVX512BW-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm12
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm12[5],ymm11[6,7]
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm11[0,1,2,3,4,5],ymm9[6,7]
+; AVX512BW-NEXT: vmovdqa 80(%rdi), %xmm13
+; AVX512BW-NEXT: vpshufb %xmm14, %xmm13, %xmm17
+; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm0
+; AVX512BW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-NEXT: vpshufb %xmm14, %xmm0, %xmm14
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm17[0],xmm14[1],xmm17[1],xmm14[2],xmm17[2],xmm14[3],xmm17[3]
+; AVX512BW-NEXT: vmovdqa64 96(%rdi), %xmm20
+; AVX512BW-NEXT: vpshufb %xmm15, %xmm20, %xmm15
+; AVX512BW-NEXT: vmovdqa64 %xmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-NEXT: vpmovqb %zmm19, %xmm17
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm15[3]
+; AVX512BW-NEXT: vpmovqb %zmm3, %xmm15
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7]
+; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm16, %zmm0
+; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT: vmovdqa 128(%rdi), %xmm5
+; AVX512BW-NEXT: vmovdqa 160(%rdi), %xmm8
; AVX512BW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm16[0],xmm10[0],xmm16[1],xmm10[1],xmm16[2],xmm10[2],xmm16[3],xmm10[3]
-; AVX512BW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5,6],ymm7[7]
-; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm10
-; AVX512BW-NEXT: vpmovqb %ymm10, %xmm10
-; AVX512BW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm0[5],ymm10[6,7]
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm7[6,7]
-; AVX512BW-NEXT: vmovdqa 112(%rdi), %xmm10
-; AVX512BW-NEXT: vpshufb %xmm12, %xmm10, %xmm16
-; AVX512BW-NEXT: vmovdqa 96(%rdi), %xmm14
-; AVX512BW-NEXT: vpshufb %xmm12, %xmm14, %xmm12
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm16[0],xmm12[1],xmm16[1],xmm12[2],xmm16[2],xmm12[3],xmm16[3]
-; AVX512BW-NEXT: vmovdqa 80(%rdi), %xmm2
+; AVX512BW-NEXT: vmovdqa 240(%rdi), %xmm9
+; AVX512BW-NEXT: vmovdqa64 384(%rdi), %xmm17
+; AVX512BW-NEXT: vmovdqa64 400(%rdi), %xmm26
+; AVX512BW-NEXT: vmovdqa 416(%rdi), %xmm6
+; AVX512BW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-NEXT: vmovdqa64 432(%rdi), %xmm19
+; AVX512BW-NEXT: vmovdqa64 496(%rdi), %xmm21
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm21, %xmm12
+; AVX512BW-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm10, %xmm22
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm22[0],xmm12[0],xmm22[1],xmm12[1],xmm22[2],xmm12[2],xmm22[3],xmm12[3]
+; AVX512BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm28 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX512BW-NEXT: vmovdqa64 %xmm2, %xmm16
; AVX512BW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-NEXT: vpshufb %xmm19, %xmm2, %xmm21
-; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm2
+; AVX512BW-NEXT: vpshufb %xmm28, %xmm2, %xmm22
+; AVX512BW-NEXT: vmovdqa %xmm1, %xmm4
+; AVX512BW-NEXT: vpshufb %xmm28, %xmm1, %xmm24
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm22 = xmm24[0],xmm22[0],xmm24[1],xmm22[1],xmm24[2],xmm22[2],xmm24[3],xmm22[3]
+; AVX512BW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm15
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3,4,5,6],ymm12[7]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm25 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX512BW-NEXT: vpshufb %xmm25, %xmm19, %xmm22
+; AVX512BW-NEXT: vpshufb %xmm25, %xmm6, %xmm24
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm22 = xmm24[0],xmm22[0],xmm24[1],xmm22[1],xmm24[2],xmm22[2],xmm24[3],xmm22[3]
+; AVX512BW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm15
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm27 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX512BW-NEXT: vpshufb %xmm27, %xmm26, %xmm22
+; AVX512BW-NEXT: vpshufb %xmm27, %xmm17, %xmm24
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm22 = xmm24[0],xmm22[0],xmm24[1],xmm22[1],xmm24[2],xmm22[2],xmm24[3],xmm22[3]
+; AVX512BW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm1
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm15[5],ymm1[6,7]
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm12[6,7]
+; AVX512BW-NEXT: vmovdqa 368(%rdi), %xmm7
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm7, %xmm12
+; AVX512BW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm11, %xmm15
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3]
+; AVX512BW-NEXT: vpshufb %xmm28, %xmm31, %xmm15
+; AVX512BW-NEXT: vmovdqa64 %xmm23, %xmm6
+; AVX512BW-NEXT: vpshufb %xmm28, %xmm23, %xmm22
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm22[0],xmm15[0],xmm22[1],xmm15[1],xmm22[2],xmm15[2],xmm22[3],xmm15[3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1,2],xmm12[3]
+; AVX512BW-NEXT: vpsrlq $8, %zmm29, %zmm15
+; AVX512BW-NEXT: vpmovqb %zmm15, %xmm15
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1],xmm12[2,3]
+; AVX512BW-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm12
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm12 {%k1}
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm9, %xmm1
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm30, %xmm15
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3]
+; AVX512BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm18 # 16-byte Reload
+; AVX512BW-NEXT: vpshufb %xmm28, %xmm18, %xmm15
+; AVX512BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm23 # 16-byte Reload
+; AVX512BW-NEXT: vpshufb %xmm28, %xmm23, %xmm22
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm22[0],xmm15[0],xmm22[1],xmm15[1],xmm22[2],xmm15[2],xmm22[3],xmm15[3]
+; AVX512BW-NEXT: vmovdqa 176(%rdi), %xmm2
; AVX512BW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-NEXT: vpshufb %xmm19, %xmm2, %xmm19
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm21[0],xmm19[1],xmm21[1],xmm19[2],xmm21[2],xmm19[3],xmm21[3]
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm13[0,1,2],xmm0[3]
-; AVX512BW-NEXT: vpmovqb %zmm5, %xmm13
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,3]
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa 160(%rdi), %xmm6
-; AVX512BW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-NEXT: vmovdqa 384(%rdi), %xmm7
-; AVX512BW-NEXT: vmovdqa64 400(%rdi), %xmm21
-; AVX512BW-NEXT: vmovdqa64 416(%rdi), %xmm23
-; AVX512BW-NEXT: vmovdqa64 432(%rdi), %xmm29
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX512BW-NEXT: vmovdqa64 %xmm24, %xmm19
-; AVX512BW-NEXT: vpshufb %xmm4, %xmm24, %xmm1
-; AVX512BW-NEXT: vmovdqa64 %xmm25, %xmm11
-; AVX512BW-NEXT: vpshufb %xmm4, %xmm25, %xmm13
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3]
; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm13 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX512BW-NEXT: vmovdqa64 %xmm26, %xmm12
-; AVX512BW-NEXT: vpshufb %xmm13, %xmm26, %xmm24
-; AVX512BW-NEXT: vmovdqa64 %xmm30, %xmm16
-; AVX512BW-NEXT: vmovdqa64 %xmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-NEXT: vpshufb %xmm13, %xmm30, %xmm25
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm24 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3]
-; AVX512BW-NEXT: vinserti32x4 $1, %xmm24, %ymm0, %ymm2
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm26 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX512BW-NEXT: vpshufb %xmm26, %xmm29, %xmm24
-; AVX512BW-NEXT: vpshufb %xmm26, %xmm23, %xmm25
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm24 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3]
-; AVX512BW-NEXT: vinserti32x4 $1, %xmm24, %ymm0, %ymm2
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm30 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX512BW-NEXT: vpshufb %xmm30, %xmm21, %xmm24
-; AVX512BW-NEXT: vpshufb %xmm30, %xmm7, %xmm25
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm24 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3]
-; AVX512BW-NEXT: vinserti32x4 $1, %xmm24, %ymm0, %ymm3
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
+; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5,6],ymm1[7]
+; AVX512BW-NEXT: vpshufb %xmm25, %xmm2, %xmm15
+; AVX512BW-NEXT: vpshufb %xmm25, %xmm8, %xmm25
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm25[0],xmm15[0],xmm25[1],xmm15[1],xmm25[2],xmm15[2],xmm25[3],xmm15[3]
+; AVX512BW-NEXT: vmovdqa64 144(%rdi), %xmm22
+; AVX512BW-NEXT: vpshufb %xmm27, %xmm22, %xmm31
+; AVX512BW-NEXT: vmovdqa64 %xmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-NEXT: vpshufb %xmm27, %xmm5, %xmm27
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm27 = xmm27[0],xmm31[0],xmm27[1],xmm31[1],xmm27[2],xmm31[2],xmm27[3],xmm31[3]
+; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512BW-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm2
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm15[5],ymm2[6,7]
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX512BW-NEXT: vpshufb %xmm4, %xmm31, %xmm2
-; AVX512BW-NEXT: vpshufb %xmm4, %xmm27, %xmm3
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512BW-NEXT: vpshufb %xmm13, %xmm22, %xmm3
-; AVX512BW-NEXT: vpshufb %xmm13, %xmm9, %xmm24
-; AVX512BW-NEXT: vmovdqa64 %xmm9, %xmm31
-; AVX512BW-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm24[0],xmm3[0],xmm24[1],xmm3[1],xmm24[2],xmm3[2],xmm24[3],xmm3[3]
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3]
-; AVX512BW-NEXT: vpsrlq $8, %zmm15, %zmm3
-; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm22
-; AVX512BW-NEXT: vpmovqb %zmm3, %xmm3
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
-; AVX512BW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT: vmovdqa64 %xmm28, %xmm9
-; AVX512BW-NEXT: vpshufb %xmm4, %xmm28, %xmm1
-; AVX512BW-NEXT: vpshufb %xmm4, %xmm18, %xmm3
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512BW-NEXT: vpshufb %xmm13, %xmm17, %xmm3
-; AVX512BW-NEXT: vpshufb %xmm13, %xmm8, %xmm24
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm24[0],xmm3[0],xmm24[1],xmm3[1],xmm24[2],xmm3[2],xmm24[3],xmm3[3]
-; AVX512BW-NEXT: vmovdqa 176(%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
-; AVX512BW-NEXT: vpshufb %xmm26, %xmm0, %xmm3
-; AVX512BW-NEXT: vpshufb %xmm26, %xmm6, %xmm25
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm25[0],xmm3[0],xmm25[1],xmm3[1],xmm25[2],xmm3[2],xmm25[3],xmm3[3]
-; AVX512BW-NEXT: vmovdqa 128(%rdi), %xmm6
-; AVX512BW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-NEXT: vmovdqa 144(%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-NEXT: vpshufb %xmm30, %xmm0, %xmm0
-; AVX512BW-NEXT: vpshufb %xmm30, %xmm6, %xmm30
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm30[0],xmm0[0],xmm30[1],xmm0[1],xmm30[2],xmm0[2],xmm30[3],xmm0[3]
-; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7]
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512BW-NEXT: vpshufb %xmm4, %xmm10, %xmm1
-; AVX512BW-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-NEXT: vpshufb %xmm4, %xmm14, %xmm3
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; AVX512BW-NEXT: vpshufb %xmm13, %xmm15, %xmm3
-; AVX512BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm28 # 16-byte Reload
-; AVX512BW-NEXT: vpshufb %xmm13, %xmm28, %xmm4
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX512BW-NEXT: vpsrlq $8, %zmm5, %zmm3
-; AVX512BW-NEXT: vpmovqb %zmm3, %xmm3
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX512BW-NEXT: vpshufb %xmm1, %xmm19, %xmm2
-; AVX512BW-NEXT: vmovdqa64 %xmm19, %xmm20
-; AVX512BW-NEXT: vpshufb %xmm1, %xmm11, %xmm3
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm12, %xmm4
-; AVX512BW-NEXT: vmovdqa64 %xmm12, %xmm25
-; AVX512BW-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm16, %xmm13
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3]
-; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7]
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX512BW-NEXT: vmovdqa64 %xmm29, %xmm24
-; AVX512BW-NEXT: vpshufb %xmm4, %xmm29, %xmm13
-; AVX512BW-NEXT: vpshufb %xmm4, %xmm23, %xmm30
-; AVX512BW-NEXT: vmovdqa64 %xmm23, %xmm16
-; AVX512BW-NEXT: vmovdqa64 %xmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm30[0],xmm13[0],xmm30[1],xmm13[1],xmm30[2],xmm13[2],xmm30[3],xmm13[3]
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm30 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX512BW-NEXT: vmovdqa64 %xmm21, %xmm18
-; AVX512BW-NEXT: vpshufb %xmm30, %xmm21, %xmm0
-; AVX512BW-NEXT: vmovdqa64 %xmm7, %xmm17
-; AVX512BW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-NEXT: vpshufb %xmm30, %xmm7, %xmm19
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm19[0],xmm0[0],xmm19[1],xmm0[1],xmm19[2],xmm0[2],xmm19[3],xmm0[3]
-; AVX512BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512BW-NEXT: vmovdqa64 112(%rdi), %xmm27
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm27, %xmm2
+; AVX512BW-NEXT: vmovdqa64 %xmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm20, %xmm0
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX512BW-NEXT: vpshufb %xmm28, %xmm13, %xmm2
+; AVX512BW-NEXT: vmovdqa64 %xmm13, %xmm24
+; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; AVX512BW-NEXT: vpshufb %xmm28, %xmm14, %xmm15
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm15[0],xmm2[0],xmm15[1],xmm2[1],xmm15[2],xmm2[2],xmm15[3],xmm2[3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
+; AVX512BW-NEXT: vpsrlq $8, %zmm3, %zmm2
+; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm8
+; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT: vpmovqb %zmm2, %xmm2
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0
+; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX512BW-NEXT: vmovdqa64 %xmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm21, %xmm1
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm10, %xmm2
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm16, %xmm15
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm4, %xmm28
+; AVX512BW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm28[0],xmm15[0],xmm28[1],xmm15[1],xmm28[2],xmm15[2],xmm28[3],xmm15[3]
+; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5,6],ymm1[7]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm15 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX512BW-NEXT: vpshufb %xmm15, %xmm19, %xmm28
+; AVX512BW-NEXT: vmovdqa64 %xmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm20 # 16-byte Reload
+; AVX512BW-NEXT: vpshufb %xmm15, %xmm20, %xmm31
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm28 = xmm31[0],xmm28[0],xmm31[1],xmm28[1],xmm31[2],xmm28[2],xmm31[3],xmm28[3]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm31 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX512BW-NEXT: vpshufb %xmm31, %xmm26, %xmm0
+; AVX512BW-NEXT: vmovdqa64 %xmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-NEXT: vmovdqa64 %xmm17, %xmm10
+; AVX512BW-NEXT: vpshufb %xmm31, %xmm17, %xmm12
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3]
+; AVX512BW-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm12
; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5],ymm0[6,7]
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
-; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
-; AVX512BW-NEXT: vpshufb %xmm1, %xmm12, %xmm2
-; AVX512BW-NEXT: vpshufb %xmm1, %xmm27, %xmm13
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3]
-; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm8, %xmm13
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm31, %xmm19
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm13[0],xmm19[1],xmm13[1],xmm19[2],xmm13[2],xmm19[3],xmm13[3]
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm13[0,1,2],xmm2[3]
-; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm6
-; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpsrlq $16, %zmm22, %zmm13
-; AVX512BW-NEXT: vpmovqb %zmm13, %xmm13
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3]
-; AVX512BW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT: vpshufb %xmm1, %xmm9, %xmm0
-; AVX512BW-NEXT: vmovdqa64 %xmm9, %xmm26
-; AVX512BW-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm21 # 16-byte Reload
-; AVX512BW-NEXT: vpshufb %xmm1, %xmm21, %xmm13
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3]
-; AVX512BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm22 # 16-byte Reload
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm22, %xmm13
-; AVX512BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm29 # 16-byte Reload
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm29, %xmm19
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm13[0],xmm19[1],xmm13[1],xmm19[2],xmm13[2],xmm19[3],xmm13[3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm12[5],ymm0[6,7]
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm7, %xmm1
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm11, %xmm12
+; AVX512BW-NEXT: vmovdqa %xmm11, %xmm13
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3]
+; AVX512BW-NEXT: vmovdqa64 (%rsp), %xmm17 # 16-byte Reload
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm17, %xmm12
+; AVX512BW-NEXT: vmovdqa %xmm6, %xmm11
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm6, %xmm28
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm28[0],xmm12[0],xmm28[1],xmm12[1],xmm28[2],xmm12[2],xmm28[3],xmm12[3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0,1,2],xmm1[3]
+; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm7
+; AVX512BW-NEXT: vpsrlq $16, %zmm29, %zmm12
+; AVX512BW-NEXT: vpmovqb %zmm12, %xmm12
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3]
+; AVX512BW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm9, %xmm0
+; AVX512BW-NEXT: vmovdqa64 %xmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm30, %xmm12
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3]
+; AVX512BW-NEXT: vmovdqa64 %xmm18, %xmm16
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm18, %xmm12
+; AVX512BW-NEXT: vmovdqa64 %xmm23, %xmm29
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm23, %xmm28
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm28[0],xmm12[0],xmm28[1],xmm12[1],xmm28[2],xmm12[2],xmm28[3],xmm12[3]
; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5,6],ymm0[7]
-; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX512BW-NEXT: vpshufb %xmm4, %xmm9, %xmm13
-; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX512BW-NEXT: vpshufb %xmm4, %xmm7, %xmm4
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3]
-; AVX512BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm31 # 16-byte Reload
-; AVX512BW-NEXT: vpshufb %xmm30, %xmm31, %xmm13
-; AVX512BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm23 # 16-byte Reload
-; AVX512BW-NEXT: vpshufb %xmm30, %xmm23, %xmm19
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm13[0],xmm19[1],xmm13[1],xmm19[2],xmm13[2],xmm19[3],xmm13[3]
-; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3,4],ymm4[5],ymm13[6,7]
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-NEXT: vpshufb %xmm1, %xmm10, %xmm4
-; AVX512BW-NEXT: vpshufb %xmm1, %xmm14, %xmm1
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm15, %xmm4
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm28, %xmm3
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX512BW-NEXT: vpsrlq $16, %zmm5, %zmm3
-; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm28
+; AVX512BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5,6],ymm0[7]
+; AVX512BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm25 # 16-byte Reload
+; AVX512BW-NEXT: vpshufb %xmm15, %xmm25, %xmm12
+; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX512BW-NEXT: vpshufb %xmm15, %xmm6, %xmm15
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3]
+; AVX512BW-NEXT: vpshufb %xmm31, %xmm22, %xmm15
+; AVX512BW-NEXT: vpshufb %xmm31, %xmm5, %xmm28
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm28[0],xmm15[0],xmm28[1],xmm15[1],xmm28[2],xmm15[2],xmm28[3],xmm15[3]
+; AVX512BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3,4],ymm12[5],ymm15[6,7]
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6,7]
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm27, %xmm12
+; AVX512BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm22 # 16-byte Reload
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm22, %xmm3
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3]
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm24, %xmm12
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm14, %xmm2
+; AVX512BW-NEXT: vmovdqa64 %xmm14, %xmm27
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3]
+; AVX512BW-NEXT: vpsrlq $16, %zmm8, %zmm3
; AVX512BW-NEXT: vpmovqb %zmm3, %xmm3
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX512BW-NEXT: vmovdqa64 %xmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-NEXT: vpshufb %xmm0, %xmm20, %xmm1
-; AVX512BW-NEXT: vpshufb %xmm0, %xmm11, %xmm2
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm21, %xmm1
+; AVX512BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm18 # 16-byte Reload
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm18, %xmm2
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm25, %xmm3
-; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm4, %xmm4
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm21 # 16-byte Reload
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm21, %xmm3
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm4, %xmm12
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm12[0],xmm3[0],xmm12[1],xmm3[1],xmm12[2],xmm3[2],xmm12[3],xmm3[3]
; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm24, %xmm4
-; AVX512BW-NEXT: vmovdqa64 %xmm24, (%rsp) # 16-byte Spill
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm16, %xmm19
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm19[0],xmm4[0],xmm19[1],xmm4[1],xmm19[2],xmm4[2],xmm19[3],xmm4[3]
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm19 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX512BW-NEXT: vpshufb %xmm19, %xmm18, %xmm30
-; AVX512BW-NEXT: vpshufb %xmm19, %xmm17, %xmm13
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm30[0],xmm13[1],xmm30[1],xmm13[2],xmm30[2],xmm13[3],xmm30[3]
-; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3,4],ymm4[5],ymm13[6,7]
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm19, %xmm12
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm20, %xmm15
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm15 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX512BW-NEXT: vpshufb %xmm15, %xmm26, %xmm31
+; AVX512BW-NEXT: vpshufb %xmm15, %xmm10, %xmm28
+; AVX512BW-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm28 = xmm28[0],xmm31[0],xmm28[1],xmm31[1],xmm28[2],xmm31[2],xmm28[3],xmm31[3]
+; AVX512BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512BW-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm4
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm12[5],ymm4[6,7]
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
-; AVX512BW-NEXT: vmovdqa %xmm12, %xmm10
-; AVX512BW-NEXT: vpshufb %xmm0, %xmm12, %xmm4
-; AVX512BW-NEXT: vpshufb %xmm0, %xmm27, %xmm13
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3]
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm8, %xmm13
-; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm8, %xmm30
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm30[0],xmm13[0],xmm30[1],xmm13[1],xmm30[2],xmm13[2],xmm30[3],xmm13[3]
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm13[0,1,2],xmm4[3]
-; AVX512BW-NEXT: vpsrlq $24, %zmm6, %zmm13
-; AVX512BW-NEXT: vpmovqb %zmm13, %xmm13
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm13[0,1],xmm4[2,3]
+; AVX512BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm23 # 16-byte Reload
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm23, %xmm4
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm13, %xmm12
+; AVX512BW-NEXT: vmovdqa64 %xmm13, %xmm26
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3]
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm17, %xmm12
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm11, %xmm28
+; AVX512BW-NEXT: vmovdqa %xmm11, %xmm14
+; AVX512BW-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm28[0],xmm12[0],xmm28[1],xmm12[1],xmm28[2],xmm12[2],xmm28[3],xmm12[3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm12[0,1,2],xmm4[3]
+; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm11
+; AVX512BW-NEXT: vpsrlq $24, %zmm7, %zmm12
+; AVX512BW-NEXT: vpmovqb %zmm12, %xmm12
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm12[0,1],xmm4[2,3]
; AVX512BW-NEXT: vinserti32x4 $2, %xmm4, %zmm0, %zmm4
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm4 {%k1}
-; AVX512BW-NEXT: vpshufb %xmm0, %xmm26, %xmm1
-; AVX512BW-NEXT: vpshufb %xmm0, %xmm21, %xmm13
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3]
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm22, %xmm13
-; AVX512BW-NEXT: vmovdqa64 %xmm22, %xmm21
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm29, %xmm30
-; AVX512BW-NEXT: vmovdqa64 %xmm29, %xmm22
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm30[0],xmm13[0],xmm30[1],xmm13[1],xmm30[2],xmm13[2],xmm30[3],xmm13[3]
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm9, %xmm1
+; AVX512BW-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm30, %xmm12
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3]
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm16, %xmm12
+; AVX512BW-NEXT: vmovdqa64 %xmm29, %xmm13
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm29, %xmm28
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm28[0],xmm12[0],xmm28[1],xmm12[1],xmm28[2],xmm12[2],xmm28[3],xmm12[3]
; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5,6],ymm1[7]
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm9, %xmm13
-; AVX512BW-NEXT: vmovdqa64 %xmm9, %xmm25
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm7, %xmm3
-; AVX512BW-NEXT: vmovdqa %xmm7, %xmm9
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3]
-; AVX512BW-NEXT: vpshufb %xmm19, %xmm31, %xmm13
-; AVX512BW-NEXT: vpshufb %xmm19, %xmm23, %xmm19
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm13[0],xmm19[1],xmm13[1],xmm19[2],xmm13[2],xmm19[3],xmm13[3]
+; AVX512BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,6],ymm1[7]
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm25, %xmm12
+; AVX512BW-NEXT: vmovdqa64 %xmm25, %xmm29
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm6, %xmm3
+; AVX512BW-NEXT: vmovdqa %xmm6, %xmm8
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3]
+; AVX512BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm19 # 16-byte Reload
+; AVX512BW-NEXT: vpshufb %xmm15, %xmm19, %xmm12
+; AVX512BW-NEXT: vpshufb %xmm15, %xmm5, %xmm15
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3]
; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4],ymm3[5],ymm13[6,7]
+; AVX512BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3,4],ymm3[5],ymm12[6,7]
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
-; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX512BW-NEXT: vpshufb %xmm0, %xmm5, %xmm3
-; AVX512BW-NEXT: vpshufb %xmm0, %xmm14, %xmm0
-; AVX512BW-NEXT: vmovdqa64 %xmm14, %xmm23
+; AVX512BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm30 # 16-byte Reload
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm30, %xmm3
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm22, %xmm0
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm15, %xmm3
-; AVX512BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm29 # 16-byte Reload
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm29, %xmm2
+; AVX512BW-NEXT: vmovdqa64 %xmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm24, %xmm3
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm27, %xmm2
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
-; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm17
-; AVX512BW-NEXT: vpsrlq $24, %zmm28, %zmm2
+; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512BW-NEXT: vpsrlq $24, %zmm7, %zmm2
; AVX512BW-NEXT: vpmovqb %zmm2, %xmm2
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX512BW-NEXT: vpshufb %xmm0, %xmm20, %xmm1
-; AVX512BW-NEXT: vpshufb %xmm0, %xmm11, %xmm3
-; AVX512BW-NEXT: vmovdqa64 %xmm11, %xmm28
-; AVX512BW-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm12, %xmm4
-; AVX512BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm16 # 16-byte Reload
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm16, %xmm13
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3]
+; AVX512BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm27 # 16-byte Reload
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm27, %xmm1
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm18, %xmm2
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm21, %xmm3
+; AVX512BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm20 # 16-byte Reload
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm20, %xmm4
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX512BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm25 # 16-byte Reload
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm25, %xmm4
+; AVX512BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm22 # 16-byte Reload
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm22, %xmm12
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm12 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX512BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm21 # 16-byte Reload
+; AVX512BW-NEXT: vpshufb %xmm12, %xmm21, %xmm28
+; AVX512BW-NEXT: vpshufb %xmm12, %xmm10, %xmm31
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm28 = xmm31[0],xmm28[0],xmm31[1],xmm28[1],xmm31[2],xmm28[2],xmm31[3],xmm28[3]
; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX512BW-NEXT: vpshufb %xmm4, %xmm24, %xmm13
-; AVX512BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm26 # 16-byte Reload
-; AVX512BW-NEXT: vpshufb %xmm4, %xmm26, %xmm19
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm13[0],xmm19[1],xmm13[1],xmm19[2],xmm13[2],xmm19[3],xmm13[3]
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm19 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX512BW-NEXT: vpshufb %xmm19, %xmm18, %xmm30
-; AVX512BW-NEXT: vmovdqa64 %xmm18, %xmm24
-; AVX512BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm18 # 16-byte Reload
-; AVX512BW-NEXT: vpshufb %xmm19, %xmm18, %xmm2
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm30[0],xmm2[1],xmm30[1],xmm2[2],xmm30[2],xmm2[3],xmm30[3]
-; AVX512BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512BW-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm15
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1,2,3,4],ymm4[5],ymm15[6,7]
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm23, %xmm4
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm26, %xmm15
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3]
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm17, %xmm15
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm14, %xmm28
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm28[0],xmm15[0],xmm28[1],xmm15[1],xmm28[2],xmm15[2],xmm28[3],xmm15[3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm15[0,1,2],xmm4[3]
+; AVX512BW-NEXT: vpsrlq $32, %zmm11, %zmm15
+; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10
+; AVX512BW-NEXT: vpmovqb %zmm15, %xmm15
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm15[0,1],xmm4[2,3]
+; AVX512BW-NEXT: vinserti32x4 $2, %xmm4, %zmm0, %zmm4
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm4 {%k1}
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm9, %xmm1
+; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm6, %xmm15
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3]
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm16, %xmm15
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm13, %xmm28
+; AVX512BW-NEXT: vmovdqa %xmm13, %xmm14
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm28[0],xmm15[0],xmm28[1],xmm15[1],xmm28[2],xmm15[2],xmm28[3],xmm15[3]
+; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5,6],ymm1[7]
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm29, %xmm15
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm8, %xmm3
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
+; AVX512BW-NEXT: vpshufb %xmm12, %xmm19, %xmm15
+; AVX512BW-NEXT: vpshufb %xmm12, %xmm5, %xmm12
+; AVX512BW-NEXT: vmovdqa64 %xmm5, %xmm19
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3]
+; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3,4],ymm3[5],ymm12[6,7]
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm30, %xmm3
+; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm13, %xmm0
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm24, %xmm3
+; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm9, %xmm2
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
+; AVX512BW-NEXT: vpsrlq $32, %zmm7, %zmm2
+; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm17
+; AVX512BW-NEXT: vpmovqb %zmm2, %xmm2
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX512BW-NEXT: vmovdqa64 %xmm27, %xmm24
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm27, %xmm2
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm18, %xmm3
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm8, %xmm4
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm20, %xmm12
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3]
; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm13[5],ymm2[6,7]
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX512BW-NEXT: vmovdqa %xmm10, %xmm6
-; AVX512BW-NEXT: vpshufb %xmm0, %xmm10, %xmm2
-; AVX512BW-NEXT: vmovdqa64 %xmm27, %xmm7
-; AVX512BW-NEXT: vpshufb %xmm0, %xmm27, %xmm13
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3]
-; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm10, %xmm13
-; AVX512BW-NEXT: vmovdqa64 %xmm8, %xmm27
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm8, %xmm30
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm30[0],xmm13[0],xmm30[1],xmm13[1],xmm30[2],xmm13[2],xmm30[3],xmm13[3]
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm13[0,1,2],xmm2[3]
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512BW-NEXT: vpsrlq $32, %zmm10, %zmm13
-; AVX512BW-NEXT: vpmovqb %zmm13, %xmm13
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3]
+; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX512BW-NEXT: vmovdqa64 %xmm25, %xmm23
+; AVX512BW-NEXT: vpshufb %xmm4, %xmm25, %xmm12
+; AVX512BW-NEXT: vpshufb %xmm4, %xmm22, %xmm15
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm15 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX512BW-NEXT: vpshufb %xmm15, %xmm21, %xmm28
+; AVX512BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm29 # 16-byte Reload
+; AVX512BW-NEXT: vpshufb %xmm15, %xmm29, %xmm31
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm28 = xmm31[0],xmm28[0],xmm31[1],xmm28[1],xmm31[2],xmm28[2],xmm31[3],xmm28[3]
+; AVX512BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512BW-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm1
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm12[5],ymm1[6,7]
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
+; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm11, %xmm2
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm26, %xmm12
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3]
+; AVX512BW-NEXT: vmovdqa64 (%rsp), %xmm26 # 16-byte Reload
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm26, %xmm12
+; AVX512BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm25 # 16-byte Reload
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm25, %xmm28
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm28[0],xmm12[0],xmm28[1],xmm12[1],xmm28[2],xmm12[2],xmm28[3],xmm12[3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm12[0,1,2],xmm2[3]
+; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm5
+; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-NEXT: vpsrlq $40, %zmm10, %zmm12
+; AVX512BW-NEXT: vpmovqb %zmm12, %xmm12
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm12[0,1],xmm2[2,3]
; AVX512BW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; AVX512BW-NEXT: vpshufb %xmm0, %xmm14, %xmm1
-; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; AVX512BW-NEXT: vpshufb %xmm0, %xmm11, %xmm13
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3]
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm21, %xmm13
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm22, %xmm30
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm30[0],xmm13[0],xmm30[1],xmm13[1],xmm30[2],xmm13[2],xmm30[3],xmm13[3]
+; AVX512BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm20 # 16-byte Reload
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm20, %xmm1
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm6, %xmm12
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3]
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm16, %xmm12
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm14, %xmm28
+; AVX512BW-NEXT: vmovdqa64 %xmm14, %xmm31
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm28[0],xmm12[0],xmm28[1],xmm12[1],xmm28[2],xmm12[2],xmm28[3],xmm12[3]
; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5,6],ymm1[7]
-; AVX512BW-NEXT: vpshufb %xmm4, %xmm25, %xmm13
-; AVX512BW-NEXT: vpshufb %xmm4, %xmm9, %xmm4
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3]
-; AVX512BW-NEXT: vpshufb %xmm19, %xmm31, %xmm13
-; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX512BW-NEXT: vpshufb %xmm19, %xmm8, %xmm19
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm13[0],xmm19[1],xmm13[1],xmm19[2],xmm13[2],xmm19[3],xmm13[3]
+; AVX512BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,6],ymm1[7]
+; AVX512BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm27 # 16-byte Reload
+; AVX512BW-NEXT: vpshufb %xmm4, %xmm27, %xmm12
+; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX512BW-NEXT: vpshufb %xmm4, %xmm10, %xmm4
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3]
+; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; AVX512BW-NEXT: vpshufb %xmm15, %xmm14, %xmm12
+; AVX512BW-NEXT: vpshufb %xmm15, %xmm19, %xmm15
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3]
; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3,4],ymm4[5],ymm13[6,7]
+; AVX512BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3,4],ymm4[5],ymm12[6,7]
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
-; AVX512BW-NEXT: vpshufb %xmm0, %xmm5, %xmm4
-; AVX512BW-NEXT: vpshufb %xmm0, %xmm23, %xmm0
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm30, %xmm4
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm13, %xmm0
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm15, %xmm4
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm29, %xmm3
+; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm7, %xmm4
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm9, %xmm3
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3]
-; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm20
-; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpsrlq $32, %zmm17, %zmm3
+; AVX512BW-NEXT: vpsrlq $40, %zmm17, %zmm3
; AVX512BW-NEXT: vpmovqb %zmm3, %xmm3
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512BW-NEXT: vpshufb %xmm0, %xmm1, %xmm1
-; AVX512BW-NEXT: vpshufb %xmm0, %xmm28, %xmm2
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm24, %xmm1
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm18, %xmm2
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm12, %xmm3
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm16, %xmm4
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm8, %xmm3
+; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm8, %xmm4
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX512BW-NEXT: vmovdqa (%rsp), %xmm15 # 16-byte Reload
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm15, %xmm4
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm26, %xmm13
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3]
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm13 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX512BW-NEXT: vpshufb %xmm13, %xmm24, %xmm19
-; AVX512BW-NEXT: vmovdqa64 %xmm24, %xmm26
-; AVX512BW-NEXT: vmovdqa64 %xmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512BW-NEXT: vpshufb %xmm13, %xmm18, %xmm30
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm19 = xmm30[0],xmm19[0],xmm30[1],xmm19[1],xmm30[2],xmm19[2],xmm30[3],xmm19[3]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm23, %xmm4
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm22, %xmm12
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm12 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX512BW-NEXT: vpshufb %xmm12, %xmm21, %xmm15
+; AVX512BW-NEXT: vpshufb %xmm12, %xmm29, %xmm28
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm28[0],xmm15[0],xmm28[1],xmm15[1],xmm28[2],xmm15[2],xmm28[3],xmm15[3]
; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512BW-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm5
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
+; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1,2,3,4],ymm4[5],ymm15[6,7]
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
-; AVX512BW-NEXT: vpshufb %xmm0, %xmm6, %xmm4
-; AVX512BW-NEXT: vmovdqa64 %xmm6, %xmm24
-; AVX512BW-NEXT: vpshufb %xmm0, %xmm7, %xmm5
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm5, %xmm5
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm27, %xmm19
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm19[0],xmm5[0],xmm19[1],xmm5[1],xmm19[2],xmm5[2],xmm19[3],xmm5[3]
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3]
-; AVX512BW-NEXT: vpsrlq $40, %zmm10, %zmm5
-; AVX512BW-NEXT: vpmovqb %zmm5, %xmm5
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm11, %xmm4
+; AVX512BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm30 # 16-byte Reload
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm30, %xmm15
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3]
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm26, %xmm15
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm25, %xmm28
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm28[0],xmm15[0],xmm28[1],xmm15[1],xmm28[2],xmm15[2],xmm28[3],xmm15[3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm15[0,1,2],xmm4[3]
+; AVX512BW-NEXT: vpsrlq $48, %zmm5, %zmm15
+; AVX512BW-NEXT: vpmovqb %zmm15, %xmm15
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm15[0,1],xmm4[2,3]
; AVX512BW-NEXT: vinserti32x4 $2, %xmm4, %zmm0, %zmm4
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm4 {%k1}
-; AVX512BW-NEXT: vpshufb %xmm0, %xmm14, %xmm1
-; AVX512BW-NEXT: vpshufb %xmm0, %xmm11, %xmm5
-; AVX512BW-NEXT: vmovdqa64 %xmm11, %xmm17
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm21, %xmm5
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm22, %xmm19
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm19[0],xmm5[0],xmm19[1],xmm5[1],xmm19[2],xmm5[2],xmm19[3],xmm5[3]
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm20, %xmm1
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm6, %xmm15
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3]
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm16, %xmm15
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm31, %xmm28
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm28[0],xmm15[0],xmm28[1],xmm15[1],xmm28[2],xmm15[2],xmm28[3],xmm15[3]
; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7]
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm25, %xmm5
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm9, %xmm3
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
-; AVX512BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm31 # 16-byte Reload
-; AVX512BW-NEXT: vpshufb %xmm13, %xmm31, %xmm5
-; AVX512BW-NEXT: vpshufb %xmm13, %xmm8, %xmm13
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3]
+; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5,6],ymm1[7]
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm27, %xmm15
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm10, %xmm3
+; AVX512BW-NEXT: vmovdqa64 %xmm10, %xmm31
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
+; AVX512BW-NEXT: vpshufb %xmm12, %xmm14, %xmm15
+; AVX512BW-NEXT: vpshufb %xmm12, %xmm19, %xmm12
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3]
; AVX512BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7]
+; AVX512BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3,4],ymm3[5],ymm12[6,7]
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX512BW-NEXT: vpshufb %xmm0, %xmm11, %xmm3
-; AVX512BW-NEXT: vpshufb %xmm0, %xmm23, %xmm0
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm13, %xmm0
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm6, %xmm3
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm29, %xmm2
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm7, %xmm3
+; AVX512BW-NEXT: vmovdqa64 %xmm7, %xmm28
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm9, %xmm2
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
-; AVX512BW-NEXT: vpsrlq $40, %zmm20, %zmm2
+; AVX512BW-NEXT: vpsrlq $48, %zmm17, %zmm2
; AVX512BW-NEXT: vpmovqb %zmm2, %xmm2
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
-; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX512BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm28 # 16-byte Reload
-; AVX512BW-NEXT: vpshufb %xmm0, %xmm28, %xmm1
-; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX512BW-NEXT: vpshufb %xmm0, %xmm8, %xmm2
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm12, %xmm4
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm16, %xmm5
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX512BW-NEXT: vpshufb %xmm4, %xmm15, %xmm5
-; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; AVX512BW-NEXT: vpshufb %xmm4, %xmm15, %xmm13
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3]
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm13 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX512BW-NEXT: vpshufb %xmm13, %xmm26, %xmm19
-; AVX512BW-NEXT: vpshufb %xmm13, %xmm18, %xmm30
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm19 = xmm30[0],xmm19[0],xmm30[1],xmm19[1],xmm30[2],xmm19[2],xmm30[3],xmm19[3]
-; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512BW-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm3
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5],ymm3[6,7]
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
-; AVX512BW-NEXT: vpshufb %xmm0, %xmm24, %xmm3
-; AVX512BW-NEXT: vpshufb %xmm0, %xmm7, %xmm5
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX512BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm27 # 16-byte Reload
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm27, %xmm5
-; AVX512BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm30 # 16-byte Reload
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm30, %xmm19
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm19[0],xmm5[0],xmm19[1],xmm5[1],xmm19[2],xmm5[2],xmm19[3],xmm5[3]
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3]
-; AVX512BW-NEXT: vpsrlq $48, %zmm10, %zmm5
-; AVX512BW-NEXT: vpmovqb %zmm5, %xmm5
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
-; AVX512BW-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm3
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3 {%k1}
-; AVX512BW-NEXT: vpshufb %xmm0, %xmm14, %xmm1
-; AVX512BW-NEXT: vpshufb %xmm0, %xmm17, %xmm5
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm21, %xmm5
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm22, %xmm19
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm19[0],xmm5[0],xmm19[1],xmm5[1],xmm19[2],xmm5[2],xmm19[3],xmm5[3]
-; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7]
-; AVX512BW-NEXT: vpshufb %xmm4, %xmm25, %xmm5
-; AVX512BW-NEXT: vpshufb %xmm4, %xmm9, %xmm4
-; AVX512BW-NEXT: vmovdqa64 %xmm9, %xmm18
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; AVX512BW-NEXT: vpshufb %xmm13, %xmm31, %xmm5
-; AVX512BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm20 # 16-byte Reload
-; AVX512BW-NEXT: vpshufb %xmm13, %xmm20, %xmm13
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3]
-; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
-; AVX512BW-NEXT: vpshufb %xmm0, %xmm11, %xmm4
-; AVX512BW-NEXT: vpshufb %xmm0, %xmm23, %xmm0
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm6, %xmm4
-; AVX512BW-NEXT: vmovdqa64 %xmm6, %xmm26
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm29, %xmm2
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
-; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512BW-NEXT: vpsrlq $48, %zmm19, %zmm2
-; AVX512BW-NEXT: vpmovqb %zmm2, %xmm2
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX512BW-NEXT: vpshufb %xmm1, %xmm28, %xmm2
-; AVX512BW-NEXT: vpshufb %xmm1, %xmm8, %xmm3
+; AVX512BW-NEXT: vpshufb %xmm1, %xmm24, %xmm2
+; AVX512BW-NEXT: vpshufb %xmm1, %xmm18, %xmm3
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm12, %xmm4
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm16, %xmm5
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm4, %xmm4
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm8, %xmm10
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3]
; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7]
; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX512BW-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload
-; AVX512BW-NEXT: vpshufb %xmm4, %xmm5, %xmm5
-; AVX512BW-NEXT: vpshufb %xmm4, %xmm15, %xmm8
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3]
-; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512BW-NEXT: vpshufb %xmm8, %xmm6, %xmm9
-; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512BW-NEXT: vpshufb %xmm8, %xmm6, %xmm13
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3]
-; AVX512BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512BW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4],ymm5[5],ymm9[6,7]
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7]
-; AVX512BW-NEXT: vpshufb %xmm1, %xmm24, %xmm5
-; AVX512BW-NEXT: vpshufb %xmm1, %xmm7, %xmm9
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3]
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm27, %xmm9
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm30, %xmm6
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3]
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3]
-; AVX512BW-NEXT: vpsrlq $56, %zmm10, %zmm6
-; AVX512BW-NEXT: vpmovqb %zmm6, %xmm6
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
+; AVX512BW-NEXT: vpshufb %xmm4, %xmm23, %xmm10
+; AVX512BW-NEXT: vpshufb %xmm4, %xmm22, %xmm12
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3]
+; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm12 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX512BW-NEXT: vpshufb %xmm12, %xmm21, %xmm15
+; AVX512BW-NEXT: vpshufb %xmm12, %xmm29, %xmm18
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm18[0],xmm15[0],xmm18[1],xmm15[1],xmm18[2],xmm15[2],xmm18[3],xmm15[3]
+; AVX512BW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3,4],ymm10[5],ymm15[6,7]
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3,4,5],ymm2[6,7]
+; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX512BW-NEXT: vpshufb %xmm1, %xmm5, %xmm10
+; AVX512BW-NEXT: vpshufb %xmm1, %xmm30, %xmm8
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3]
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm26, %xmm10
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm25, %xmm5
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3]
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm8[3]
+; AVX512BW-NEXT: vpsrlq $56, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Folded Reload
+; AVX512BW-NEXT: vpmovqb %zmm8, %xmm8
+; AVX512BW-NEXT: vpblendd {{.*#+}} xmm5 = xmm8[0,1],xmm5[2,3]
; AVX512BW-NEXT: vinserti32x4 $2, %xmm5, %zmm0, %zmm5
; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm5 {%k1}
-; AVX512BW-NEXT: vpshufb %xmm1, %xmm14, %xmm2
-; AVX512BW-NEXT: vpshufb %xmm1, %xmm17, %xmm6
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm21, %xmm6
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm22, %xmm9
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
+; AVX512BW-NEXT: vpshufb %xmm1, %xmm20, %xmm2
+; AVX512BW-NEXT: vpshufb %xmm1, %xmm6, %xmm8
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3]
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm16, %xmm6
+; AVX512BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm7, %xmm7
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,6],ymm2[7]
-; AVX512BW-NEXT: vpshufb %xmm4, %xmm25, %xmm6
-; AVX512BW-NEXT: vpshufb %xmm4, %xmm18, %xmm4
+; AVX512BW-NEXT: vpshufb %xmm4, %xmm27, %xmm6
+; AVX512BW-NEXT: vpshufb %xmm4, %xmm31, %xmm4
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
-; AVX512BW-NEXT: vpshufb %xmm8, %xmm31, %xmm6
-; AVX512BW-NEXT: vpshufb %xmm8, %xmm20, %xmm8
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3]
+; AVX512BW-NEXT: vpshufb %xmm12, %xmm14, %xmm6
+; AVX512BW-NEXT: vpshufb %xmm12, %xmm19, %xmm7
+; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
; AVX512BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4],ymm4[5],ymm6[6,7]
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
; AVX512BW-NEXT: vpshufb %xmm1, %xmm11, %xmm4
-; AVX512BW-NEXT: vpshufb %xmm1, %xmm23, %xmm1
+; AVX512BW-NEXT: vpshufb %xmm1, %xmm13, %xmm1
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm26, %xmm4
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm29, %xmm3
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm28, %xmm4
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm9, %xmm3
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX512BW-NEXT: vpsrlq $56, %zmm19, %zmm3
+; AVX512BW-NEXT: vpsrlq $56, %zmm17, %zmm3
; AVX512BW-NEXT: vpmovqb %zmm3, %xmm3
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
@@ -16066,343 +16151,343 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax)
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rax)
-; AVX512BW-NEXT: addq $744, %rsp # imm = 0x2E8
+; AVX512BW-NEXT: addq $760, %rsp # imm = 0x2F8
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BW-FCP-LABEL: load_i8_stride8_vf64:
; AVX512BW-FCP: # %bb.0:
-; AVX512BW-FCP-NEXT: subq $264, %rsp # imm = 0x108
-; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm26
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512BW-FCP-NEXT: subq $360, %rsp # imm = 0x168
+; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm13
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm14
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm24
+; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm5
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm16
+; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm23
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6]
-; AVX512BW-FCP-NEXT: vmovdqa 480(%rdi), %ymm1
-; AVX512BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm30
-; AVX512BW-FCP-NEXT: vpshufb %ymm5, %ymm30, %ymm1
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512BW-FCP-NEXT: vmovdqa 448(%rdi), %ymm2
-; AVX512BW-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm12
-; AVX512BW-FCP-NEXT: vpshufb %ymm8, %ymm12, %ymm2
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512BW-FCP-NEXT: vmovdqa64 416(%rdi), %ymm22
-; AVX512BW-FCP-NEXT: vpermd %ymm22, %ymm0, %ymm20
-; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm20, %ymm2
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %ymm27
-; AVX512BW-FCP-NEXT: vpermd %ymm27, %ymm0, %ymm9
-; AVX512BW-FCP-NEXT: vpshufb %ymm14, %ymm9, %ymm3
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
+; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %ymm25
+; AVX512BW-FCP-NEXT: vpermd %ymm25, %ymm0, %ymm20
+; AVX512BW-FCP-NEXT: vpshufb %ymm11, %ymm20, %ymm1
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm21 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [8,10,10,11,8,10,12,14]
+; AVX512BW-FCP-NEXT: vpermd %zmm23, %zmm9, %zmm8
+; AVX512BW-FCP-NEXT: vpshufb %ymm21, %ymm8, %ymm2
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm27 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %ymm29
+; AVX512BW-FCP-NEXT: vpermd %ymm29, %ymm0, %ymm17
+; AVX512BW-FCP-NEXT: vpshufb %ymm27, %ymm17, %ymm2
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm28 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512BW-FCP-NEXT: vpermd %zmm16, %zmm9, %zmm18
+; AVX512BW-FCP-NEXT: vpshufb %ymm28, %ymm18, %ymm3
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX512BW-FCP-NEXT: vmovdqa64 352(%rdi), %ymm24
-; AVX512BW-FCP-NEXT: vpmovqd %ymm24, %xmm18
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512BW-FCP-NEXT: vpshufb %xmm6, %xmm18, %xmm2
-; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %ymm23
-; AVX512BW-FCP-NEXT: vpmovqd %ymm23, %xmm17
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm17, %xmm3
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3]
-; AVX512BW-FCP-NEXT: vpmovqb %zmm26, %xmm3
+; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %ymm22
+; AVX512BW-FCP-NEXT: vpmovqd %ymm22, %xmm7
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm15 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512BW-FCP-NEXT: vpshufb %xmm15, %xmm7, %xmm2
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [8,10,12,14,12,14,14,15]
+; AVX512BW-FCP-NEXT: vpermd %zmm5, %zmm10, %zmm6
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512BW-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm3
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3]
+; AVX512BW-FCP-NEXT: vpmovqb %zmm24, %xmm3
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm19
; AVX512BW-FCP-NEXT: movb $-64, %al
; AVX512BW-FCP-NEXT: kmovd %eax, %k1
; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm19 {%k1}
-; AVX512BW-FCP-NEXT: vmovdqa 224(%rdi), %ymm1
+; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm30
+; AVX512BW-FCP-NEXT: vpermd %ymm30, %ymm0, %ymm12
+; AVX512BW-FCP-NEXT: vpshufb %ymm11, %ymm12, %ymm1
+; AVX512BW-FCP-NEXT: vpermd %zmm14, %zmm9, %zmm11
+; AVX512BW-FCP-NEXT: vpshufb %ymm21, %ymm11, %ymm2
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
+; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %ymm26
+; AVX512BW-FCP-NEXT: vpermd %ymm26, %ymm0, %ymm5
+; AVX512BW-FCP-NEXT: vpshufb %ymm27, %ymm5, %ymm0
+; AVX512BW-FCP-NEXT: vpermd %zmm13, %zmm9, %zmm9
+; AVX512BW-FCP-NEXT: vpshufb %ymm28, %ymm9, %ymm14
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7]
+; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm21
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm1
; AVX512BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm16
-; AVX512BW-FCP-NEXT: vpshufb %ymm5, %ymm16, %ymm1
-; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm2
-; AVX512BW-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512BW-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm4
-; AVX512BW-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm2
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1,2,3,4,5,6],ymm1[7]
-; AVX512BW-FCP-NEXT: vmovdqa64 160(%rdi), %ymm31
-; AVX512BW-FCP-NEXT: vpermd %ymm31, %ymm0, %ymm10
-; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm10, %ymm11
-; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %ymm21
-; AVX512BW-FCP-NEXT: vpermd %ymm21, %ymm0, %ymm7
-; AVX512BW-FCP-NEXT: vpshufb %ymm14, %ymm7, %ymm0
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5],ymm0[6,7]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm8[6,7]
-; AVX512BW-FCP-NEXT: vmovdqa64 96(%rdi), %ymm28
-; AVX512BW-FCP-NEXT: vpmovqd %ymm28, %xmm5
-; AVX512BW-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm8
-; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm25
-; AVX512BW-FCP-NEXT: vpmovqd %ymm25, %xmm11
-; AVX512BW-FCP-NEXT: vpshufb %xmm13, %xmm11, %xmm13
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1,2],xmm8[3]
-; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm8
-; AVX512BW-FCP-NEXT: vpmovqb %zmm8, %xmm15
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3]
+; AVX512BW-FCP-NEXT: vpermd %zmm21, %zmm10, %zmm2
+; AVX512BW-FCP-NEXT: vpmovqd %ymm1, %xmm1
+; AVX512BW-FCP-NEXT: vpshufb %xmm15, %xmm1, %xmm10
+; AVX512BW-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm15
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm15 = xmm10[0,1,2],xmm15[3]
+; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm10
+; AVX512BW-FCP-NEXT: vpmovqb %zmm10, %xmm13
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3]
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7]
; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512BW-FCP-NEXT: vpshufb %ymm6, %ymm30, %ymm0
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm19 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm12, %ymm13
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5,6],ymm0[7]
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm29 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512BW-FCP-NEXT: vpshufb %ymm29, %ymm20, %ymm13
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm31 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512BW-FCP-NEXT: vpshufb %ymm31, %ymm20, %ymm0
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm4 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm8, %ymm13
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm13[7]
; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512BW-FCP-NEXT: vpshufb %ymm3, %ymm9, %ymm15
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5],ymm15[6,7]
+; AVX512BW-FCP-NEXT: vpshufb %ymm3, %ymm17, %ymm13
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm19 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm18, %ymm15
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7]
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512BW-FCP-NEXT: vpshufb %xmm2, %xmm18, %xmm13
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512BW-FCP-NEXT: vpshufb %xmm1, %xmm17, %xmm15
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3]
-; AVX512BW-FCP-NEXT: vpsrlq $8, %zmm26, %zmm15
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm27 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512BW-FCP-NEXT: vpshufb %xmm27, %xmm7, %xmm13
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm28 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512BW-FCP-NEXT: vpshufb %xmm28, %xmm6, %xmm15
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1,2],xmm15[3]
+; AVX512BW-FCP-NEXT: vpsrlq $8, %zmm24, %zmm15
; AVX512BW-FCP-NEXT: vpmovqb %zmm15, %xmm15
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3]
; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm13, %zmm0, %zmm13
; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm13 {%k1}
-; AVX512BW-FCP-NEXT: vpshufb %ymm6, %ymm16, %ymm0
-; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm4, %ymm15
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7]
-; AVX512BW-FCP-NEXT: vpshufb %ymm29, %ymm10, %ymm15
-; AVX512BW-FCP-NEXT: vpshufb %ymm3, %ymm7, %ymm14
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5],ymm14[6,7]
+; AVX512BW-FCP-NEXT: vpshufb %ymm31, %ymm12, %ymm0
+; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm11, %ymm15
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm15[7]
+; AVX512BW-FCP-NEXT: vpshufb %ymm3, %ymm5, %ymm15
+; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm9, %ymm14
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm14
-; AVX512BW-FCP-NEXT: vpshufb %xmm1, %xmm11, %xmm15
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3]
-; AVX512BW-FCP-NEXT: vpsrlq $8, %zmm8, %zmm15
+; AVX512BW-FCP-NEXT: vpshufb %xmm27, %xmm1, %xmm14
+; AVX512BW-FCP-NEXT: vpshufb %xmm28, %xmm2, %xmm15
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3]
+; AVX512BW-FCP-NEXT: vpsrlq $8, %zmm10, %zmm15
; AVX512BW-FCP-NEXT: vpmovqb %zmm15, %xmm15
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm30, %ymm0
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512BW-FCP-NEXT: vpshufb %ymm2, %ymm12, %ymm13
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5,6],ymm0[7]
; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512BW-FCP-NEXT: vpshufb %ymm3, %ymm20, %ymm13
+; AVX512BW-FCP-NEXT: vpshufb %ymm3, %ymm20, %ymm0
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm4 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm8, %ymm13
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm13[7]
; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm19 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm9, %ymm14
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5],ymm14[6,7]
+; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm17, %ymm13
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512BW-FCP-NEXT: vpshufb %ymm15, %ymm18, %ymm14
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5],ymm13[6,7]
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm29 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512BW-FCP-NEXT: vpshufb %xmm29, %xmm18, %xmm13
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512BW-FCP-NEXT: vpshufb %xmm6, %xmm17, %xmm15
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3]
-; AVX512BW-FCP-NEXT: vpsrlq $16, %zmm26, %zmm15
-; AVX512BW-FCP-NEXT: vpmovqb %zmm15, %xmm15
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3]
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm28 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512BW-FCP-NEXT: vpshufb %xmm28, %xmm7, %xmm13
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm31 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512BW-FCP-NEXT: vpshufb %xmm31, %xmm6, %xmm14
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3]
+; AVX512BW-FCP-NEXT: vpsrlq $16, %zmm24, %zmm14
+; AVX512BW-FCP-NEXT: vpmovqb %zmm14, %xmm14
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3]
; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm13, %zmm0, %zmm13
; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm13 {%k1}
-; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm16, %ymm0
-; AVX512BW-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm15
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7]
-; AVX512BW-FCP-NEXT: vpshufb %ymm3, %ymm10, %ymm15
-; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm7, %ymm14
+; AVX512BW-FCP-NEXT: vpshufb %ymm3, %ymm12, %ymm0
+; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm11, %ymm14
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm14[7]
+; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm5, %ymm14
+; AVX512BW-FCP-NEXT: vpshufb %ymm15, %ymm9, %ymm15
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5],ymm14[6,7]
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-FCP-NEXT: vpshufb %xmm29, %xmm5, %xmm14
-; AVX512BW-FCP-NEXT: vpshufb %xmm6, %xmm11, %xmm15
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3]
-; AVX512BW-FCP-NEXT: vpsrlq $16, %zmm8, %zmm15
+; AVX512BW-FCP-NEXT: vpshufb %xmm28, %xmm1, %xmm14
+; AVX512BW-FCP-NEXT: vpshufb %xmm31, %xmm2, %xmm15
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3]
+; AVX512BW-FCP-NEXT: vpsrlq $16, %zmm10, %zmm15
; AVX512BW-FCP-NEXT: vpmovqb %zmm15, %xmm15
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0
; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm30, %ymm0
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm13 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512BW-FCP-NEXT: vpshufb %ymm13, %ymm12, %ymm12
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5,6],ymm0[7]
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512BW-FCP-NEXT: vpshufb %ymm2, %ymm20, %ymm12
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm30 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512BW-FCP-NEXT: vpshufb %ymm30, %ymm9, %ymm9
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm12[5],ymm9[6,7]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm29 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512BW-FCP-NEXT: vpshufb %xmm29, %xmm18, %xmm12
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm19 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512BW-FCP-NEXT: vpshufb %xmm19, %xmm17, %xmm6
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm12[3]
-; AVX512BW-FCP-NEXT: vpsrlq $24, %zmm26, %zmm12
-; AVX512BW-FCP-NEXT: vpmovqb %zmm12, %xmm12
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm12[0,1],xmm6[2,3]
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm28 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512BW-FCP-NEXT: vpshufb %ymm28, %ymm20, %ymm0
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm27 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512BW-FCP-NEXT: vpshufb %ymm27, %ymm8, %ymm8
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm8[7]
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm20 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512BW-FCP-NEXT: vpshufb %ymm20, %ymm17, %ymm8
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm19 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm18, %ymm13
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm13[5],ymm8[6,7]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm0[6,7]
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm18 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512BW-FCP-NEXT: vpshufb %xmm18, %xmm7, %xmm7
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm17 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512BW-FCP-NEXT: vpshufb %xmm17, %xmm6, %xmm6
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3]
+; AVX512BW-FCP-NEXT: vpsrlq $24, %zmm24, %zmm7
+; AVX512BW-FCP-NEXT: vpmovqb %zmm7, %xmm7
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm6
; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm6 {%k1}
-; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm16, %ymm0
-; AVX512BW-FCP-NEXT: vpshufb %ymm13, %ymm4, %ymm4
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6],ymm0[7]
-; AVX512BW-FCP-NEXT: vpshufb %ymm2, %ymm10, %ymm3
-; AVX512BW-FCP-NEXT: vpshufb %ymm30, %ymm7, %ymm2
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-FCP-NEXT: vpshufb %xmm29, %xmm5, %xmm1
-; AVX512BW-FCP-NEXT: vpshufb %xmm19, %xmm11, %xmm2
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
-; AVX512BW-FCP-NEXT: vpsrlq $24, %zmm8, %zmm2
+; AVX512BW-FCP-NEXT: vpshufb %ymm28, %ymm12, %ymm0
+; AVX512BW-FCP-NEXT: vpshufb %ymm27, %ymm11, %ymm4
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7]
+; AVX512BW-FCP-NEXT: vpshufb %ymm20, %ymm5, %ymm3
+; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm9, %ymm4
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
+; AVX512BW-FCP-NEXT: vpshufb %xmm18, %xmm1, %xmm1
+; AVX512BW-FCP-NEXT: vpshufb %xmm17, %xmm2, %xmm2
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3]
+; AVX512BW-FCP-NEXT: vpsrlq $24, %zmm10, %zmm2
; AVX512BW-FCP-NEXT: vpmovqb %zmm2, %xmm2
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0
-; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,3,2,3,1,3,5,7]
-; AVX512BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload
-; AVX512BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm2
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512BW-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm3
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512BW-FCP-NEXT: vpermd %ymm22, %ymm6, %ymm2
-; AVX512BW-FCP-NEXT: vpermd %ymm27, %ymm6, %ymm3
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm22 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512BW-FCP-NEXT: vpshufb %ymm22, %ymm2, %ymm5
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm27 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512BW-FCP-NEXT: vpshufb %ymm27, %ymm3, %ymm11
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4],ymm5[5],ymm11[6,7]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1,2,3,4,5],ymm4[6,7]
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [1,3,5,7,5,7,6,7]
-; AVX512BW-FCP-NEXT: vpermd %ymm24, %ymm12, %ymm4
-; AVX512BW-FCP-NEXT: vpermd %ymm23, %ymm12, %ymm5
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm23 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512BW-FCP-NEXT: vpshufb %xmm23, %xmm4, %xmm13
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm24 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512BW-FCP-NEXT: vpshufb %xmm24, %xmm5, %xmm14
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1,2],xmm13[3]
-; AVX512BW-FCP-NEXT: vpsrlq $32, %zmm26, %zmm14
-; AVX512BW-FCP-NEXT: vpmovqb %zmm14, %xmm14
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3]
-; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm13, %zmm0, %zmm14
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm14 {%k1}
-; AVX512BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm11 # 32-byte Folded Reload
-; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm11, %ymm13
-; AVX512BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm17 # 32-byte Folded Reload
-; AVX512BW-FCP-NEXT: vpshufb %ymm10, %ymm17, %ymm15
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7]
-; AVX512BW-FCP-NEXT: vpermd %ymm31, %ymm6, %ymm16
-; AVX512BW-FCP-NEXT: vpermd %ymm21, %ymm6, %ymm15
-; AVX512BW-FCP-NEXT: vpshufb %ymm22, %ymm16, %ymm6
-; AVX512BW-FCP-NEXT: vpshufb %ymm27, %ymm15, %ymm10
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4],ymm6[5],ymm10[6,7]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm13[6,7]
-; AVX512BW-FCP-NEXT: vpermd %ymm28, %ymm12, %ymm13
-; AVX512BW-FCP-NEXT: vpermd %ymm25, %ymm12, %ymm18
-; AVX512BW-FCP-NEXT: vpshufb %xmm23, %xmm13, %xmm10
-; AVX512BW-FCP-NEXT: vpshufb %xmm24, %xmm18, %xmm12
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3]
-; AVX512BW-FCP-NEXT: vpsrlq $32, %zmm8, %zmm12
-; AVX512BW-FCP-NEXT: vpmovqb %zmm12, %xmm12
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7]
-; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm14, %zmm6
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm10
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm12
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5,6],ymm10[7]
+; AVX512BW-FCP-NEXT: vpermd %ymm25, %ymm6, %ymm0
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [9,11,10,11,9,11,13,15]
+; AVX512BW-FCP-NEXT: vpermd %zmm23, %zmm13, %zmm1
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512BW-FCP-NEXT: vpshufb %ymm14, %ymm0, %ymm2
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm25 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512BW-FCP-NEXT: vpshufb %ymm25, %ymm1, %ymm3
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5,6],ymm3[7]
+; AVX512BW-FCP-NEXT: vpermd %ymm29, %ymm6, %ymm2
+; AVX512BW-FCP-NEXT: vpermd %zmm16, %zmm13, %zmm3
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512BW-FCP-NEXT: vpshufb %ymm11, %ymm2, %ymm5
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm29 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512BW-FCP-NEXT: vpshufb %ymm29, %ymm3, %ymm7
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm7[5],ymm5[6,7]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0,1,2,3,4,5],ymm4[6,7]
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm23 = [9,11,13,15,13,15,14,15]
+; AVX512BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm4 # 64-byte Folded Reload
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [1,3,5,7,5,7,6,7]
+; AVX512BW-FCP-NEXT: vpermd %ymm22, %ymm16, %ymm5
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm15 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512BW-FCP-NEXT: vpshufb %xmm15, %xmm4, %xmm9
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm22 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512BW-FCP-NEXT: vpshufb %xmm22, %xmm5, %xmm8
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3]
+; AVX512BW-FCP-NEXT: vpsrlq $32, %zmm24, %zmm9
+; AVX512BW-FCP-NEXT: vpmovqb %zmm9, %xmm9
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
+; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm8
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm8 {%k1}
+; AVX512BW-FCP-NEXT: vpermd %ymm30, %ymm6, %ymm7
+; AVX512BW-FCP-NEXT: vpshufb %ymm14, %ymm7, %ymm14
+; AVX512BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm9 # 64-byte Folded Reload
+; AVX512BW-FCP-NEXT: vpshufb %ymm25, %ymm9, %ymm12
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6],ymm12[7]
+; AVX512BW-FCP-NEXT: vpermd %ymm26, %ymm6, %ymm12
+; AVX512BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm13 # 64-byte Folded Reload
+; AVX512BW-FCP-NEXT: vpshufb %ymm11, %ymm12, %ymm6
+; AVX512BW-FCP-NEXT: vpshufb %ymm29, %ymm13, %ymm11
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm11[5],ymm6[6,7]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm14[6,7]
+; AVX512BW-FCP-NEXT: vpermd %zmm21, %zmm23, %zmm14
+; AVX512BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm16, %ymm16 # 32-byte Folded Reload
+; AVX512BW-FCP-NEXT: vpshufb %xmm15, %xmm14, %xmm11
+; AVX512BW-FCP-NEXT: vpshufb %xmm22, %xmm16, %xmm15
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm15[0,1,2],xmm11[3]
+; AVX512BW-FCP-NEXT: vpsrlq $32, %zmm10, %zmm15
+; AVX512BW-FCP-NEXT: vpmovqb %zmm15, %xmm15
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm15[0,1],xmm11[2,3]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7]
+; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm22
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512BW-FCP-NEXT: vpshufb %ymm15, %ymm0, %ymm6
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm21 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512BW-FCP-NEXT: vpshufb %ymm21, %ymm1, %ymm8
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7]
; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm23 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512BW-FCP-NEXT: vpshufb %ymm23, %ymm2, %ymm12
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm20 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512BW-FCP-NEXT: vpshufb %ymm20, %ymm3, %ymm14
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5],ymm10[6,7]
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm21 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512BW-FCP-NEXT: vpshufb %xmm21, %xmm4, %xmm12
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm22 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512BW-FCP-NEXT: vpshufb %xmm22, %xmm5, %xmm14
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3]
-; AVX512BW-FCP-NEXT: vpsrlq $40, %zmm26, %zmm14
-; AVX512BW-FCP-NEXT: vpmovqb %zmm14, %xmm14
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
-; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm12
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm12 {%k1}
-; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm11, %ymm10
-; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm17, %ymm14
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3,4,5,6],ymm10[7]
-; AVX512BW-FCP-NEXT: vpshufb %ymm23, %ymm16, %ymm14
-; AVX512BW-FCP-NEXT: vpshufb %ymm20, %ymm15, %ymm9
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm14[5],ymm9[6,7]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7]
-; AVX512BW-FCP-NEXT: vpshufb %xmm21, %xmm13, %xmm10
-; AVX512BW-FCP-NEXT: vpshufb %xmm22, %xmm18, %xmm14
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm14[0,1,2],xmm10[3]
-; AVX512BW-FCP-NEXT: vpsrlq $40, %zmm8, %zmm14
-; AVX512BW-FCP-NEXT: vpmovqb %zmm14, %xmm14
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
-; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm21
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512BW-FCP-NEXT: vpshufb %ymm14, %ymm0, %ymm9
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm20 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512BW-FCP-NEXT: vpshufb %ymm20, %ymm1, %ymm10
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5,6],ymm9[7]
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm22 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512BW-FCP-NEXT: vpshufb %ymm22, %ymm2, %ymm10
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm3, %ymm12
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4],ymm10[5],ymm12[6,7]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7]
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm24 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512BW-FCP-NEXT: vpshufb %xmm24, %xmm4, %xmm10
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm23 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512BW-FCP-NEXT: vpshufb %xmm23, %xmm5, %xmm12
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3]
-; AVX512BW-FCP-NEXT: vpsrlq $48, %zmm26, %zmm12
-; AVX512BW-FCP-NEXT: vpmovqb %zmm12, %xmm12
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3]
-; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm0, %zmm10
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm10 {%k1}
-; AVX512BW-FCP-NEXT: vpshufb %ymm14, %ymm11, %ymm9
-; AVX512BW-FCP-NEXT: vpshufb %ymm20, %ymm17, %ymm12
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5,6],ymm9[7]
-; AVX512BW-FCP-NEXT: vpshufb %ymm22, %ymm16, %ymm12
-; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm15, %ymm14
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5],ymm9[6,7]
-; AVX512BW-FCP-NEXT: vpshufb %xmm24, %xmm13, %xmm12
-; AVX512BW-FCP-NEXT: vpshufb %xmm23, %xmm18, %xmm14
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3]
-; AVX512BW-FCP-NEXT: vpsrlq $48, %zmm8, %zmm14
-; AVX512BW-FCP-NEXT: vpmovqb %zmm14, %xmm14
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7]
-; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm9
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512BW-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm0
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm12 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512BW-FCP-NEXT: vpshufb %ymm12, %ymm1, %ymm1
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
-; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm1
-; AVX512BW-FCP-NEXT: vpshufb %ymm30, %ymm3, %ymm2
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
+; AVX512BW-FCP-NEXT: vpshufb %ymm23, %ymm2, %ymm8
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm26 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512BW-FCP-NEXT: vpshufb %ymm26, %ymm3, %ymm11
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm11[5],ymm8[6,7]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7]
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm29 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512BW-FCP-NEXT: vpshufb %xmm29, %xmm4, %xmm8
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm25 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512BW-FCP-NEXT: vpshufb %xmm25, %xmm5, %xmm11
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm11[0,1,2],xmm8[3]
+; AVX512BW-FCP-NEXT: vpsrlq $40, %zmm24, %zmm11
+; AVX512BW-FCP-NEXT: vpmovqb %zmm11, %xmm11
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm11[0,1],xmm8[2,3]
+; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm8
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm8 {%k1}
+; AVX512BW-FCP-NEXT: vpshufb %ymm15, %ymm7, %ymm6
+; AVX512BW-FCP-NEXT: vpshufb %ymm21, %ymm9, %ymm11
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm11[7]
+; AVX512BW-FCP-NEXT: vpshufb %ymm23, %ymm12, %ymm11
+; AVX512BW-FCP-NEXT: vpshufb %ymm26, %ymm13, %ymm15
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm15[5],ymm11[6,7]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5],ymm6[6,7]
+; AVX512BW-FCP-NEXT: vpshufb %xmm29, %xmm14, %xmm11
+; AVX512BW-FCP-NEXT: vpshufb %xmm25, %xmm16, %xmm15
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm15[0,1,2],xmm11[3]
+; AVX512BW-FCP-NEXT: vpsrlq $40, %zmm10, %zmm15
+; AVX512BW-FCP-NEXT: vpmovqb %zmm15, %xmm15
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm15[0,1],xmm11[2,3]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7]
+; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm21
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512BW-FCP-NEXT: vpshufb %ymm15, %ymm0, %ymm6
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm23 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512BW-FCP-NEXT: vpshufb %ymm23, %ymm1, %ymm8
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7]
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm26 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512BW-FCP-NEXT: vpshufb %ymm26, %ymm2, %ymm8
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm29 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512BW-FCP-NEXT: vpshufb %ymm29, %ymm3, %ymm11
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm11[5],ymm8[6,7]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7]
+; AVX512BW-FCP-NEXT: vpshufb %xmm31, %xmm4, %xmm8
+; AVX512BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm25 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512BW-FCP-NEXT: vpshufb %xmm25, %xmm5, %xmm11
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm11[0,1,2],xmm8[3]
+; AVX512BW-FCP-NEXT: vpsrlq $48, %zmm24, %zmm11
+; AVX512BW-FCP-NEXT: vpmovqb %zmm11, %xmm11
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm11[0,1],xmm8[2,3]
+; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm8
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm8 {%k1}
+; AVX512BW-FCP-NEXT: vpshufb %ymm15, %ymm7, %ymm6
+; AVX512BW-FCP-NEXT: vpshufb %ymm23, %ymm9, %ymm11
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm11[7]
+; AVX512BW-FCP-NEXT: vpshufb %ymm26, %ymm12, %ymm11
+; AVX512BW-FCP-NEXT: vpshufb %ymm29, %ymm13, %ymm15
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm15[5],ymm11[6,7]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5],ymm6[6,7]
+; AVX512BW-FCP-NEXT: vpshufb %xmm31, %xmm14, %xmm11
+; AVX512BW-FCP-NEXT: vpshufb %xmm25, %xmm16, %xmm15
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm15[0,1,2],xmm11[3]
+; AVX512BW-FCP-NEXT: vpsrlq $48, %zmm10, %zmm15
+; AVX512BW-FCP-NEXT: vpmovqb %zmm15, %xmm15
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm15[0,1],xmm11[2,3]
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7]
+; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6
+; AVX512BW-FCP-NEXT: vpshufb %ymm28, %ymm0, %ymm0
+; AVX512BW-FCP-NEXT: vpshufb %ymm27, %ymm1, %ymm1
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
+; AVX512BW-FCP-NEXT: vpshufb %ymm20, %ymm2, %ymm1
+; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm3, %ymm2
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7]
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-FCP-NEXT: vpshufb %xmm29, %xmm4, %xmm1
-; AVX512BW-FCP-NEXT: vpshufb %xmm19, %xmm5, %xmm2
+; AVX512BW-FCP-NEXT: vpshufb %xmm17, %xmm4, %xmm1
+; AVX512BW-FCP-NEXT: vpshufb %xmm18, %xmm5, %xmm2
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
-; AVX512BW-FCP-NEXT: vpsrlq $56, %zmm26, %zmm2
+; AVX512BW-FCP-NEXT: vpsrlq $56, %zmm24, %zmm2
; AVX512BW-FCP-NEXT: vpmovqb %zmm2, %xmm2
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1
; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm1 {%k1}
-; AVX512BW-FCP-NEXT: vpshufb %ymm10, %ymm11, %ymm0
-; AVX512BW-FCP-NEXT: vpshufb %ymm12, %ymm17, %ymm2
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
-; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm16, %ymm2
-; AVX512BW-FCP-NEXT: vpshufb %ymm30, %ymm15, %ymm3
-; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
+; AVX512BW-FCP-NEXT: vpshufb %ymm28, %ymm7, %ymm0
+; AVX512BW-FCP-NEXT: vpshufb %ymm27, %ymm9, %ymm2
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
+; AVX512BW-FCP-NEXT: vpshufb %ymm20, %ymm12, %ymm2
+; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm13, %ymm3
+; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
-; AVX512BW-FCP-NEXT: vpshufb %xmm29, %xmm13, %xmm2
-; AVX512BW-FCP-NEXT: vpshufb %xmm19, %xmm18, %xmm3
+; AVX512BW-FCP-NEXT: vpshufb %xmm17, %xmm14, %xmm2
+; AVX512BW-FCP-NEXT: vpshufb %xmm18, %xmm16, %xmm3
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3]
-; AVX512BW-FCP-NEXT: vpsrlq $56, %zmm8, %zmm3
+; AVX512BW-FCP-NEXT: vpsrlq $56, %zmm10, %zmm3
; AVX512BW-FCP-NEXT: vpmovqb %zmm3, %xmm3
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
@@ -16413,671 +16498,674 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-FCP-NEXT: vmovaps %zmm1, (%rdx)
; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovaps %zmm1, (%rcx)
-; AVX512BW-FCP-NEXT: vmovups (%rsp), %zmm1 # 64-byte Reload
+; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512BW-FCP-NEXT: vmovaps %zmm1, (%r8)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, (%r9)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, (%r9)
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, (%rax)
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, (%rax)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, (%rax)
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
-; AVX512BW-FCP-NEXT: addq $264, %rsp # imm = 0x108
+; AVX512BW-FCP-NEXT: addq $360, %rsp # imm = 0x168
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: load_i8_stride8_vf64:
; AVX512DQ-BW: # %bb.0:
-; AVX512DQ-BW-NEXT: subq $744, %rsp # imm = 0x2E8
-; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm5
-; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm0
-; AVX512DQ-BW-NEXT: vpmovqb %zmm0, %xmm0
-; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm1
-; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm2
-; AVX512DQ-BW-NEXT: vpmovqb %zmm2, %xmm2
-; AVX512DQ-BW-NEXT: vmovdqa64 496(%rdi), %xmm24
-; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm12 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
-; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm24, %xmm3
-; AVX512DQ-BW-NEXT: vmovdqa64 480(%rdi), %xmm25
-; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm25, %xmm4
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-BW-NEXT: vmovdqa64 464(%rdi), %xmm26
-; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm19 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
-; AVX512DQ-BW-NEXT: vpshufb %xmm19, %xmm26, %xmm4
-; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %xmm30
-; AVX512DQ-BW-NEXT: vpshufb %xmm19, %xmm30, %xmm6
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-BW-NEXT: vmovdqa 384(%rdi), %ymm4
-; AVX512DQ-BW-NEXT: vpmovqb %ymm4, %xmm4
+; AVX512DQ-BW-NEXT: subq $760, %rsp # imm = 0x2F8
+; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm3
+; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm19
+; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm9
+; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm29
+; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm6
+; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm4
+; AVX512DQ-BW-NEXT: vmovdqa 464(%rdi), %xmm2
+; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm14 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
+; AVX512DQ-BW-NEXT: vpshufb %xmm14, %xmm2, %xmm5
+; AVX512DQ-BW-NEXT: vmovdqa 448(%rdi), %xmm1
+; AVX512DQ-BW-NEXT: vpshufb %xmm14, %xmm1, %xmm7
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3]
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512DQ-BW-NEXT: vmovdqa 480(%rdi), %xmm10
+; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm15 = [0,8,0,8,0,8,0,8,0,8,0,8,0,8,0,8]
+; AVX512DQ-BW-NEXT: vpshufb %xmm15, %xmm10, %xmm7
+; AVX512DQ-BW-NEXT: vpmovqb %zmm4, %xmm4
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3]
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0,1,2,3,4,5],ymm3[6,7]
-; AVX512DQ-BW-NEXT: vmovdqa64 368(%rdi), %xmm31
-; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm31, %xmm4
-; AVX512DQ-BW-NEXT: vmovdqa64 %xmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-BW-NEXT: vmovdqa64 352(%rdi), %xmm27
-; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm27, %xmm6
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
-; AVX512DQ-BW-NEXT: vmovdqa64 336(%rdi), %xmm22
-; AVX512DQ-BW-NEXT: vpshufb %xmm19, %xmm22, %xmm11
-; AVX512DQ-BW-NEXT: vmovdqa64 %xmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-BW-NEXT: vmovdqa 320(%rdi), %xmm9
-; AVX512DQ-BW-NEXT: vpshufb %xmm19, %xmm9, %xmm15
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3]
-; AVX512DQ-BW-NEXT: vpmovqb %zmm1, %xmm11
-; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm15
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3]
-; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm10, %zmm0, %zmm20
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7]
+; AVX512DQ-BW-NEXT: vmovdqa 384(%rdi), %ymm5
+; AVX512DQ-BW-NEXT: vpmovqb %ymm5, %xmm5
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX512DQ-BW-NEXT: vmovdqa 416(%rdi), %ymm7
+; AVX512DQ-BW-NEXT: vpmovqb %ymm7, %xmm8
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,0,1]
+; AVX512DQ-BW-NEXT: vpmovqb %ymm7, %xmm7
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm7[5],ymm5[6,7]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-BW-NEXT: vmovdqa64 336(%rdi), %xmm31
+; AVX512DQ-BW-NEXT: vpshufb %xmm14, %xmm31, %xmm8
+; AVX512DQ-BW-NEXT: vmovdqa64 %xmm31, (%rsp) # 16-byte Spill
+; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %xmm23
+; AVX512DQ-BW-NEXT: vpshufb %xmm14, %xmm23, %xmm11
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm8[0],xmm11[1],xmm8[1],xmm11[2],xmm8[2],xmm11[3],xmm8[3]
+; AVX512DQ-BW-NEXT: vmovdqa 352(%rdi), %xmm4
+; AVX512DQ-BW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-BW-NEXT: vpshufb %xmm15, %xmm4, %xmm13
+; AVX512DQ-BW-NEXT: vpmovqb %zmm6, %xmm6
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm6 = xmm11[0,1,2],xmm6[3]
+; AVX512DQ-BW-NEXT: vpmovqb %zmm29, %xmm11
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm6 = xmm11[0,1],xmm6[2,3]
+; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm16
; AVX512DQ-BW-NEXT: movb $-64, %al
; AVX512DQ-BW-NEXT: kmovd %eax, %k1
-; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm20 {%k1}
-; AVX512DQ-BW-NEXT: vmovdqa64 240(%rdi), %xmm28
-; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm28, %xmm7
-; AVX512DQ-BW-NEXT: vmovdqa64 224(%rdi), %xmm18
-; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm18, %xmm10
-; AVX512DQ-BW-NEXT: vmovdqa64 %xmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3]
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
-; AVX512DQ-BW-NEXT: vmovdqa64 208(%rdi), %xmm17
-; AVX512DQ-BW-NEXT: vpshufb %xmm19, %xmm17, %xmm10
-; AVX512DQ-BW-NEXT: vmovdqa64 %xmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %xmm8
-; AVX512DQ-BW-NEXT: vpshufb %xmm19, %xmm8, %xmm16
+; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm16 {%k1}
+; AVX512DQ-BW-NEXT: vmovdqa 208(%rdi), %xmm0
+; AVX512DQ-BW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-BW-NEXT: vpshufb %xmm14, %xmm0, %xmm11
+; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %xmm0
+; AVX512DQ-BW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-BW-NEXT: vpshufb %xmm14, %xmm0, %xmm13
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3]
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512DQ-BW-NEXT: vmovdqa64 224(%rdi), %xmm30
+; AVX512DQ-BW-NEXT: vpshufb %xmm15, %xmm30, %xmm17
+; AVX512DQ-BW-NEXT: vpmovqb %zmm9, %xmm9
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm17[0],xmm9[0],xmm17[1],xmm9[1],xmm17[2],xmm9[2],xmm17[3],xmm9[3]
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7]
+; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm11
+; AVX512DQ-BW-NEXT: vpmovqb %ymm11, %xmm11
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
+; AVX512DQ-BW-NEXT: vmovdqa64 160(%rdi), %ymm17
+; AVX512DQ-BW-NEXT: vpmovqb %ymm17, %xmm18
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm17 = ymm17[2,3,0,1]
+; AVX512DQ-BW-NEXT: vpmovqb %ymm17, %xmm17
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm17 = xmm17[0],xmm18[0],xmm17[1],xmm18[1],xmm17[2],xmm18[2],xmm17[3],xmm18[3]
+; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm12
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm12[5],ymm11[6,7]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm11[0,1,2,3,4,5],ymm9[6,7]
+; AVX512DQ-BW-NEXT: vmovdqa 80(%rdi), %xmm13
+; AVX512DQ-BW-NEXT: vpshufb %xmm14, %xmm13, %xmm17
+; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %xmm0
+; AVX512DQ-BW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-BW-NEXT: vpshufb %xmm14, %xmm0, %xmm14
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm17[0],xmm14[1],xmm17[1],xmm14[2],xmm17[2],xmm14[3],xmm17[3]
+; AVX512DQ-BW-NEXT: vmovdqa64 96(%rdi), %xmm20
+; AVX512DQ-BW-NEXT: vpshufb %xmm15, %xmm20, %xmm15
+; AVX512DQ-BW-NEXT: vmovdqa64 %xmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-BW-NEXT: vpmovqb %zmm19, %xmm17
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm15[0],xmm17[0],xmm15[1],xmm17[1],xmm15[2],xmm17[2],xmm15[3],xmm17[3]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm15[3]
+; AVX512DQ-BW-NEXT: vpmovqb %zmm3, %xmm15
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7]
+; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm16, %zmm0
+; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %xmm5
+; AVX512DQ-BW-NEXT: vmovdqa 160(%rdi), %xmm8
; AVX512DQ-BW-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm16[0],xmm10[0],xmm16[1],xmm10[1],xmm16[2],xmm10[2],xmm16[3],xmm10[3]
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5,6],ymm7[7]
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm10
-; AVX512DQ-BW-NEXT: vpmovqb %ymm10, %xmm10
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm0[5],ymm10[6,7]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm7[6,7]
-; AVX512DQ-BW-NEXT: vmovdqa 112(%rdi), %xmm10
-; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm10, %xmm16
-; AVX512DQ-BW-NEXT: vmovdqa 96(%rdi), %xmm14
-; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm14, %xmm12
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm16[0],xmm12[1],xmm16[1],xmm12[2],xmm16[2],xmm12[3],xmm16[3]
-; AVX512DQ-BW-NEXT: vmovdqa 80(%rdi), %xmm2
+; AVX512DQ-BW-NEXT: vmovdqa 240(%rdi), %xmm9
+; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %xmm17
+; AVX512DQ-BW-NEXT: vmovdqa64 400(%rdi), %xmm26
+; AVX512DQ-BW-NEXT: vmovdqa 416(%rdi), %xmm6
+; AVX512DQ-BW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-BW-NEXT: vmovdqa64 432(%rdi), %xmm19
+; AVX512DQ-BW-NEXT: vmovdqa64 496(%rdi), %xmm21
+; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm21, %xmm12
+; AVX512DQ-BW-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm10, %xmm22
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm22[0],xmm12[0],xmm22[1],xmm12[1],xmm22[2],xmm12[2],xmm22[3],xmm12[3]
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm28 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX512DQ-BW-NEXT: vmovdqa64 %xmm2, %xmm16
; AVX512DQ-BW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-BW-NEXT: vpshufb %xmm19, %xmm2, %xmm21
-; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %xmm2
+; AVX512DQ-BW-NEXT: vpshufb %xmm28, %xmm2, %xmm22
+; AVX512DQ-BW-NEXT: vmovdqa %xmm1, %xmm4
+; AVX512DQ-BW-NEXT: vpshufb %xmm28, %xmm1, %xmm24
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm22 = xmm24[0],xmm22[0],xmm24[1],xmm22[1],xmm24[2],xmm22[2],xmm24[3],xmm22[3]
+; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm15
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3,4,5,6],ymm12[7]
+; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm25 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX512DQ-BW-NEXT: vpshufb %xmm25, %xmm19, %xmm22
+; AVX512DQ-BW-NEXT: vpshufb %xmm25, %xmm6, %xmm24
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm22 = xmm24[0],xmm22[0],xmm24[1],xmm22[1],xmm24[2],xmm22[2],xmm24[3],xmm22[3]
+; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm15
+; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm27 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
+; AVX512DQ-BW-NEXT: vpshufb %xmm27, %xmm26, %xmm22
+; AVX512DQ-BW-NEXT: vpshufb %xmm27, %xmm17, %xmm24
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm22 = xmm24[0],xmm22[0],xmm24[1],xmm22[1],xmm24[2],xmm22[2],xmm24[3],xmm22[3]
+; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm22, %ymm0, %ymm1
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm15[5],ymm1[6,7]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm12[6,7]
+; AVX512DQ-BW-NEXT: vmovdqa 368(%rdi), %xmm7
+; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm7, %xmm12
+; AVX512DQ-BW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm11, %xmm15
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3]
+; AVX512DQ-BW-NEXT: vpshufb %xmm28, %xmm31, %xmm15
+; AVX512DQ-BW-NEXT: vmovdqa64 %xmm23, %xmm6
+; AVX512DQ-BW-NEXT: vpshufb %xmm28, %xmm23, %xmm22
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm22[0],xmm15[0],xmm22[1],xmm15[1],xmm22[2],xmm15[2],xmm22[3],xmm15[3]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1,2],xmm12[3]
+; AVX512DQ-BW-NEXT: vpsrlq $8, %zmm29, %zmm15
+; AVX512DQ-BW-NEXT: vpmovqb %zmm15, %xmm15
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm12 = xmm15[0,1],xmm12[2,3]
+; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm12
+; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm12 {%k1}
+; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm9, %xmm1
+; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm30, %xmm15
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3]
+; AVX512DQ-BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm18 # 16-byte Reload
+; AVX512DQ-BW-NEXT: vpshufb %xmm28, %xmm18, %xmm15
+; AVX512DQ-BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm23 # 16-byte Reload
+; AVX512DQ-BW-NEXT: vpshufb %xmm28, %xmm23, %xmm22
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm22[0],xmm15[0],xmm22[1],xmm15[1],xmm22[2],xmm15[2],xmm22[3],xmm15[3]
+; AVX512DQ-BW-NEXT: vmovdqa 176(%rdi), %xmm2
; AVX512DQ-BW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-BW-NEXT: vpshufb %xmm19, %xmm2, %xmm19
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm21[0],xmm19[1],xmm21[1],xmm19[2],xmm21[2],xmm19[3],xmm21[3]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm13[0,1,2],xmm0[3]
-; AVX512DQ-BW-NEXT: vpmovqb %zmm5, %xmm13
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,3]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0
-; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT: vmovdqa 160(%rdi), %xmm6
-; AVX512DQ-BW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-BW-NEXT: vmovdqa 384(%rdi), %xmm7
-; AVX512DQ-BW-NEXT: vmovdqa64 400(%rdi), %xmm21
-; AVX512DQ-BW-NEXT: vmovdqa64 416(%rdi), %xmm23
-; AVX512DQ-BW-NEXT: vmovdqa64 432(%rdi), %xmm29
-; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX512DQ-BW-NEXT: vmovdqa64 %xmm24, %xmm19
-; AVX512DQ-BW-NEXT: vpshufb %xmm4, %xmm24, %xmm1
-; AVX512DQ-BW-NEXT: vmovdqa64 %xmm25, %xmm11
-; AVX512DQ-BW-NEXT: vpshufb %xmm4, %xmm25, %xmm13
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3]
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm13 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX512DQ-BW-NEXT: vmovdqa64 %xmm26, %xmm12
-; AVX512DQ-BW-NEXT: vpshufb %xmm13, %xmm26, %xmm24
-; AVX512DQ-BW-NEXT: vmovdqa64 %xmm30, %xmm16
-; AVX512DQ-BW-NEXT: vmovdqa64 %xmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-BW-NEXT: vpshufb %xmm13, %xmm30, %xmm25
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm24 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3]
-; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm24, %ymm0, %ymm2
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm26 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX512DQ-BW-NEXT: vpshufb %xmm26, %xmm29, %xmm24
-; AVX512DQ-BW-NEXT: vpshufb %xmm26, %xmm23, %xmm25
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm24 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3]
-; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm24, %ymm0, %ymm2
-; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm30 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
-; AVX512DQ-BW-NEXT: vpshufb %xmm30, %xmm21, %xmm24
-; AVX512DQ-BW-NEXT: vpshufb %xmm30, %xmm7, %xmm25
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm24 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3]
-; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm24, %ymm0, %ymm3
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-BW-NEXT: vpshufb %xmm25, %xmm2, %xmm15
+; AVX512DQ-BW-NEXT: vpshufb %xmm25, %xmm8, %xmm25
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm25[0],xmm15[0],xmm25[1],xmm15[1],xmm25[2],xmm15[2],xmm25[3],xmm15[3]
+; AVX512DQ-BW-NEXT: vmovdqa64 144(%rdi), %xmm22
+; AVX512DQ-BW-NEXT: vpshufb %xmm27, %xmm22, %xmm31
+; AVX512DQ-BW-NEXT: vmovdqa64 %xmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-BW-NEXT: vpshufb %xmm27, %xmm5, %xmm27
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm27 = xmm27[0],xmm31[0],xmm27[1],xmm31[1],xmm27[2],xmm31[2],xmm27[3],xmm31[3]
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm27, %ymm0, %ymm2
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm15[5],ymm2[6,7]
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-BW-NEXT: vpshufb %xmm4, %xmm31, %xmm2
-; AVX512DQ-BW-NEXT: vpshufb %xmm4, %xmm27, %xmm3
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512DQ-BW-NEXT: vpshufb %xmm13, %xmm22, %xmm3
-; AVX512DQ-BW-NEXT: vpshufb %xmm13, %xmm9, %xmm24
-; AVX512DQ-BW-NEXT: vmovdqa64 %xmm9, %xmm31
-; AVX512DQ-BW-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm24[0],xmm3[0],xmm24[1],xmm3[1],xmm24[2],xmm3[2],xmm24[3],xmm3[3]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3]
-; AVX512DQ-BW-NEXT: vpsrlq $8, %zmm15, %zmm3
-; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm22
-; AVX512DQ-BW-NEXT: vpmovqb %zmm3, %xmm3
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
-; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2
-; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1}
-; AVX512DQ-BW-NEXT: vmovdqa64 %xmm28, %xmm9
-; AVX512DQ-BW-NEXT: vpshufb %xmm4, %xmm28, %xmm1
-; AVX512DQ-BW-NEXT: vpshufb %xmm4, %xmm18, %xmm3
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512DQ-BW-NEXT: vpshufb %xmm13, %xmm17, %xmm3
-; AVX512DQ-BW-NEXT: vpshufb %xmm13, %xmm8, %xmm24
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm24[0],xmm3[0],xmm24[1],xmm3[1],xmm24[2],xmm3[2],xmm24[3],xmm3[3]
-; AVX512DQ-BW-NEXT: vmovdqa 176(%rdi), %xmm0
-; AVX512DQ-BW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-BW-NEXT: vmovdqa64 112(%rdi), %xmm27
+; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm27, %xmm2
+; AVX512DQ-BW-NEXT: vmovdqa64 %xmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm20, %xmm0
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX512DQ-BW-NEXT: vpshufb %xmm28, %xmm13, %xmm2
+; AVX512DQ-BW-NEXT: vmovdqa64 %xmm13, %xmm24
+; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; AVX512DQ-BW-NEXT: vpshufb %xmm28, %xmm14, %xmm15
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm15[0],xmm2[0],xmm15[1],xmm2[1],xmm15[2],xmm2[2],xmm15[3],xmm2[3]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
+; AVX512DQ-BW-NEXT: vpsrlq $8, %zmm3, %zmm2
+; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm8
+; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT: vpmovqb %zmm2, %xmm2
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0
+; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX512DQ-BW-NEXT: vmovdqa64 %xmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm21, %xmm1
+; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm10, %xmm2
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm16, %xmm15
+; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm4, %xmm28
+; AVX512DQ-BW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm28[0],xmm15[0],xmm28[1],xmm15[1],xmm28[2],xmm15[2],xmm28[3],xmm15[3]
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-BW-NEXT: vpshufb %xmm26, %xmm0, %xmm3
-; AVX512DQ-BW-NEXT: vpshufb %xmm26, %xmm6, %xmm25
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm25[0],xmm3[0],xmm25[1],xmm3[1],xmm25[2],xmm3[2],xmm25[3],xmm3[3]
-; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %xmm6
-; AVX512DQ-BW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-BW-NEXT: vmovdqa 144(%rdi), %xmm0
-; AVX512DQ-BW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-BW-NEXT: vpshufb %xmm30, %xmm0, %xmm0
-; AVX512DQ-BW-NEXT: vpshufb %xmm30, %xmm6, %xmm30
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm30[0],xmm0[0],xmm30[1],xmm0[1],xmm30[2],xmm0[2],xmm30[3],xmm0[3]
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm15 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX512DQ-BW-NEXT: vpshufb %xmm15, %xmm19, %xmm28
+; AVX512DQ-BW-NEXT: vmovdqa64 %xmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm20 # 16-byte Reload
+; AVX512DQ-BW-NEXT: vpshufb %xmm15, %xmm20, %xmm31
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm28 = xmm31[0],xmm28[0],xmm31[1],xmm28[1],xmm31[2],xmm28[2],xmm31[3],xmm28[3]
+; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm31 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
+; AVX512DQ-BW-NEXT: vpshufb %xmm31, %xmm26, %xmm0
+; AVX512DQ-BW-NEXT: vmovdqa64 %xmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-BW-NEXT: vmovdqa64 %xmm17, %xmm10
+; AVX512DQ-BW-NEXT: vpshufb %xmm31, %xmm17, %xmm12
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3]
+; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm12
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm12[5],ymm0[6,7]
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-BW-NEXT: vpshufb %xmm4, %xmm10, %xmm1
-; AVX512DQ-BW-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-BW-NEXT: vpshufb %xmm4, %xmm14, %xmm3
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; AVX512DQ-BW-NEXT: vpshufb %xmm13, %xmm15, %xmm3
-; AVX512DQ-BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm28 # 16-byte Reload
-; AVX512DQ-BW-NEXT: vpshufb %xmm13, %xmm28, %xmm4
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX512DQ-BW-NEXT: vpsrlq $8, %zmm5, %zmm3
-; AVX512DQ-BW-NEXT: vpmovqb %zmm3, %xmm3
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
-; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX512DQ-BW-NEXT: vpshufb %xmm1, %xmm19, %xmm2
-; AVX512DQ-BW-NEXT: vmovdqa64 %xmm19, %xmm20
-; AVX512DQ-BW-NEXT: vpshufb %xmm1, %xmm11, %xmm3
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm12, %xmm4
-; AVX512DQ-BW-NEXT: vmovdqa64 %xmm12, %xmm25
-; AVX512DQ-BW-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm16, %xmm13
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3]
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7]
-; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX512DQ-BW-NEXT: vmovdqa64 %xmm29, %xmm24
-; AVX512DQ-BW-NEXT: vpshufb %xmm4, %xmm29, %xmm13
-; AVX512DQ-BW-NEXT: vpshufb %xmm4, %xmm23, %xmm30
-; AVX512DQ-BW-NEXT: vmovdqa64 %xmm23, %xmm16
-; AVX512DQ-BW-NEXT: vmovdqa64 %xmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm30[0],xmm13[0],xmm30[1],xmm13[1],xmm30[2],xmm13[2],xmm30[3],xmm13[3]
-; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm30 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
-; AVX512DQ-BW-NEXT: vmovdqa64 %xmm21, %xmm18
-; AVX512DQ-BW-NEXT: vpshufb %xmm30, %xmm21, %xmm0
-; AVX512DQ-BW-NEXT: vmovdqa64 %xmm7, %xmm17
-; AVX512DQ-BW-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-BW-NEXT: vpshufb %xmm30, %xmm7, %xmm19
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm19[0],xmm0[0],xmm19[1],xmm0[1],xmm19[2],xmm0[2],xmm19[3],xmm0[3]
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm7, %xmm1
+; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm11, %xmm12
+; AVX512DQ-BW-NEXT: vmovdqa %xmm11, %xmm13
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3]
+; AVX512DQ-BW-NEXT: vmovdqa64 (%rsp), %xmm17 # 16-byte Reload
+; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm17, %xmm12
+; AVX512DQ-BW-NEXT: vmovdqa %xmm6, %xmm11
+; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm6, %xmm28
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm28[0],xmm12[0],xmm28[1],xmm12[1],xmm28[2],xmm12[2],xmm28[3],xmm12[3]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0,1,2],xmm1[3]
+; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm7
+; AVX512DQ-BW-NEXT: vpsrlq $16, %zmm29, %zmm12
+; AVX512DQ-BW-NEXT: vpmovqb %zmm12, %xmm12
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3]
+; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1
+; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm1 {%k1}
+; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm9, %xmm0
+; AVX512DQ-BW-NEXT: vmovdqa64 %xmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm30, %xmm12
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3]
+; AVX512DQ-BW-NEXT: vmovdqa64 %xmm18, %xmm16
+; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm18, %xmm12
+; AVX512DQ-BW-NEXT: vmovdqa64 %xmm23, %xmm29
+; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm23, %xmm28
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm28[0],xmm12[0],xmm28[1],xmm12[1],xmm28[2],xmm12[2],xmm28[3],xmm12[3]
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5],ymm0[6,7]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
-; AVX512DQ-BW-NEXT: vpshufb %xmm1, %xmm12, %xmm2
-; AVX512DQ-BW-NEXT: vpshufb %xmm1, %xmm27, %xmm13
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3]
-; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm8, %xmm13
-; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm31, %xmm19
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm13[0],xmm19[1],xmm13[1],xmm19[2],xmm13[2],xmm19[3],xmm13[3]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm13[0,1,2],xmm2[3]
-; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm6
-; AVX512DQ-BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT: vpsrlq $16, %zmm22, %zmm13
-; AVX512DQ-BW-NEXT: vpmovqb %zmm13, %xmm13
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3]
-; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2
-; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 {%k1}
-; AVX512DQ-BW-NEXT: vpshufb %xmm1, %xmm9, %xmm0
-; AVX512DQ-BW-NEXT: vmovdqa64 %xmm9, %xmm26
-; AVX512DQ-BW-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm21 # 16-byte Reload
-; AVX512DQ-BW-NEXT: vpshufb %xmm1, %xmm21, %xmm13
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3]
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5,6],ymm0[7]
+; AVX512DQ-BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm25 # 16-byte Reload
+; AVX512DQ-BW-NEXT: vpshufb %xmm15, %xmm25, %xmm12
+; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX512DQ-BW-NEXT: vpshufb %xmm15, %xmm6, %xmm15
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3]
+; AVX512DQ-BW-NEXT: vpshufb %xmm31, %xmm22, %xmm15
+; AVX512DQ-BW-NEXT: vpshufb %xmm31, %xmm5, %xmm28
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm28[0],xmm15[0],xmm28[1],xmm15[1],xmm28[2],xmm15[2],xmm28[3],xmm15[3]
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3,4],ymm12[5],ymm15[6,7]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm27, %xmm12
; AVX512DQ-BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm22 # 16-byte Reload
-; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm22, %xmm13
-; AVX512DQ-BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm29 # 16-byte Reload
-; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm29, %xmm19
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm13[0],xmm19[1],xmm13[1],xmm19[2],xmm13[2],xmm19[3],xmm13[3]
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX512DQ-BW-NEXT: vpshufb %xmm4, %xmm9, %xmm13
-; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX512DQ-BW-NEXT: vpshufb %xmm4, %xmm7, %xmm4
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3]
-; AVX512DQ-BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm31 # 16-byte Reload
-; AVX512DQ-BW-NEXT: vpshufb %xmm30, %xmm31, %xmm13
-; AVX512DQ-BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm23 # 16-byte Reload
-; AVX512DQ-BW-NEXT: vpshufb %xmm30, %xmm23, %xmm19
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm13[0],xmm19[1],xmm13[1],xmm19[2],xmm13[2],xmm19[3],xmm13[3]
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3,4],ymm4[5],ymm13[6,7]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-BW-NEXT: vpshufb %xmm1, %xmm10, %xmm4
-; AVX512DQ-BW-NEXT: vpshufb %xmm1, %xmm14, %xmm1
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm15, %xmm4
-; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm28, %xmm3
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX512DQ-BW-NEXT: vpsrlq $16, %zmm5, %zmm3
-; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm28
+; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm22, %xmm3
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3]
+; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm24, %xmm12
+; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm14, %xmm2
+; AVX512DQ-BW-NEXT: vmovdqa64 %xmm14, %xmm27
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3]
+; AVX512DQ-BW-NEXT: vpsrlq $16, %zmm8, %zmm3
; AVX512DQ-BW-NEXT: vpmovqb %zmm3, %xmm3
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX512DQ-BW-NEXT: vmovdqa64 %xmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm20, %xmm1
-; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm11, %xmm2
+; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm21, %xmm1
+; AVX512DQ-BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm18 # 16-byte Reload
+; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm18, %xmm2
; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm25, %xmm3
-; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm4, %xmm4
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512DQ-BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm21 # 16-byte Reload
+; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm21, %xmm3
+; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm4, %xmm12
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm12[0],xmm3[0],xmm12[1],xmm3[1],xmm12[2],xmm3[2],xmm12[3],xmm3[3]
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm24, %xmm4
-; AVX512DQ-BW-NEXT: vmovdqa64 %xmm24, (%rsp) # 16-byte Spill
-; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm16, %xmm19
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm19[0],xmm4[0],xmm19[1],xmm4[1],xmm19[2],xmm4[2],xmm19[3],xmm4[3]
-; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm19 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
-; AVX512DQ-BW-NEXT: vpshufb %xmm19, %xmm18, %xmm30
-; AVX512DQ-BW-NEXT: vpshufb %xmm19, %xmm17, %xmm13
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm30[0],xmm13[1],xmm30[1],xmm13[2],xmm30[2],xmm13[3],xmm30[3]
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3,4],ymm4[5],ymm13[6,7]
+; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm19, %xmm12
+; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm20, %xmm15
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3]
+; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm15 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
+; AVX512DQ-BW-NEXT: vpshufb %xmm15, %xmm26, %xmm31
+; AVX512DQ-BW-NEXT: vpshufb %xmm15, %xmm10, %xmm28
+; AVX512DQ-BW-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm28 = xmm28[0],xmm31[0],xmm28[1],xmm31[1],xmm28[2],xmm31[2],xmm28[3],xmm31[3]
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm4
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm12[5],ymm4[6,7]
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-BW-NEXT: vmovdqa %xmm12, %xmm10
-; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm12, %xmm4
-; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm27, %xmm13
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3]
-; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm8, %xmm13
-; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm8, %xmm30
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm30[0],xmm13[0],xmm30[1],xmm13[1],xmm30[2],xmm13[2],xmm30[3],xmm13[3]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm13[0,1,2],xmm4[3]
-; AVX512DQ-BW-NEXT: vpsrlq $24, %zmm6, %zmm13
-; AVX512DQ-BW-NEXT: vpmovqb %zmm13, %xmm13
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm13[0,1],xmm4[2,3]
+; AVX512DQ-BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm23 # 16-byte Reload
+; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm23, %xmm4
+; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm13, %xmm12
+; AVX512DQ-BW-NEXT: vmovdqa64 %xmm13, %xmm26
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3]
+; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm17, %xmm12
+; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm11, %xmm28
+; AVX512DQ-BW-NEXT: vmovdqa %xmm11, %xmm14
+; AVX512DQ-BW-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm28[0],xmm12[0],xmm28[1],xmm12[1],xmm28[2],xmm12[2],xmm28[3],xmm12[3]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm12[0,1,2],xmm4[3]
+; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm11
+; AVX512DQ-BW-NEXT: vpsrlq $24, %zmm7, %zmm12
+; AVX512DQ-BW-NEXT: vpmovqb %zmm12, %xmm12
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm12[0,1],xmm4[2,3]
; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm4, %zmm0, %zmm4
; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm4 {%k1}
-; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm26, %xmm1
-; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm21, %xmm13
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3]
-; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm22, %xmm13
-; AVX512DQ-BW-NEXT: vmovdqa64 %xmm22, %xmm21
-; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm29, %xmm30
-; AVX512DQ-BW-NEXT: vmovdqa64 %xmm29, %xmm22
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm30[0],xmm13[0],xmm30[1],xmm13[1],xmm30[2],xmm13[2],xmm30[3],xmm13[3]
+; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm9, %xmm1
+; AVX512DQ-BW-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm30, %xmm12
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3]
+; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm16, %xmm12
+; AVX512DQ-BW-NEXT: vmovdqa64 %xmm29, %xmm13
+; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm29, %xmm28
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm28[0],xmm12[0],xmm28[1],xmm12[1],xmm28[2],xmm12[2],xmm28[3],xmm12[3]
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm9, %xmm13
-; AVX512DQ-BW-NEXT: vmovdqa64 %xmm9, %xmm25
-; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm7, %xmm3
-; AVX512DQ-BW-NEXT: vmovdqa %xmm7, %xmm9
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3]
-; AVX512DQ-BW-NEXT: vpshufb %xmm19, %xmm31, %xmm13
-; AVX512DQ-BW-NEXT: vpshufb %xmm19, %xmm23, %xmm19
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm13[0],xmm19[1],xmm13[1],xmm19[2],xmm13[2],xmm19[3],xmm13[3]
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm25, %xmm12
+; AVX512DQ-BW-NEXT: vmovdqa64 %xmm25, %xmm29
+; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm6, %xmm3
+; AVX512DQ-BW-NEXT: vmovdqa %xmm6, %xmm8
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3]
+; AVX512DQ-BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm19 # 16-byte Reload
+; AVX512DQ-BW-NEXT: vpshufb %xmm15, %xmm19, %xmm12
+; AVX512DQ-BW-NEXT: vpshufb %xmm15, %xmm5, %xmm15
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3]
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4],ymm3[5],ymm13[6,7]
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3,4],ymm3[5],ymm12[6,7]
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm5, %xmm3
-; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm14, %xmm0
-; AVX512DQ-BW-NEXT: vmovdqa64 %xmm14, %xmm23
+; AVX512DQ-BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm30 # 16-byte Reload
+; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm30, %xmm3
+; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm22, %xmm0
; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm15, %xmm3
-; AVX512DQ-BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm29 # 16-byte Reload
-; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm29, %xmm2
+; AVX512DQ-BW-NEXT: vmovdqa64 %xmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm24, %xmm3
+; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm27, %xmm2
; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
-; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm17
-; AVX512DQ-BW-NEXT: vpsrlq $24, %zmm28, %zmm2
+; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQ-BW-NEXT: vpsrlq $24, %zmm7, %zmm2
; AVX512DQ-BW-NEXT: vpmovqb %zmm2, %xmm2
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm20, %xmm1
-; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm11, %xmm3
-; AVX512DQ-BW-NEXT: vmovdqa64 %xmm11, %xmm28
-; AVX512DQ-BW-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
-; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm12, %xmm4
-; AVX512DQ-BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm16 # 16-byte Reload
-; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm16, %xmm13
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3]
+; AVX512DQ-BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm27 # 16-byte Reload
+; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm27, %xmm1
+; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm18, %xmm2
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm21, %xmm3
+; AVX512DQ-BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm20 # 16-byte Reload
+; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm20, %xmm4
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX512DQ-BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm25 # 16-byte Reload
+; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm25, %xmm4
+; AVX512DQ-BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm22 # 16-byte Reload
+; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm22, %xmm12
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3]
+; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm12 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
+; AVX512DQ-BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm21 # 16-byte Reload
+; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm21, %xmm28
+; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm10, %xmm31
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm28 = xmm31[0],xmm28[0],xmm31[1],xmm28[1],xmm31[2],xmm28[2],xmm31[3],xmm28[3]
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX512DQ-BW-NEXT: vpshufb %xmm4, %xmm24, %xmm13
-; AVX512DQ-BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm26 # 16-byte Reload
-; AVX512DQ-BW-NEXT: vpshufb %xmm4, %xmm26, %xmm19
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm13[0],xmm19[1],xmm13[1],xmm19[2],xmm13[2],xmm19[3],xmm13[3]
-; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm19 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
-; AVX512DQ-BW-NEXT: vpshufb %xmm19, %xmm18, %xmm30
-; AVX512DQ-BW-NEXT: vmovdqa64 %xmm18, %xmm24
-; AVX512DQ-BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm18 # 16-byte Reload
-; AVX512DQ-BW-NEXT: vpshufb %xmm19, %xmm18, %xmm2
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm30[0],xmm2[1],xmm30[1],xmm2[2],xmm30[2],xmm2[3],xmm30[3]
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
+; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm15
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1,2,3,4],ymm4[5],ymm15[6,7]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm23, %xmm4
+; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm26, %xmm15
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3]
+; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm17, %xmm15
+; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm14, %xmm28
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm28[0],xmm15[0],xmm28[1],xmm15[1],xmm28[2],xmm15[2],xmm28[3],xmm15[3]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm15[0,1,2],xmm4[3]
+; AVX512DQ-BW-NEXT: vpsrlq $32, %zmm11, %zmm15
+; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm10
+; AVX512DQ-BW-NEXT: vpmovqb %zmm15, %xmm15
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm15[0,1],xmm4[2,3]
+; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm4, %zmm0, %zmm4
+; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm4 {%k1}
+; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm9, %xmm1
+; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm6, %xmm15
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3]
+; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm16, %xmm15
+; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm13, %xmm28
+; AVX512DQ-BW-NEXT: vmovdqa %xmm13, %xmm14
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm28[0],xmm15[0],xmm28[1],xmm15[1],xmm28[2],xmm15[2],xmm28[3],xmm15[3]
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm29, %xmm15
+; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm8, %xmm3
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
+; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm19, %xmm15
+; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm5, %xmm12
+; AVX512DQ-BW-NEXT: vmovdqa64 %xmm5, %xmm19
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3]
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3,4],ymm3[5],ymm12[6,7]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm30, %xmm3
+; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm13, %xmm0
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm24, %xmm3
+; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm9, %xmm2
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
+; AVX512DQ-BW-NEXT: vpsrlq $32, %zmm7, %zmm2
+; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm17
+; AVX512DQ-BW-NEXT: vpmovqb %zmm2, %xmm2
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX512DQ-BW-NEXT: vmovdqa64 %xmm27, %xmm24
+; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm27, %xmm2
+; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm18, %xmm3
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm8, %xmm4
+; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm20, %xmm12
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3]
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm13[5],ymm2[6,7]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-BW-NEXT: vmovdqa %xmm10, %xmm6
-; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm10, %xmm2
-; AVX512DQ-BW-NEXT: vmovdqa64 %xmm27, %xmm7
-; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm27, %xmm13
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3]
-; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm10, %xmm13
-; AVX512DQ-BW-NEXT: vmovdqa64 %xmm8, %xmm27
-; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm8, %xmm30
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm30[0],xmm13[0],xmm30[1],xmm13[1],xmm30[2],xmm13[2],xmm30[3],xmm13[3]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm13[0,1,2],xmm2[3]
-; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
-; AVX512DQ-BW-NEXT: vpsrlq $32, %zmm10, %zmm13
-; AVX512DQ-BW-NEXT: vpmovqb %zmm13, %xmm13
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3]
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7]
+; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX512DQ-BW-NEXT: vmovdqa64 %xmm25, %xmm23
+; AVX512DQ-BW-NEXT: vpshufb %xmm4, %xmm25, %xmm12
+; AVX512DQ-BW-NEXT: vpshufb %xmm4, %xmm22, %xmm15
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3]
+; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm15 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
+; AVX512DQ-BW-NEXT: vpshufb %xmm15, %xmm21, %xmm28
+; AVX512DQ-BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm29 # 16-byte Reload
+; AVX512DQ-BW-NEXT: vpshufb %xmm15, %xmm29, %xmm31
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm28 = xmm31[0],xmm28[0],xmm31[1],xmm28[1],xmm31[2],xmm28[2],xmm31[3],xmm28[3]
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm28, %ymm0, %ymm1
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm12[5],ymm1[6,7]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
+; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm11, %xmm2
+; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm26, %xmm12
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3]
+; AVX512DQ-BW-NEXT: vmovdqa64 (%rsp), %xmm26 # 16-byte Reload
+; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm26, %xmm12
+; AVX512DQ-BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm25 # 16-byte Reload
+; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm25, %xmm28
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm28[0],xmm12[0],xmm28[1],xmm12[1],xmm28[2],xmm12[2],xmm28[3],xmm12[3]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm12[0,1,2],xmm2[3]
+; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm5
+; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-NEXT: vpsrlq $40, %zmm10, %zmm12
+; AVX512DQ-BW-NEXT: vpmovqb %zmm12, %xmm12
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm12[0,1],xmm2[2,3]
; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm2
; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1}
-; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm14, %xmm1
-; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm11, %xmm13
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1],xmm13[2],xmm1[2],xmm13[3],xmm1[3]
-; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm21, %xmm13
-; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm22, %xmm30
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm30[0],xmm13[0],xmm30[1],xmm13[1],xmm30[2],xmm13[2],xmm30[3],xmm13[3]
+; AVX512DQ-BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm20 # 16-byte Reload
+; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm20, %xmm1
+; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm6, %xmm12
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3]
+; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm16, %xmm12
+; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm14, %xmm28
+; AVX512DQ-BW-NEXT: vmovdqa64 %xmm14, %xmm31
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm28[0],xmm12[0],xmm28[1],xmm12[1],xmm28[2],xmm12[2],xmm28[3],xmm12[3]
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-BW-NEXT: vpshufb %xmm4, %xmm25, %xmm13
-; AVX512DQ-BW-NEXT: vpshufb %xmm4, %xmm9, %xmm4
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3]
-; AVX512DQ-BW-NEXT: vpshufb %xmm19, %xmm31, %xmm13
-; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX512DQ-BW-NEXT: vpshufb %xmm19, %xmm8, %xmm19
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm19[0],xmm13[0],xmm19[1],xmm13[1],xmm19[2],xmm13[2],xmm19[3],xmm13[3]
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm27 # 16-byte Reload
+; AVX512DQ-BW-NEXT: vpshufb %xmm4, %xmm27, %xmm12
+; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX512DQ-BW-NEXT: vpshufb %xmm4, %xmm10, %xmm4
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3]
+; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; AVX512DQ-BW-NEXT: vpshufb %xmm15, %xmm14, %xmm12
+; AVX512DQ-BW-NEXT: vpshufb %xmm15, %xmm19, %xmm15
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3]
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3,4],ymm4[5],ymm13[6,7]
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3,4],ymm4[5],ymm12[6,7]
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm5, %xmm4
-; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm23, %xmm0
+; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm30, %xmm4
+; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm13, %xmm0
; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm15, %xmm4
-; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm29, %xmm3
+; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm7, %xmm4
+; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm9, %xmm3
; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3]
-; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm20
-; AVX512DQ-BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT: vpsrlq $32, %zmm17, %zmm3
+; AVX512DQ-BW-NEXT: vpsrlq $40, %zmm17, %zmm3
; AVX512DQ-BW-NEXT: vpmovqb %zmm3, %xmm3
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm1, %xmm1
-; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm28, %xmm2
+; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm24, %xmm1
+; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm18, %xmm2
; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm12, %xmm3
-; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm16, %xmm4
+; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm8, %xmm3
+; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm8, %xmm4
; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX512DQ-BW-NEXT: vmovdqa (%rsp), %xmm15 # 16-byte Reload
-; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm15, %xmm4
-; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm26, %xmm13
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3]
-; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm13 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
-; AVX512DQ-BW-NEXT: vpshufb %xmm13, %xmm24, %xmm19
-; AVX512DQ-BW-NEXT: vmovdqa64 %xmm24, %xmm26
-; AVX512DQ-BW-NEXT: vmovdqa64 %xmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-BW-NEXT: vpshufb %xmm13, %xmm18, %xmm30
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm19 = xmm30[0],xmm19[0],xmm30[1],xmm19[1],xmm30[2],xmm19[2],xmm30[3],xmm19[3]
+; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm23, %xmm4
+; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm22, %xmm12
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3]
+; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm12 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
+; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm21, %xmm15
+; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm29, %xmm28
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm28[0],xmm15[0],xmm28[1],xmm15[1],xmm28[2],xmm15[2],xmm28[3],xmm15[3]
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm5
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1,2,3,4],ymm4[5],ymm15[6,7]
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm6, %xmm4
-; AVX512DQ-BW-NEXT: vmovdqa64 %xmm6, %xmm24
-; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm7, %xmm5
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm5, %xmm5
-; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm27, %xmm19
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm19[0],xmm5[0],xmm19[1],xmm5[1],xmm19[2],xmm5[2],xmm19[3],xmm5[3]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3]
-; AVX512DQ-BW-NEXT: vpsrlq $40, %zmm10, %zmm5
-; AVX512DQ-BW-NEXT: vpmovqb %zmm5, %xmm5
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
+; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm11, %xmm4
+; AVX512DQ-BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm30 # 16-byte Reload
+; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm30, %xmm15
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3]
+; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm26, %xmm15
+; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm25, %xmm28
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm28[0],xmm15[0],xmm28[1],xmm15[1],xmm28[2],xmm15[2],xmm28[3],xmm15[3]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm15[0,1,2],xmm4[3]
+; AVX512DQ-BW-NEXT: vpsrlq $48, %zmm5, %zmm15
+; AVX512DQ-BW-NEXT: vpmovqb %zmm15, %xmm15
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm15[0,1],xmm4[2,3]
; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm4, %zmm0, %zmm4
; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm4 {%k1}
-; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm14, %xmm1
-; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm11, %xmm5
-; AVX512DQ-BW-NEXT: vmovdqa64 %xmm11, %xmm17
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
-; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm21, %xmm5
-; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm22, %xmm19
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm19[0],xmm5[0],xmm19[1],xmm5[1],xmm19[2],xmm5[2],xmm19[3],xmm5[3]
+; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm20, %xmm1
+; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm6, %xmm15
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3]
+; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm16, %xmm15
+; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm31, %xmm28
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm28[0],xmm15[0],xmm28[1],xmm15[1],xmm28[2],xmm15[2],xmm28[3],xmm15[3]
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm25, %xmm5
-; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm9, %xmm3
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
-; AVX512DQ-BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm31 # 16-byte Reload
-; AVX512DQ-BW-NEXT: vpshufb %xmm13, %xmm31, %xmm5
-; AVX512DQ-BW-NEXT: vpshufb %xmm13, %xmm8, %xmm13
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3]
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm27, %xmm15
+; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm10, %xmm3
+; AVX512DQ-BW-NEXT: vmovdqa64 %xmm10, %xmm31
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
+; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm14, %xmm15
+; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm19, %xmm12
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3]
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7]
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1,2,3,4],ymm3[5],ymm12[6,7]
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm11, %xmm3
-; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm23, %xmm0
+; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm13, %xmm0
; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm6, %xmm3
-; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm29, %xmm2
+; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm7, %xmm3
+; AVX512DQ-BW-NEXT: vmovdqa64 %xmm7, %xmm28
+; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm9, %xmm2
; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
-; AVX512DQ-BW-NEXT: vpsrlq $40, %zmm20, %zmm2
+; AVX512DQ-BW-NEXT: vpsrlq $48, %zmm17, %zmm2
; AVX512DQ-BW-NEXT: vpmovqb %zmm2, %xmm2
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
-; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm0 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX512DQ-BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm28 # 16-byte Reload
-; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm28, %xmm1
-; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm8, %xmm2
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm12, %xmm4
-; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm16, %xmm5
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX512DQ-BW-NEXT: vpshufb %xmm4, %xmm15, %xmm5
-; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; AVX512DQ-BW-NEXT: vpshufb %xmm4, %xmm15, %xmm13
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3]
-; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm13 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
-; AVX512DQ-BW-NEXT: vpshufb %xmm13, %xmm26, %xmm19
-; AVX512DQ-BW-NEXT: vpshufb %xmm13, %xmm18, %xmm30
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm19 = xmm30[0],xmm19[0],xmm30[1],xmm19[1],xmm30[2],xmm19[2],xmm30[3],xmm19[3]
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQ-BW-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm3
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5],ymm3[6,7]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm24, %xmm3
-; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm7, %xmm5
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX512DQ-BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm27 # 16-byte Reload
-; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm27, %xmm5
-; AVX512DQ-BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm30 # 16-byte Reload
-; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm30, %xmm19
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm19[0],xmm5[0],xmm19[1],xmm5[1],xmm19[2],xmm5[2],xmm19[3],xmm5[3]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3]
-; AVX512DQ-BW-NEXT: vpsrlq $48, %zmm10, %zmm5
-; AVX512DQ-BW-NEXT: vpmovqb %zmm5, %xmm5
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
-; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm3
-; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3 {%k1}
-; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm14, %xmm1
-; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm17, %xmm5
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
-; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm21, %xmm5
-; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm22, %xmm19
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm19[0],xmm5[0],xmm19[1],xmm5[1],xmm19[2],xmm5[2],xmm19[3],xmm5[3]
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-BW-NEXT: vpshufb %xmm4, %xmm25, %xmm5
-; AVX512DQ-BW-NEXT: vpshufb %xmm4, %xmm9, %xmm4
-; AVX512DQ-BW-NEXT: vmovdqa64 %xmm9, %xmm18
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; AVX512DQ-BW-NEXT: vpshufb %xmm13, %xmm31, %xmm5
-; AVX512DQ-BW-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm20 # 16-byte Reload
-; AVX512DQ-BW-NEXT: vpshufb %xmm13, %xmm20, %xmm13
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3]
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm11, %xmm4
-; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm23, %xmm0
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm6, %xmm4
-; AVX512DQ-BW-NEXT: vmovdqa64 %xmm6, %xmm26
-; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm29, %xmm2
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
-; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512DQ-BW-NEXT: vpsrlq $48, %zmm19, %zmm2
-; AVX512DQ-BW-NEXT: vpmovqb %zmm2, %xmm2
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX512DQ-BW-NEXT: vpshufb %xmm1, %xmm28, %xmm2
-; AVX512DQ-BW-NEXT: vpshufb %xmm1, %xmm8, %xmm3
+; AVX512DQ-BW-NEXT: vpshufb %xmm1, %xmm24, %xmm2
+; AVX512DQ-BW-NEXT: vpshufb %xmm1, %xmm18, %xmm3
; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm12, %xmm4
-; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm16, %xmm5
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm4, %xmm4
+; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm8, %xmm10
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3]
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7]
; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX512DQ-BW-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload
-; AVX512DQ-BW-NEXT: vpshufb %xmm4, %xmm5, %xmm5
-; AVX512DQ-BW-NEXT: vpshufb %xmm4, %xmm15, %xmm8
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3]
-; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm8 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
-; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm6, %xmm9
-; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm6, %xmm13
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3]
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
-; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4],ymm5[5],ymm9[6,7]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7]
-; AVX512DQ-BW-NEXT: vpshufb %xmm1, %xmm24, %xmm5
-; AVX512DQ-BW-NEXT: vpshufb %xmm1, %xmm7, %xmm9
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3]
-; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm27, %xmm9
-; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm30, %xmm6
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3]
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3]
-; AVX512DQ-BW-NEXT: vpsrlq $56, %zmm10, %zmm6
-; AVX512DQ-BW-NEXT: vpmovqb %zmm6, %xmm6
-; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
+; AVX512DQ-BW-NEXT: vpshufb %xmm4, %xmm23, %xmm10
+; AVX512DQ-BW-NEXT: vpshufb %xmm4, %xmm22, %xmm12
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3]
+; AVX512DQ-BW-NEXT: vpbroadcastw {{.*#+}} xmm12 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
+; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm21, %xmm15
+; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm29, %xmm18
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm18[0],xmm15[0],xmm18[1],xmm15[1],xmm18[2],xmm15[2],xmm18[3],xmm15[3]
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
+; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3,4],ymm10[5],ymm15[6,7]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3,4,5],ymm2[6,7]
+; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX512DQ-BW-NEXT: vpshufb %xmm1, %xmm5, %xmm10
+; AVX512DQ-BW-NEXT: vpshufb %xmm1, %xmm30, %xmm8
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3]
+; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm26, %xmm10
+; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm25, %xmm5
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3]
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm8[3]
+; AVX512DQ-BW-NEXT: vpsrlq $56, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Folded Reload
+; AVX512DQ-BW-NEXT: vpmovqb %zmm8, %xmm8
+; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm5 = xmm8[0,1],xmm5[2,3]
; AVX512DQ-BW-NEXT: vinserti32x4 $2, %xmm5, %zmm0, %zmm5
; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm5 {%k1}
-; AVX512DQ-BW-NEXT: vpshufb %xmm1, %xmm14, %xmm2
-; AVX512DQ-BW-NEXT: vpshufb %xmm1, %xmm17, %xmm6
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
-; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm21, %xmm6
-; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm22, %xmm9
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
+; AVX512DQ-BW-NEXT: vpshufb %xmm1, %xmm20, %xmm2
+; AVX512DQ-BW-NEXT: vpshufb %xmm1, %xmm6, %xmm8
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3]
+; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm16, %xmm6
+; AVX512DQ-BW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm7, %xmm7
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,6],ymm2[7]
-; AVX512DQ-BW-NEXT: vpshufb %xmm4, %xmm25, %xmm6
-; AVX512DQ-BW-NEXT: vpshufb %xmm4, %xmm18, %xmm4
+; AVX512DQ-BW-NEXT: vpshufb %xmm4, %xmm27, %xmm6
+; AVX512DQ-BW-NEXT: vpshufb %xmm4, %xmm31, %xmm4
; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
-; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm31, %xmm6
-; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm20, %xmm8
-; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3]
+; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm14, %xmm6
+; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm19, %xmm7
+; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4],ymm4[5],ymm6[6,7]
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
; AVX512DQ-BW-NEXT: vpshufb %xmm1, %xmm11, %xmm4
-; AVX512DQ-BW-NEXT: vpshufb %xmm1, %xmm23, %xmm1
+; AVX512DQ-BW-NEXT: vpshufb %xmm1, %xmm13, %xmm1
; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm26, %xmm4
-; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm29, %xmm3
+; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm28, %xmm4
+; AVX512DQ-BW-NEXT: vpshufb %xmm3, %xmm9, %xmm3
; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX512DQ-BW-NEXT: vpsrlq $56, %zmm19, %zmm3
+; AVX512DQ-BW-NEXT: vpsrlq $56, %zmm17, %zmm3
; AVX512DQ-BW-NEXT: vpmovqb %zmm3, %xmm3
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
@@ -17099,343 +17187,343 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rax)
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, (%rax)
-; AVX512DQ-BW-NEXT: addq $744, %rsp # imm = 0x2E8
+; AVX512DQ-BW-NEXT: addq $760, %rsp # imm = 0x2F8
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: load_i8_stride8_vf64:
; AVX512DQ-BW-FCP: # %bb.0:
-; AVX512DQ-BW-FCP-NEXT: subq $264, %rsp # imm = 0x108
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm26
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-BW-FCP-NEXT: subq $360, %rsp # imm = 0x168
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm13
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm14
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm24
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm5
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm16
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm23
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 480(%rdi), %ymm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm30
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm5, %ymm30, %ymm1
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 448(%rdi), %ymm2
-; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm12
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm8, %ymm12, %ymm2
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 416(%rdi), %ymm22
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm22, %ymm0, %ymm20
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm20, %ymm2
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %ymm27
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm27, %ymm0, %ymm9
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm14, %ymm9, %ymm3
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %ymm25
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm25, %ymm0, %ymm20
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm11, %ymm20, %ymm1
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm21 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [8,10,10,11,8,10,12,14]
+; AVX512DQ-BW-FCP-NEXT: vpermd %zmm23, %zmm9, %zmm8
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm21, %ymm8, %ymm2
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm27 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %ymm29
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm29, %ymm0, %ymm17
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm27, %ymm17, %ymm2
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm28 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-BW-FCP-NEXT: vpermd %zmm16, %zmm9, %zmm18
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm28, %ymm18, %ymm3
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 352(%rdi), %ymm24
-; AVX512DQ-BW-FCP-NEXT: vpmovqd %ymm24, %xmm18
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm6, %xmm18, %xmm2
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %ymm23
-; AVX512DQ-BW-FCP-NEXT: vpmovqd %ymm23, %xmm17
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm13 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm17, %xmm3
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3]
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm26, %xmm3
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %ymm22
+; AVX512DQ-BW-FCP-NEXT: vpmovqd %ymm22, %xmm7
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm15 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm15, %xmm7, %xmm2
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [8,10,12,14,12,14,14,15]
+; AVX512DQ-BW-FCP-NEXT: vpermd %zmm5, %zmm10, %zmm6
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm3
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3]
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm24, %xmm3
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm19
; AVX512DQ-BW-FCP-NEXT: movb $-64, %al
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm19 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 224(%rdi), %ymm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm30
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm30, %ymm0, %ymm12
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm11, %ymm12, %ymm1
+; AVX512DQ-BW-FCP-NEXT: vpermd %zmm14, %zmm9, %zmm11
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm21, %ymm11, %ymm2
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %ymm26
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm26, %ymm0, %ymm5
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm27, %ymm5, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vpermd %zmm13, %zmm9, %zmm9
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm28, %ymm9, %ymm14
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm21
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm1
; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm1, %ymm0, %ymm16
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm5, %ymm16, %ymm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm2
-; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm2, %ymm0, %ymm4
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm2
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1,2,3,4,5,6],ymm1[7]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 160(%rdi), %ymm31
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm31, %ymm0, %ymm10
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm10, %ymm11
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %ymm21
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm21, %ymm0, %ymm7
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm14, %ymm7, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5],ymm0[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm8[6,7]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 96(%rdi), %ymm28
-; AVX512DQ-BW-FCP-NEXT: vpmovqd %ymm28, %xmm5
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm8
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm25
-; AVX512DQ-BW-FCP-NEXT: vpmovqd %ymm25, %xmm11
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm13, %xmm11, %xmm13
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1,2],xmm8[3]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm8
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm8, %xmm15
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3]
+; AVX512DQ-BW-FCP-NEXT: vpermd %zmm21, %zmm10, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vpmovqd %ymm1, %xmm1
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm15, %xmm1, %xmm10
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm15
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm15 = xmm10[0,1,2],xmm15[3]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm10
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm10, %xmm13
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3]
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm6, %ymm30, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm19 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm12, %ymm13
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm29 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm29, %ymm20, %ymm13
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm31 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm31, %ymm20, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm4 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm8, %ymm13
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm13[7]
; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm3, %ymm9, %ymm15
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5],ymm15[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm3, %ymm17, %ymm13
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm19 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm18, %ymm15
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7]
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm2, %xmm18, %xmm13
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm1, %xmm17, %xmm15
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3]
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $8, %zmm26, %zmm15
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm27 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm27, %xmm7, %xmm13
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm28 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm28, %xmm6, %xmm15
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1,2],xmm15[3]
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $8, %zmm24, %zmm15
; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm15, %xmm15
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3]
; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm13, %zmm0, %zmm13
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm13 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm6, %ymm16, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm4, %ymm15
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm29, %ymm10, %ymm15
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm3, %ymm7, %ymm14
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5],ymm14[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm31, %ymm12, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm11, %ymm15
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm15[7]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm3, %ymm5, %ymm15
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm9, %ymm14
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm14
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm1, %xmm11, %xmm15
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3]
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $8, %zmm8, %zmm15
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm27, %xmm1, %xmm14
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm28, %xmm2, %xmm15
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3]
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $8, %zmm10, %zmm15
; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm15, %xmm15
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm1, %ymm30, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm2, %ymm12, %ymm13
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5,6],ymm0[7]
; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm3, %ymm20, %ymm13
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm3, %ymm20, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm4 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm8, %ymm13
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm13[7]
; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm19 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm9, %ymm14
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5],ymm14[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm17, %ymm13
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm15, %ymm18, %ymm14
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5],ymm13[6,7]
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm29 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm29, %xmm18, %xmm13
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm6, %xmm17, %xmm15
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3]
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $16, %zmm26, %zmm15
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm15, %xmm15
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3]
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm28 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm28, %xmm7, %xmm13
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm31 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm31, %xmm6, %xmm14
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3]
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $16, %zmm24, %zmm14
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm14, %xmm14
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3]
; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm13, %zmm0, %zmm13
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm13 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm1, %ymm16, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm15
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm3, %ymm10, %ymm15
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm7, %ymm14
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm3, %ymm12, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm11, %ymm14
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm14[7]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm5, %ymm14
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm15, %ymm9, %ymm15
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5],ymm14[6,7]
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm29, %xmm5, %xmm14
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm6, %xmm11, %xmm15
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1,2],xmm14[3]
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $16, %zmm8, %zmm15
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm28, %xmm1, %xmm14
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm31, %xmm2, %xmm15
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3]
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $16, %zmm10, %zmm15
; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm15, %xmm15
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm13, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm1, %ymm30, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm13 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm13, %ymm12, %ymm12
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm2, %ymm20, %ymm12
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm30 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm30, %ymm9, %ymm9
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm12[5],ymm9[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm29 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm29, %xmm18, %xmm12
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm19 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm19, %xmm17, %xmm6
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm12[3]
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $24, %zmm26, %zmm12
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm12, %xmm12
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm12[0,1],xmm6[2,3]
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm28 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm28, %ymm20, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm27 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm27, %ymm8, %ymm8
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm8[7]
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm20 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm20, %ymm17, %ymm8
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm19 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm18, %ymm13
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm13[5],ymm8[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm18 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm18, %xmm7, %xmm7
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm17 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm17, %xmm6, %xmm6
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3]
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $24, %zmm24, %zmm7
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm7, %xmm7
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm6, %zmm0, %zmm6
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm6 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm1, %ymm16, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm13, %ymm4, %ymm4
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm2, %ymm10, %ymm3
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm30, %ymm7, %ymm2
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm29, %xmm5, %xmm1
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm19, %xmm11, %xmm2
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $24, %zmm8, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm28, %ymm12, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm27, %ymm11, %ymm4
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm20, %ymm5, %ymm3
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm9, %ymm4
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm18, %xmm1, %xmm1
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm17, %xmm2, %xmm2
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3]
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $24, %zmm10, %zmm2
; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm2, %xmm2
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
+; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,3,2,3,1,3,5,7]
-; AVX512DQ-BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm2
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm3
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7]
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm22, %ymm6, %ymm2
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm27, %ymm6, %ymm3
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm22 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm22, %ymm2, %ymm5
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm27 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm27, %ymm3, %ymm11
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4],ymm5[5],ymm11[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm5[0,1,2,3,4,5],ymm4[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [1,3,5,7,5,7,6,7]
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm24, %ymm12, %ymm4
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm23, %ymm12, %ymm5
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm23 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm23, %xmm4, %xmm13
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm24 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm24, %xmm5, %xmm14
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1,2],xmm13[3]
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $32, %zmm26, %zmm14
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm14, %xmm14
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3]
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm13, %zmm0, %zmm14
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm14 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm11 # 32-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm11, %ymm13
-; AVX512DQ-BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm17 # 32-byte Folded Reload
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm10, %ymm17, %ymm15
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7]
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm31, %ymm6, %ymm16
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm21, %ymm6, %ymm15
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm22, %ymm16, %ymm6
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm27, %ymm15, %ymm10
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4],ymm6[5],ymm10[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm13[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm28, %ymm12, %ymm13
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm25, %ymm12, %ymm18
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm23, %xmm13, %xmm10
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm24, %xmm18, %xmm12
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3]
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $32, %zmm8, %zmm12
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm12, %xmm12
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm14, %zmm6
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm10
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm12
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5,6],ymm10[7]
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm25, %ymm6, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [9,11,10,11,9,11,13,15]
+; AVX512DQ-BW-FCP-NEXT: vpermd %zmm23, %zmm13, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm14, %ymm0, %ymm2
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm25 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm25, %ymm1, %ymm3
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5,6],ymm3[7]
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm29, %ymm6, %ymm2
+; AVX512DQ-BW-FCP-NEXT: vpermd %zmm16, %zmm13, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm11 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm11, %ymm2, %ymm5
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm29 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm29, %ymm3, %ymm7
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm7[5],ymm5[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0,1,2,3,4,5],ymm4[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm23 = [9,11,13,15,13,15,14,15]
+; AVX512DQ-BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm23, %zmm4 # 64-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [1,3,5,7,5,7,6,7]
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm22, %ymm16, %ymm5
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm15 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm15, %xmm4, %xmm9
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm22 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm22, %xmm5, %xmm8
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3]
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $32, %zmm24, %zmm9
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm9, %xmm9
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm8
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm8 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm30, %ymm6, %ymm7
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm14, %ymm7, %ymm14
+; AVX512DQ-BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm9 # 64-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm25, %ymm9, %ymm12
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6],ymm12[7]
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm26, %ymm6, %ymm12
+; AVX512DQ-BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm13, %zmm13 # 64-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm11, %ymm12, %ymm6
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm29, %ymm13, %ymm11
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm11[5],ymm6[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm14[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpermd %zmm21, %zmm23, %zmm14
+; AVX512DQ-BW-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm16, %ymm16 # 32-byte Folded Reload
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm15, %xmm14, %xmm11
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm22, %xmm16, %xmm15
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm15[0,1,2],xmm11[3]
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $32, %zmm10, %zmm15
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm15, %xmm15
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm15[0,1],xmm11[2,3]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm22
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm15, %ymm0, %ymm6
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm21 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm21, %ymm1, %ymm8
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7]
; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm23 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm23, %ymm2, %ymm12
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm20 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm20, %ymm3, %ymm14
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5],ymm10[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm21 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm21, %xmm4, %xmm12
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm22 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm22, %xmm5, %xmm14
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3]
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $40, %zmm26, %zmm14
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm14, %xmm14
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm12, %zmm0, %zmm12
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm12 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm11, %ymm10
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm17, %ymm14
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3,4,5,6],ymm10[7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm23, %ymm16, %ymm14
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm20, %ymm15, %ymm9
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm14[5],ymm9[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm21, %xmm13, %xmm10
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm22, %xmm18, %xmm14
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm14[0,1,2],xmm10[3]
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $40, %zmm8, %zmm14
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm14, %xmm14
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm21
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm14 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm14, %ymm0, %ymm9
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm20 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm20, %ymm1, %ymm10
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5,6],ymm9[7]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm22 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm22, %ymm2, %ymm10
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm3, %ymm12
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4],ymm10[5],ymm12[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm24 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm24, %xmm4, %xmm10
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm23 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm23, %xmm5, %xmm12
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3]
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %zmm26, %zmm12
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm12, %xmm12
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3]
-; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm0, %zmm10
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm10 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm14, %ymm11, %ymm9
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm20, %ymm17, %ymm12
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5,6],ymm9[7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm22, %ymm16, %ymm12
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm15, %ymm14
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5],ymm14[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5],ymm9[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm24, %xmm13, %xmm12
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm23, %xmm18, %xmm14
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3]
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %zmm8, %zmm14
-; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm14, %xmm14
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm10, %zmm9
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm12 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm12, %ymm1, %ymm1
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm1
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm30, %ymm3, %ymm2
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm23, %ymm2, %ymm8
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm26 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm26, %ymm3, %ymm11
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm11[5],ymm8[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm29 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm29, %xmm4, %xmm8
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm25 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm25, %xmm5, %xmm11
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm11[0,1,2],xmm8[3]
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $40, %zmm24, %zmm11
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm11, %xmm11
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm11[0,1],xmm8[2,3]
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm8
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm8 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm15, %ymm7, %ymm6
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm21, %ymm9, %ymm11
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm11[7]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm23, %ymm12, %ymm11
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm26, %ymm13, %ymm15
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm15[5],ymm11[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5],ymm6[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm29, %xmm14, %xmm11
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm25, %xmm16, %xmm15
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm15[0,1,2],xmm11[3]
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $40, %zmm10, %zmm15
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm15, %xmm15
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm15[0,1],xmm11[2,3]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm21
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm15 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm15, %ymm0, %ymm6
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm23 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm23, %ymm1, %ymm8
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7]
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm26 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm26, %ymm2, %ymm8
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} ymm29 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm29, %ymm3, %ymm11
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm11[5],ymm8[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm31, %xmm4, %xmm8
+; AVX512DQ-BW-FCP-NEXT: vpbroadcastd {{.*#+}} xmm25 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm25, %xmm5, %xmm11
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm11[0,1,2],xmm8[3]
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %zmm24, %zmm11
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm11, %xmm11
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm11[0,1],xmm8[2,3]
+; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm8, %zmm0, %zmm8
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm8 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm15, %ymm7, %ymm6
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm23, %ymm9, %ymm11
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm11[7]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm26, %ymm12, %ymm11
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm29, %ymm13, %ymm15
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm15[5],ymm11[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5],ymm6[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm31, %xmm14, %xmm11
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm25, %xmm16, %xmm15
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm15[0,1,2],xmm11[3]
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %zmm10, %zmm15
+; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm15, %xmm15
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm15[0,1],xmm11[2,3]
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm6
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm28, %ymm0, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm27, %ymm1, %ymm1
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm20, %ymm2, %ymm1
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm3, %ymm2
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7]
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm29, %xmm4, %xmm1
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm19, %xmm5, %xmm2
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm17, %xmm4, %xmm1
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm18, %xmm5, %xmm2
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $56, %zmm26, %zmm2
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $56, %zmm24, %zmm2
; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm2, %xmm2
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm1 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm10, %ymm11, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm12, %ymm17, %ymm2
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm16, %ymm2
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm30, %ymm15, %ymm3
-; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm28, %ymm7, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm27, %ymm9, %ymm2
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm20, %ymm12, %ymm2
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm13, %ymm3
+; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm29, %xmm13, %xmm2
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm19, %xmm18, %xmm3
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm17, %xmm14, %xmm2
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm18, %xmm16, %xmm3
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3]
-; AVX512DQ-BW-FCP-NEXT: vpsrlq $56, %zmm8, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vpsrlq $56, %zmm10, %zmm3
; AVX512DQ-BW-FCP-NEXT: vpmovqb %zmm3, %xmm3
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
@@ -17446,16 +17534,16 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, (%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, (%rcx)
-; AVX512DQ-BW-FCP-NEXT: vmovups (%rsp), %zmm1 # 64-byte Reload
+; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, (%r8)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, (%r9)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, (%r9)
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, (%rax)
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, (%rax)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, (%rax)
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
-; AVX512DQ-BW-FCP-NEXT: addq $264, %rsp # imm = 0x108
+; AVX512DQ-BW-FCP-NEXT: addq $360, %rsp # imm = 0x168
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%wide.vec = load <512 x i8>, ptr %in.vec, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll
index 6d1ba933b9082..b64513df56946 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll
@@ -1649,301 +1649,313 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
;
; AVX512-LABEL: store_i16_stride3_vf32:
; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqa64 (%rdx), %zmm4
; AVX512-NEXT: vmovdqa (%rsi), %xmm0
; AVX512-NEXT: vmovdqa 16(%rsi), %xmm3
; AVX512-NEXT: vmovdqa 32(%rsi), %xmm2
-; AVX512-NEXT: vprold $16, %xmm2, %xmm4
+; AVX512-NEXT: vprold $16, %xmm2, %xmm5
; AVX512-NEXT: vmovdqa (%rdi), %xmm1
-; AVX512-NEXT: vmovdqa 16(%rdi), %xmm5
-; AVX512-NEXT: vmovdqa 32(%rdi), %xmm6
-; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,2,2]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1],xmm4[2],xmm7[3,4],xmm4[5],xmm7[6,7]
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
+; AVX512-NEXT: vmovdqa 16(%rdi), %xmm6
+; AVX512-NEXT: vmovdqa 32(%rdi), %xmm7
+; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[1,1,2,2]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1],xmm5[2],xmm8[3,4],xmm5[5],xmm8[6,7]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3]
; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512-NEXT: vpshufb %xmm2, %xmm6, %xmm6
-; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm6, %ymm4
-; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
-; AVX512-NEXT: vpshufb %xmm7, %xmm6, %xmm6
+; AVX512-NEXT: vpshufb %xmm2, %xmm7, %xmm7
+; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm5
+; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
+; AVX512-NEXT: vpshufb %xmm8, %xmm7, %xmm7
; AVX512-NEXT: vprold $16, %xmm3, %xmm3
-; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3,4],xmm3[5],xmm5[6,7]
-; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm3, %ymm3
-; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[0,1,2,3]
-; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [5,5,0,6,6,0,7,7,0,8,8,0,9,9,0,10]
-; AVX512-NEXT: vpermd (%rdx), %zmm4, %zmm5
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm3))
-; AVX512-NEXT: vmovdqa 32(%rdi), %ymm3
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128]
-; AVX512-NEXT: vpshufb %ymm6, %ymm3, %ymm3
-; AVX512-NEXT: vmovdqa 32(%rsi), %ymm8
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21]
-; AVX512-NEXT: vpshufb %ymm9, %ymm8, %ymm8
-; AVX512-NEXT: vpor %ymm3, %ymm8, %ymm3
-; AVX512-NEXT: vmovdqa 48(%rdi), %xmm8
-; AVX512-NEXT: vmovdqa 48(%rsi), %xmm10
-; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7]
-; AVX512-NEXT: vpshufb %xmm7, %xmm11, %xmm7
-; AVX512-NEXT: vprold $16, %xmm10, %xmm10
-; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,2,2]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm10[2],xmm8[3,4],xmm10[5],xmm8[6,7]
-; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7
-; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm7[0,1,2,3]
-; AVX512-NEXT: vmovdqa (%rdx), %ymm7
-; AVX512-NEXT: vmovdqa 32(%rdx), %ymm8
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128]
-; AVX512-NEXT: vpshufb %ymm10, %ymm8, %ymm11
-; AVX512-NEXT: vpermd %ymm8, %ymm4, %ymm4
-; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4
-; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm11, %zmm4
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm3 & mem)
-; AVX512-NEXT: vmovdqa (%rdi), %ymm3
-; AVX512-NEXT: vpshufb %ymm6, %ymm3, %ymm3
-; AVX512-NEXT: vmovdqa (%rsi), %ymm6
-; AVX512-NEXT: vpshufb %ymm9, %ymm6, %ymm6
-; AVX512-NEXT: vpor %ymm3, %ymm6, %ymm3
-; AVX512-NEXT: vprold $16, %xmm0, %xmm6
+; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,2]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1],xmm3[2],xmm6[3,4],xmm3[5],xmm6[6,7]
+; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm3, %ymm3
+; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm3[0,1,2,3],zmm5[0,1,2,3]
+; AVX512-NEXT: vmovdqa (%rdx), %ymm3
+; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm6 = [5,5,0,6,6,0,7,7]
+; AVX512-NEXT: vpermd %ymm3, %ymm6, %ymm6
+; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,8,8,0,9,9,0,10]
+; AVX512-NEXT: vpermd %zmm4, %zmm7, %zmm7
+; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm5))
+; AVX512-NEXT: vmovdqa 32(%rdi), %ymm5
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128]
+; AVX512-NEXT: vpshufb %ymm7, %ymm5, %ymm5
+; AVX512-NEXT: vmovdqa 32(%rsi), %ymm9
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21]
+; AVX512-NEXT: vpshufb %ymm10, %ymm9, %ymm9
+; AVX512-NEXT: vpor %ymm5, %ymm9, %ymm5
+; AVX512-NEXT: vmovdqa 48(%rdi), %xmm9
+; AVX512-NEXT: vmovdqa 48(%rsi), %xmm11
+; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7]
+; AVX512-NEXT: vpshufb %xmm8, %xmm12, %xmm8
+; AVX512-NEXT: vprold $16, %xmm11, %xmm11
+; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[1,1,2,2]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm11[2],xmm9[3,4],xmm11[5],xmm9[6,7]
+; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8
+; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm8[0,1,2,3]
+; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm8 = [10,0,11,11,0,12,12,0]
+; AVX512-NEXT: vpermd %zmm4, %zmm8, %zmm8
+; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm9 = [13,13,0,14,14,0,15,15]
+; AVX512-NEXT: vpermd %zmm4, %zmm9, %zmm4
+; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm8, %zmm4
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm5))
+; AVX512-NEXT: vmovdqa (%rdi), %ymm5
+; AVX512-NEXT: vpshufb %ymm7, %ymm5, %ymm5
+; AVX512-NEXT: vmovdqa (%rsi), %ymm7
+; AVX512-NEXT: vpshufb %ymm10, %ymm7, %ymm7
+; AVX512-NEXT: vpor %ymm5, %ymm7, %ymm5
+; AVX512-NEXT: vprold $16, %xmm0, %xmm7
; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[1,1,2,2]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1],xmm6[2],xmm8[3,4],xmm6[5],xmm8[6,7]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2],xmm8[3,4],xmm7[5],xmm8[6,7]
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0
-; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm3[0,1,2,3]
-; AVX512-NEXT: vpshufb %ymm10, %ymm7, %ymm1
-; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,1,1,0,2]
-; AVX512-NEXT: vpermd %ymm7, %ymm2, %ymm2
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512-NEXT: vpandn %ymm2, %ymm3, %ymm2
-; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm0 & zmm3)
+; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm0
+; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm5[0,1,2,3]
+; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,1,1,0,2]
+; AVX512-NEXT: vpermd %ymm3, %ymm1, %ymm1
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512-NEXT: vpandn %ymm1, %ymm2, %ymm1
+; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[10,11],zero,zero,zero,zero,ymm3[12,13],zero,zero,zero,zero,ymm3[14,15],zero,zero,zero,zero,ymm3[16,17],zero,zero,zero,zero,ymm3[18,19],zero,zero,zero,zero
+; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm0 & zmm2)
; AVX512-NEXT: vmovdqa64 %zmm1, (%rcx)
; AVX512-NEXT: vmovdqa64 %zmm4, 128(%rcx)
-; AVX512-NEXT: vmovdqa64 %zmm5, 64(%rcx)
+; AVX512-NEXT: vmovdqa64 %zmm6, 64(%rcx)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: store_i16_stride3_vf32:
; AVX512-FCP: # %bb.0:
+; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm4
; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm0
; AVX512-FCP-NEXT: vmovdqa 16(%rsi), %xmm3
; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm2
-; AVX512-FCP-NEXT: vprold $16, %xmm2, %xmm4
+; AVX512-FCP-NEXT: vprold $16, %xmm2, %xmm5
; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm1
-; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm5
-; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm6
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,2,2]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1],xmm4[2],xmm7[3,4],xmm4[5],xmm7[6,7]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
+; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm6
+; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm7
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[1,1,2,2]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1],xmm5[2],xmm8[3,4],xmm5[5],xmm8[6,7]
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3]
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm6, %xmm6
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm6, %ymm4
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm6
+; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm7, %xmm7
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm5
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm7
; AVX512-FCP-NEXT: vprold $16, %xmm3, %xmm3
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3,4],xmm3[5],xmm5[6,7]
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm3, %ymm3
-; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[0,1,2,3]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [5,5,0,6,6,0,7,7,0,8,8,0,9,9,0,10]
-; AVX512-FCP-NEXT: vpermd (%rdx), %zmm4, %zmm5
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm3))
-; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm3
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128]
-; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm3
-; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm8
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21]
-; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm8, %ymm8
-; AVX512-FCP-NEXT: vpor %ymm3, %ymm8, %ymm3
-; AVX512-FCP-NEXT: vmovdqa 48(%rdi), %xmm8
-; AVX512-FCP-NEXT: vmovdqa 48(%rsi), %xmm10
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7]
-; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm11, %xmm7
-; AVX512-FCP-NEXT: vprold $16, %xmm10, %xmm10
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,2,2]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm10[2],xmm8[3,4],xmm10[5],xmm8[6,7]
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7
-; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm7[0,1,2,3]
-; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm7
-; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm8
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128]
-; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm11
-; AVX512-FCP-NEXT: vpermd %ymm8, %ymm4, %ymm4
-; AVX512-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm11, %zmm4
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm3 & mem)
-; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm3
-; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm3
-; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm6
-; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm6, %ymm6
-; AVX512-FCP-NEXT: vpor %ymm3, %ymm6, %ymm3
-; AVX512-FCP-NEXT: vprold $16, %xmm0, %xmm6
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,2]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1],xmm3[2],xmm6[3,4],xmm3[5],xmm6[6,7]
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm3, %ymm3
+; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm3[0,1,2,3],zmm5[0,1,2,3]
+; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm3
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [5,5,0,6,6,0,7,7]
+; AVX512-FCP-NEXT: vpermd %ymm3, %ymm6, %ymm6
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,8,8,0,9,9,0,10]
+; AVX512-FCP-NEXT: vpermd %zmm4, %zmm7, %zmm7
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm5))
+; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm5
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128]
+; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm5, %ymm5
+; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm9
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21]
+; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm9
+; AVX512-FCP-NEXT: vpor %ymm5, %ymm9, %ymm5
+; AVX512-FCP-NEXT: vmovdqa 48(%rdi), %xmm9
+; AVX512-FCP-NEXT: vmovdqa 48(%rsi), %xmm11
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7]
+; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm12, %xmm8
+; AVX512-FCP-NEXT: vprold $16, %xmm11, %xmm11
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[1,1,2,2]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm11[2],xmm9[3,4],xmm11[5],xmm9[6,7]
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8
+; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm8[0,1,2,3]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [10,0,11,11,0,12,12,0]
+; AVX512-FCP-NEXT: vpermd %zmm4, %zmm8, %zmm8
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [13,13,0,14,14,0,15,15]
+; AVX512-FCP-NEXT: vpermd %zmm4, %zmm9, %zmm4
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm8, %zmm4
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm5))
+; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm5
+; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm5, %ymm5
+; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm7
+; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm7, %ymm7
+; AVX512-FCP-NEXT: vpor %ymm5, %ymm7, %ymm5
+; AVX512-FCP-NEXT: vprold $16, %xmm0, %xmm7
; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[1,1,2,2]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1],xmm6[2],xmm8[3,4],xmm6[5],xmm8[6,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2],xmm8[3,4],xmm7[5],xmm8[6,7]
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm3[0,1,2,3]
-; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm7, %ymm1
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,1,1,0,2]
-; AVX512-FCP-NEXT: vpermd %ymm7, %ymm2, %ymm2
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512-FCP-NEXT: vpandn %ymm2, %ymm3, %ymm2
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm0 & zmm3)
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm5[0,1,2,3]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,1,1,0,2]
+; AVX512-FCP-NEXT: vpermd %ymm3, %ymm1, %ymm1
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512-FCP-NEXT: vpandn %ymm1, %ymm2, %ymm1
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[10,11],zero,zero,zero,zero,ymm3[12,13],zero,zero,zero,zero,ymm3[14,15],zero,zero,zero,zero,ymm3[16,17],zero,zero,zero,zero,ymm3[18,19],zero,zero,zero,zero
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm0 & zmm2)
; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%rcx)
; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 128(%rcx)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 64(%rcx)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 64(%rcx)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: store_i16_stride3_vf32:
; AVX512DQ: # %bb.0:
+; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm4
; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm0
; AVX512DQ-NEXT: vmovdqa 16(%rsi), %xmm3
; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm2
-; AVX512DQ-NEXT: vprold $16, %xmm2, %xmm4
+; AVX512DQ-NEXT: vprold $16, %xmm2, %xmm5
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1
-; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm5
-; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm6
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,2,2]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1],xmm4[2],xmm7[3,4],xmm4[5],xmm7[6,7]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
+; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm6
+; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm7
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[1,1,2,2]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1],xmm5[2],xmm8[3,4],xmm5[5],xmm8[6,7]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3]
; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512DQ-NEXT: vpshufb %xmm2, %xmm6, %xmm6
-; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm6, %ymm4
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
-; AVX512DQ-NEXT: vpshufb %xmm7, %xmm6, %xmm6
+; AVX512DQ-NEXT: vpshufb %xmm2, %xmm7, %xmm7
+; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm5
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
+; AVX512DQ-NEXT: vpshufb %xmm8, %xmm7, %xmm7
; AVX512DQ-NEXT: vprold $16, %xmm3, %xmm3
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3,4],xmm3[5],xmm5[6,7]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm3, %ymm3
-; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[0,1,2,3]
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [5,5,0,6,6,0,7,7,0,8,8,0,9,9,0,10]
-; AVX512DQ-NEXT: vpermd (%rdx), %zmm4, %zmm5
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm3))
-; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm3
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128]
-; AVX512DQ-NEXT: vpshufb %ymm6, %ymm3, %ymm3
-; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm8
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21]
-; AVX512DQ-NEXT: vpshufb %ymm9, %ymm8, %ymm8
-; AVX512DQ-NEXT: vpor %ymm3, %ymm8, %ymm3
-; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm8
-; AVX512DQ-NEXT: vmovdqa 48(%rsi), %xmm10
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7]
-; AVX512DQ-NEXT: vpshufb %xmm7, %xmm11, %xmm7
-; AVX512DQ-NEXT: vprold $16, %xmm10, %xmm10
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,2,2]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm10[2],xmm8[3,4],xmm10[5],xmm8[6,7]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7
-; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm7[0,1,2,3]
-; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm7
-; AVX512DQ-NEXT: vmovdqa 32(%rdx), %ymm8
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128]
-; AVX512DQ-NEXT: vpshufb %ymm10, %ymm8, %ymm11
-; AVX512DQ-NEXT: vpermd %ymm8, %ymm4, %ymm4
-; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm11, %zmm4
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm3 & mem)
-; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm3
-; AVX512DQ-NEXT: vpshufb %ymm6, %ymm3, %ymm3
-; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm6
-; AVX512DQ-NEXT: vpshufb %ymm9, %ymm6, %ymm6
-; AVX512DQ-NEXT: vpor %ymm3, %ymm6, %ymm3
-; AVX512DQ-NEXT: vprold $16, %xmm0, %xmm6
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,2]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1],xmm3[2],xmm6[3,4],xmm3[5],xmm6[6,7]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm3, %ymm3
+; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm3[0,1,2,3],zmm5[0,1,2,3]
+; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm3
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm6 = [5,5,0,6,6,0,7,7]
+; AVX512DQ-NEXT: vpermd %ymm3, %ymm6, %ymm6
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,8,8,0,9,9,0,10]
+; AVX512DQ-NEXT: vpermd %zmm4, %zmm7, %zmm7
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm5))
+; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm5
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128]
+; AVX512DQ-NEXT: vpshufb %ymm7, %ymm5, %ymm5
+; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm9
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21]
+; AVX512DQ-NEXT: vpshufb %ymm10, %ymm9, %ymm9
+; AVX512DQ-NEXT: vpor %ymm5, %ymm9, %ymm5
+; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm9
+; AVX512DQ-NEXT: vmovdqa 48(%rsi), %xmm11
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7]
+; AVX512DQ-NEXT: vpshufb %xmm8, %xmm12, %xmm8
+; AVX512DQ-NEXT: vprold $16, %xmm11, %xmm11
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[1,1,2,2]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm11[2],xmm9[3,4],xmm11[5],xmm9[6,7]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8
+; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm8[0,1,2,3]
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm8 = [10,0,11,11,0,12,12,0]
+; AVX512DQ-NEXT: vpermd %zmm4, %zmm8, %zmm8
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm9 = [13,13,0,14,14,0,15,15]
+; AVX512DQ-NEXT: vpermd %zmm4, %zmm9, %zmm4
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm8, %zmm4
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm5))
+; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm5
+; AVX512DQ-NEXT: vpshufb %ymm7, %ymm5, %ymm5
+; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm7
+; AVX512DQ-NEXT: vpshufb %ymm10, %ymm7, %ymm7
+; AVX512DQ-NEXT: vpor %ymm5, %ymm7, %ymm5
+; AVX512DQ-NEXT: vprold $16, %xmm0, %xmm7
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[1,1,2,2]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1],xmm6[2],xmm8[3,4],xmm6[5],xmm8[6,7]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2],xmm8[3,4],xmm7[5],xmm8[6,7]
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512DQ-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0
-; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm3[0,1,2,3]
-; AVX512DQ-NEXT: vpshufb %ymm10, %ymm7, %ymm1
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,1,1,0,2]
-; AVX512DQ-NEXT: vpermd %ymm7, %ymm2, %ymm2
-; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512DQ-NEXT: vpandn %ymm2, %ymm3, %ymm2
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm0 & zmm3)
+; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm0
+; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm5[0,1,2,3]
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,1,1,0,2]
+; AVX512DQ-NEXT: vpermd %ymm3, %ymm1, %ymm1
+; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512DQ-NEXT: vpandn %ymm1, %ymm2, %ymm1
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[10,11],zero,zero,zero,zero,ymm3[12,13],zero,zero,zero,zero,ymm3[14,15],zero,zero,zero,zero,ymm3[16,17],zero,zero,zero,zero,ymm3[18,19],zero,zero,zero,zero
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm0 & zmm2)
; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rcx)
; AVX512DQ-NEXT: vmovdqa64 %zmm4, 128(%rcx)
-; AVX512DQ-NEXT: vmovdqa64 %zmm5, 64(%rcx)
+; AVX512DQ-NEXT: vmovdqa64 %zmm6, 64(%rcx)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: store_i16_stride3_vf32:
; AVX512DQ-FCP: # %bb.0:
+; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm4
; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm0
; AVX512DQ-FCP-NEXT: vmovdqa 16(%rsi), %xmm3
; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm2
-; AVX512DQ-FCP-NEXT: vprold $16, %xmm2, %xmm4
+; AVX512DQ-FCP-NEXT: vprold $16, %xmm2, %xmm5
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm1
-; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm5
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm6
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,2,2]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1],xmm4[2],xmm7[3,4],xmm4[5],xmm7[6,7]
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
+; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm6
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm7
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[1,1,2,2]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1],xmm5[2],xmm8[3,4],xmm5[5],xmm8[6,7]
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3]
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm6, %xmm6
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm6, %ymm4
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm6
+; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm7, %xmm7
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm7, %ymm5
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm7, %xmm7
; AVX512DQ-FCP-NEXT: vprold $16, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3,4],xmm3[5],xmm5[6,7]
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm3, %ymm3
-; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[0,1,2,3]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [5,5,0,6,6,0,7,7,0,8,8,0,9,9,0,10]
-; AVX512DQ-FCP-NEXT: vpermd (%rdx), %zmm4, %zmm5
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm3))
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm3
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm3
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm8
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm8, %ymm8
-; AVX512DQ-FCP-NEXT: vpor %ymm3, %ymm8, %ymm3
-; AVX512DQ-FCP-NEXT: vmovdqa 48(%rdi), %xmm8
-; AVX512DQ-FCP-NEXT: vmovdqa 48(%rsi), %xmm10
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm11, %xmm7
-; AVX512DQ-FCP-NEXT: vprold $16, %xmm10, %xmm10
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,2,2]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm10[2],xmm8[3,4],xmm10[5],xmm8[6,7]
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7
-; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm7[0,1,2,3]
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm7
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm8
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm11
-; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm4, %ymm4
-; AVX512DQ-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm11, %zmm4
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm3 & mem)
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm3
-; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm3
-; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm6
-; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm6, %ymm6
-; AVX512DQ-FCP-NEXT: vpor %ymm3, %ymm6, %ymm3
-; AVX512DQ-FCP-NEXT: vprold $16, %xmm0, %xmm6
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,2]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1],xmm3[2],xmm6[3,4],xmm3[5],xmm6[6,7]
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm3, %ymm3
+; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm3[0,1,2,3],zmm5[0,1,2,3]
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm3
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [5,5,0,6,6,0,7,7]
+; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm6, %ymm6
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,8,8,0,9,9,0,10]
+; AVX512DQ-FCP-NEXT: vpermd %zmm4, %zmm7, %zmm7
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm5))
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm5
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm5, %ymm5
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm9
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm9
+; AVX512DQ-FCP-NEXT: vpor %ymm5, %ymm9, %ymm5
+; AVX512DQ-FCP-NEXT: vmovdqa 48(%rdi), %xmm9
+; AVX512DQ-FCP-NEXT: vmovdqa 48(%rsi), %xmm11
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm12, %xmm8
+; AVX512DQ-FCP-NEXT: vprold $16, %xmm11, %xmm11
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[1,1,2,2]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm11[2],xmm9[3,4],xmm11[5],xmm9[6,7]
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8
+; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm8[0,1,2,3]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [10,0,11,11,0,12,12,0]
+; AVX512DQ-FCP-NEXT: vpermd %zmm4, %zmm8, %zmm8
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [13,13,0,14,14,0,15,15]
+; AVX512DQ-FCP-NEXT: vpermd %zmm4, %zmm9, %zmm4
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm8, %zmm4
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm5))
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm5
+; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm5, %ymm5
+; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm7
+; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm7, %ymm7
+; AVX512DQ-FCP-NEXT: vpor %ymm5, %ymm7, %ymm5
+; AVX512DQ-FCP-NEXT: vprold $16, %xmm0, %xmm7
; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[1,1,2,2]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1],xmm6[2],xmm8[3,4],xmm6[5],xmm8[6,7]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2],xmm8[3,4],xmm7[5],xmm8[6,7]
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm3[0,1,2,3]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm7, %ymm1
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,1,1,0,2]
-; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512DQ-FCP-NEXT: vpandn %ymm2, %ymm3, %ymm2
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm0 & zmm3)
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm5[0,1,2,3]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,1,1,0,2]
+; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512DQ-FCP-NEXT: vpandn %ymm1, %ymm2, %ymm1
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm3[10,11],zero,zero,zero,zero,ymm3[12,13],zero,zero,zero,zero,ymm3[14,15],zero,zero,zero,zero,ymm3[16,17],zero,zero,zero,zero,ymm3[18,19],zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm0 & zmm2)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%rcx)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 128(%rcx)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 64(%rcx)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 64(%rcx)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
@@ -3069,143 +3081,151 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
;
; AVX512-LABEL: store_i16_stride3_vf64:
; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128]
-; AVX512-NEXT: vpshufb %ymm6, %ymm0, %ymm0
-; AVX512-NEXT: vmovdqa (%rsi), %ymm1
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21]
-; AVX512-NEXT: vpshufb %ymm7, %ymm1, %ymm1
-; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm3
+; AVX512-NEXT: vmovdqa64 (%rdx), %zmm20
+; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm15
+; AVX512-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128]
+; AVX512-NEXT: vpshufb %ymm8, %ymm1, %ymm1
+; AVX512-NEXT: vmovdqa (%rsi), %ymm2
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21]
+; AVX512-NEXT: vpshufb %ymm9, %ymm2, %ymm2
+; AVX512-NEXT: vpor %ymm1, %ymm2, %ymm3
; AVX512-NEXT: vmovdqa (%rsi), %xmm5
-; AVX512-NEXT: vmovdqa64 16(%rsi), %xmm20
-; AVX512-NEXT: vmovdqa64 32(%rsi), %xmm24
-; AVX512-NEXT: vprold $16, %xmm5, %xmm8
-; AVX512-NEXT: vmovdqa (%rdi), %xmm9
-; AVX512-NEXT: vmovdqa64 16(%rdi), %xmm21
-; AVX512-NEXT: vmovdqa 32(%rdi), %xmm4
-; AVX512-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[1,1,2,2]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1],xmm8[2],xmm10[3,4],xmm8[5],xmm10[6,7]
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512-NEXT: vpshufb %xmm1, %xmm9, %xmm9
-; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8
-; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm8[0,1,2,3],zmm3[0,1,2,3]
-; AVX512-NEXT: vmovdqa (%rdx), %ymm3
-; AVX512-NEXT: vmovdqa 32(%rdx), %ymm8
-; AVX512-NEXT: vmovdqa 64(%rdx), %ymm14
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128]
-; AVX512-NEXT: vpshufb %ymm9, %ymm3, %ymm11
-; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm16 = [0,0,0,0,1,1,0,2]
-; AVX512-NEXT: vpermd %ymm3, %ymm16, %ymm3
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512-NEXT: vpandn %ymm3, %ymm15, %ymm3
-; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm3, %zmm3
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 | (zmm10 & zmm15)
+; AVX512-NEXT: vmovdqa64 16(%rsi), %xmm24
+; AVX512-NEXT: vmovdqa 32(%rsi), %xmm4
+; AVX512-NEXT: vprold $16, %xmm5, %xmm7
+; AVX512-NEXT: vmovdqa (%rdi), %xmm10
+; AVX512-NEXT: vmovdqa64 16(%rdi), %xmm25
+; AVX512-NEXT: vmovdqa 32(%rdi), %xmm6
+; AVX512-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[1,1,2,2]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm7[2],xmm11[3,4],xmm7[5],xmm11[6,7]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512-NEXT: vpshufb %xmm0, %xmm5, %xmm5
+; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm5, %ymm5
+; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm5[0,1,2,3],zmm3[0,1,2,3]
+; AVX512-NEXT: vmovdqa (%rdx), %ymm5
+; AVX512-NEXT: vmovdqa 64(%rdx), %ymm12
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128]
+; AVX512-NEXT: vpshufb %ymm1, %ymm5, %ymm3
+; AVX512-NEXT: vmovdqa64 %ymm1, %ymm27
+; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm19 = [0,0,0,0,1,1,0,2]
+; AVX512-NEXT: vpermd %ymm5, %ymm19, %ymm11
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512-NEXT: vpandnq %ymm11, %ymm17, %ymm11
+; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm11, %zmm3
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 | (zmm10 & zmm17)
; AVX512-NEXT: vmovdqa 96(%rsi), %xmm10
; AVX512-NEXT: vprold $16, %xmm10, %xmm11
-; AVX512-NEXT: vmovdqa 96(%rdi), %xmm12
-; AVX512-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[1,1,2,2]
+; AVX512-NEXT: vmovdqa 96(%rdi), %xmm14
+; AVX512-NEXT: vpshufd {{.*#+}} xmm13 = xmm14[1,1,2,2]
; AVX512-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0,1],xmm11[2],xmm13[3,4],xmm11[5],xmm13[6,7]
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3]
-; AVX512-NEXT: vpshufb %xmm1, %xmm10, %xmm10
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3]
+; AVX512-NEXT: vpshufb %xmm0, %xmm10, %xmm10
+; AVX512-NEXT: vmovdqa64 %xmm0, %xmm26
; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10
-; AVX512-NEXT: vmovdqa 80(%rdi), %xmm12
-; AVX512-NEXT: vmovdqa 80(%rsi), %xmm13
-; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
+; AVX512-NEXT: vmovdqa 80(%rdi), %xmm13
+; AVX512-NEXT: vmovdqa 80(%rsi), %xmm14
+; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
; AVX512-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
; AVX512-NEXT: vpshufb %xmm11, %xmm0, %xmm0
-; AVX512-NEXT: vprold $16, %xmm13, %xmm13
-; AVX512-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,1,2,2]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm13[2],xmm12[3,4],xmm13[5],xmm12[6,7]
-; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm12, %ymm0
+; AVX512-NEXT: vprold $16, %xmm14, %xmm14
+; AVX512-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[1,1,2,2]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm14[2],xmm13[3,4],xmm14[5],xmm13[6,7]
+; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm13, %ymm0
; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm10[0,1,2,3]
-; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm18 = [5,5,0,6,6,0,7,7,0,8,8,0,9,9,0,10]
-; AVX512-NEXT: vpermd 64(%rdx), %zmm18, %zmm10
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm22 & (zmm10 ^ zmm0))
+; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm16 = [5,5,0,6,6,0,7,7]
+; AVX512-NEXT: vpermd %ymm12, %ymm16, %ymm10
+; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm18 = [0,8,8,0,9,9,0,10]
+; AVX512-NEXT: vpermd %zmm15, %zmm18, %zmm13
+; AVX512-NEXT: vinserti64x4 $1, %ymm13, %zmm10, %zmm10
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm21 & (zmm10 ^ zmm0))
; AVX512-NEXT: vmovdqa 96(%rdi), %ymm0
-; AVX512-NEXT: vmovdqa %ymm6, %ymm2
-; AVX512-NEXT: vpshufb %ymm6, %ymm0, %ymm0
-; AVX512-NEXT: vmovdqa 96(%rsi), %ymm5
-; AVX512-NEXT: vmovdqa %ymm7, %ymm13
-; AVX512-NEXT: vpshufb %ymm7, %ymm5, %ymm5
-; AVX512-NEXT: vpor %ymm0, %ymm5, %ymm0
-; AVX512-NEXT: vmovdqa 112(%rdi), %xmm5
-; AVX512-NEXT: vmovdqa 112(%rsi), %xmm12
-; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7]
+; AVX512-NEXT: vmovdqa %ymm8, %ymm1
+; AVX512-NEXT: vpshufb %ymm8, %ymm0, %ymm0
+; AVX512-NEXT: vmovdqa 96(%rsi), %ymm13
+; AVX512-NEXT: vmovdqa %ymm9, %ymm2
+; AVX512-NEXT: vpshufb %ymm9, %ymm13, %ymm13
+; AVX512-NEXT: vpor %ymm0, %ymm13, %ymm0
+; AVX512-NEXT: vmovdqa 112(%rdi), %xmm13
+; AVX512-NEXT: vmovdqa 112(%rsi), %xmm14
+; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
; AVX512-NEXT: vpshufb %xmm11, %xmm7, %xmm7
-; AVX512-NEXT: vprold $16, %xmm12, %xmm12
-; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm12[2],xmm5[3,4],xmm12[5],xmm5[6,7]
-; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5
-; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm5[0,1,2,3]
-; AVX512-NEXT: vmovdqa 96(%rdx), %ymm5
-; AVX512-NEXT: vpermd %ymm5, %ymm18, %ymm7
-; AVX512-NEXT: vpandnq %ymm7, %ymm22, %ymm7
-; AVX512-NEXT: vpshufb %ymm9, %ymm5, %ymm5
-; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm17
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0]
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 | (zmm0 & zmm19)
+; AVX512-NEXT: vprold $16, %xmm14, %xmm14
+; AVX512-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[1,1,2,2]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm14[2],xmm13[3,4],xmm14[5],xmm13[6,7]
+; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm13, %ymm7
+; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm7[0,1,2,3]
+; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm22 = [10,0,11,11,0,12,12,0]
+; AVX512-NEXT: vpermd %zmm15, %zmm22, %zmm13
+; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm23 = [13,13,0,14,14,0,15,15]
+; AVX512-NEXT: vpermd %zmm15, %zmm23, %zmm15
+; AVX512-NEXT: vinserti64x4 $1, %ymm15, %zmm13, %zmm15
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0]
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (zmm13 & (zmm15 ^ zmm0))
; AVX512-NEXT: vmovdqa 64(%rdi), %ymm0
-; AVX512-NEXT: vpshufb %ymm6, %ymm0, %ymm0
-; AVX512-NEXT: vmovdqa 64(%rsi), %ymm7
-; AVX512-NEXT: vpshufb %ymm13, %ymm7, %ymm7
-; AVX512-NEXT: vpor %ymm0, %ymm7, %ymm0
-; AVX512-NEXT: vmovdqa 64(%rsi), %xmm7
-; AVX512-NEXT: vprold $16, %xmm7, %xmm12
-; AVX512-NEXT: vmovdqa 64(%rdi), %xmm5
-; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[1,1,2,2]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm12[2],xmm6[3,4],xmm12[5],xmm6[6,7]
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
-; AVX512-NEXT: vpshufb %xmm1, %xmm5, %xmm5
-; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
-; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm5[0,1,2,3],zmm0[0,1,2,3]
-; AVX512-NEXT: vpshufb %ymm9, %ymm14, %ymm5
-; AVX512-NEXT: vpermd %ymm14, %ymm16, %ymm6
-; AVX512-NEXT: vpandn %ymm6, %ymm15, %ymm6
-; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 | (zmm0 & zmm15)
+; AVX512-NEXT: vpshufb %ymm8, %ymm0, %ymm0
+; AVX512-NEXT: vmovdqa 64(%rsi), %ymm8
+; AVX512-NEXT: vpshufb %ymm9, %ymm8, %ymm8
+; AVX512-NEXT: vpor %ymm0, %ymm8, %ymm0
+; AVX512-NEXT: vmovdqa 64(%rsi), %xmm8
+; AVX512-NEXT: vprold $16, %xmm8, %xmm7
+; AVX512-NEXT: vmovdqa 64(%rdi), %xmm14
+; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm14[1,1,2,2]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1],xmm7[2],xmm9[3,4],xmm7[5],xmm9[6,7]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm14[0],xmm8[0],xmm14[1],xmm8[1],xmm14[2],xmm8[2],xmm14[3],xmm8[3]
+; AVX512-NEXT: vmovdqa64 %xmm26, %xmm14
+; AVX512-NEXT: vpshufb %xmm14, %xmm8, %xmm8
+; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7
+; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm7[0,1,2,3],zmm0[0,1,2,3]
+; AVX512-NEXT: vmovdqa64 %ymm27, %ymm7
+; AVX512-NEXT: vpshufb %ymm7, %ymm12, %ymm7
+; AVX512-NEXT: vpermd %ymm12, %ymm19, %ymm8
+; AVX512-NEXT: vpandnq %ymm8, %ymm17, %ymm8
+; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 | (zmm0 & zmm17)
; AVX512-NEXT: vmovdqa 32(%rdi), %ymm0
-; AVX512-NEXT: vpshufb %ymm2, %ymm0, %ymm0
-; AVX512-NEXT: vmovdqa 32(%rsi), %ymm6
-; AVX512-NEXT: vpshufb %ymm13, %ymm6, %ymm6
-; AVX512-NEXT: vpor %ymm0, %ymm6, %ymm0
-; AVX512-NEXT: vmovdqa 48(%rdi), %xmm6
-; AVX512-NEXT: vmovdqa 48(%rsi), %xmm7
-; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
+; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vmovdqa 32(%rsi), %ymm8
+; AVX512-NEXT: vpshufb %ymm2, %ymm8, %ymm8
+; AVX512-NEXT: vpor %ymm0, %ymm8, %ymm0
+; AVX512-NEXT: vmovdqa 48(%rdi), %xmm8
+; AVX512-NEXT: vmovdqa 48(%rsi), %xmm9
+; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
; AVX512-NEXT: vpshufb %xmm11, %xmm12, %xmm12
-; AVX512-NEXT: vprold $16, %xmm7, %xmm7
-; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,2]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2],xmm6[3,4],xmm7[5],xmm6[6,7]
-; AVX512-NEXT: vinserti128 $1, %xmm12, %ymm6, %ymm6
-; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm6[0,1,2,3]
-; AVX512-NEXT: vpermd %ymm8, %ymm18, %ymm6
-; AVX512-NEXT: vpandnq %ymm6, %ymm22, %ymm6
-; AVX512-NEXT: vpshufb %ymm9, %ymm8, %ymm7
-; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 | (zmm0 & zmm19)
-; AVX512-NEXT: vmovdqa64 %xmm24, %xmm2
-; AVX512-NEXT: vprold $16, %xmm24, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[1,1,2,2]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm0[2],xmm7[3,4],xmm0[5],xmm7[6,7]
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
-; AVX512-NEXT: vpshufb %xmm1, %xmm2, %xmm2
-; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
-; AVX512-NEXT: vmovdqa64 %xmm20, %xmm4
-; AVX512-NEXT: vmovdqa64 %xmm21, %xmm1
-; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
-; AVX512-NEXT: vpshufb %xmm11, %xmm2, %xmm2
-; AVX512-NEXT: vprold $16, %xmm20, %xmm4
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm21[1,1,2,2]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2],xmm1[3,4],xmm4[5],xmm1[6,7]
-; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512-NEXT: vprold $16, %xmm9, %xmm9
+; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,2,2]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2],xmm8[3,4],xmm9[5],xmm8[6,7]
+; AVX512-NEXT: vinserti128 $1, %xmm12, %ymm8, %ymm8
+; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm8[0,1,2,3]
+; AVX512-NEXT: vpermd %zmm20, %zmm22, %zmm8
+; AVX512-NEXT: vpermd %zmm20, %zmm23, %zmm9
+; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm13 & (zmm8 ^ zmm0))
+; AVX512-NEXT: vprold $16, %xmm4, %xmm0
+; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[1,1,2,2]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0,1],xmm0[2],xmm9[3,4],xmm0[5],xmm9[6,7]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
+; AVX512-NEXT: vpshufb %xmm14, %xmm4, %xmm4
+; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0
+; AVX512-NEXT: vmovdqa64 %xmm24, %xmm1
+; AVX512-NEXT: vmovdqa64 %xmm25, %xmm2
+; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; AVX512-NEXT: vpshufb %xmm11, %xmm4, %xmm4
+; AVX512-NEXT: vprold $16, %xmm24, %xmm1
+; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm25[1,1,2,2]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7]
+; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1
; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[0,1,2,3]
-; AVX512-NEXT: vpermd (%rdx), %zmm18, %zmm1
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm22 & (zmm1 ^ zmm0))
+; AVX512-NEXT: vpermd %ymm5, %ymm16, %ymm1
+; AVX512-NEXT: vpermd %zmm20, %zmm18, %zmm2
+; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm21 & (zmm1 ^ zmm0))
; AVX512-NEXT: vmovdqa64 %zmm1, 64(%rcx)
-; AVX512-NEXT: vmovdqa64 %zmm6, 128(%rcx)
-; AVX512-NEXT: vmovdqa64 %zmm5, 192(%rcx)
-; AVX512-NEXT: vmovdqa64 %zmm17, 320(%rcx)
+; AVX512-NEXT: vmovdqa64 %zmm8, 128(%rcx)
+; AVX512-NEXT: vmovdqa64 %zmm7, 192(%rcx)
+; AVX512-NEXT: vmovdqa64 %zmm15, 320(%rcx)
; AVX512-NEXT: vmovdqa64 %zmm10, 256(%rcx)
; AVX512-NEXT: vmovdqa64 %zmm3, (%rcx)
; AVX512-NEXT: vzeroupper
@@ -3213,143 +3233,151 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
;
; AVX512-FCP-LABEL: store_i16_stride3_vf64:
; AVX512-FCP: # %bb.0:
-; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128]
-; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm1
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21]
-; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm1
-; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm3
+; AVX512-FCP-NEXT: vmovdqa64 (%rdx), %zmm20
+; AVX512-FCP-NEXT: vmovdqa64 64(%rdx), %zmm15
+; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128]
+; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm1, %ymm1
+; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm2
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21]
+; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm2
+; AVX512-FCP-NEXT: vpor %ymm1, %ymm2, %ymm3
; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm5
-; AVX512-FCP-NEXT: vmovdqa64 16(%rsi), %xmm20
-; AVX512-FCP-NEXT: vmovdqa64 32(%rsi), %xmm24
-; AVX512-FCP-NEXT: vprold $16, %xmm5, %xmm8
-; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm9
-; AVX512-FCP-NEXT: vmovdqa64 16(%rdi), %xmm21
-; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm4
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[1,1,2,2]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1],xmm8[2],xmm10[3,4],xmm8[5],xmm10[6,7]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm9, %xmm9
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8
-; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm8[0,1,2,3],zmm3[0,1,2,3]
-; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm3
-; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm8
-; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %ymm14
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128]
-; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm11
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [0,0,0,0,1,1,0,2]
-; AVX512-FCP-NEXT: vpermd %ymm3, %ymm16, %ymm3
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512-FCP-NEXT: vpandn %ymm3, %ymm15, %ymm3
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm3, %zmm3
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 | (zmm10 & zmm15)
+; AVX512-FCP-NEXT: vmovdqa64 16(%rsi), %xmm24
+; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm4
+; AVX512-FCP-NEXT: vprold $16, %xmm5, %xmm7
+; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm10
+; AVX512-FCP-NEXT: vmovdqa64 16(%rdi), %xmm25
+; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm6
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[1,1,2,2]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm7[2],xmm11[3,4],xmm7[5],xmm11[6,7]
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm5
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm11, %ymm5, %ymm5
+; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm5[0,1,2,3],zmm3[0,1,2,3]
+; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm5
+; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %ymm12
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128]
+; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm5, %ymm3
+; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm27
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [0,0,0,0,1,1,0,2]
+; AVX512-FCP-NEXT: vpermd %ymm5, %ymm19, %ymm11
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512-FCP-NEXT: vpandnq %ymm11, %ymm17, %ymm11
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm11, %zmm3
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 | (zmm10 & zmm17)
; AVX512-FCP-NEXT: vmovdqa 96(%rsi), %xmm10
; AVX512-FCP-NEXT: vprold $16, %xmm10, %xmm11
-; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm12
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[1,1,2,2]
+; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm14
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm14[1,1,2,2]
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0,1],xmm11[2],xmm13[3,4],xmm11[5],xmm13[6,7]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3]
-; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm10
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3]
+; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm10, %xmm10
+; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm26
; AVX512-FCP-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10
-; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm12
-; AVX512-FCP-NEXT: vmovdqa 80(%rsi), %xmm13
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
+; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm13
+; AVX512-FCP-NEXT: vmovdqa 80(%rsi), %xmm14
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm0, %xmm0
-; AVX512-FCP-NEXT: vprold $16, %xmm13, %xmm13
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,1,2,2]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm13[2],xmm12[3,4],xmm13[5],xmm12[6,7]
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm12, %ymm0
+; AVX512-FCP-NEXT: vprold $16, %xmm14, %xmm14
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[1,1,2,2]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm14[2],xmm13[3,4],xmm14[5],xmm13[6,7]
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm13, %ymm0
; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm10[0,1,2,3]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [5,5,0,6,6,0,7,7,0,8,8,0,9,9,0,10]
-; AVX512-FCP-NEXT: vpermd 64(%rdx), %zmm18, %zmm10
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm22 & (zmm10 ^ zmm0))
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [5,5,0,6,6,0,7,7]
+; AVX512-FCP-NEXT: vpermd %ymm12, %ymm16, %ymm10
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [0,8,8,0,9,9,0,10]
+; AVX512-FCP-NEXT: vpermd %zmm15, %zmm18, %zmm13
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm10, %zmm10
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm21 & (zmm10 ^ zmm0))
; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm0
-; AVX512-FCP-NEXT: vmovdqa %ymm6, %ymm2
-; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vmovdqa 96(%rsi), %ymm5
-; AVX512-FCP-NEXT: vmovdqa %ymm7, %ymm13
-; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm5, %ymm5
-; AVX512-FCP-NEXT: vpor %ymm0, %ymm5, %ymm0
-; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm5
-; AVX512-FCP-NEXT: vmovdqa 112(%rsi), %xmm12
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7]
+; AVX512-FCP-NEXT: vmovdqa %ymm8, %ymm1
+; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vmovdqa 96(%rsi), %ymm13
+; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm2
+; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm13, %ymm13
+; AVX512-FCP-NEXT: vpor %ymm0, %ymm13, %ymm0
+; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm13
+; AVX512-FCP-NEXT: vmovdqa 112(%rsi), %xmm14
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm7, %xmm7
-; AVX512-FCP-NEXT: vprold $16, %xmm12, %xmm12
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm12[2],xmm5[3,4],xmm12[5],xmm5[6,7]
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5
-; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm5[0,1,2,3]
-; AVX512-FCP-NEXT: vmovdqa 96(%rdx), %ymm5
-; AVX512-FCP-NEXT: vpermd %ymm5, %ymm18, %ymm7
-; AVX512-FCP-NEXT: vpandnq %ymm7, %ymm22, %ymm7
-; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm5, %ymm5
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm17
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 | (zmm0 & zmm19)
+; AVX512-FCP-NEXT: vprold $16, %xmm14, %xmm14
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[1,1,2,2]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm14[2],xmm13[3,4],xmm14[5],xmm13[6,7]
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm13, %ymm7
+; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm7[0,1,2,3]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm22 = [10,0,11,11,0,12,12,0]
+; AVX512-FCP-NEXT: vpermd %zmm15, %zmm22, %zmm13
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm23 = [13,13,0,14,14,0,15,15]
+; AVX512-FCP-NEXT: vpermd %zmm15, %zmm23, %zmm15
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm13, %zmm15
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (zmm13 & (zmm15 ^ zmm0))
; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm0
-; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %ymm7
-; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm7, %ymm7
-; AVX512-FCP-NEXT: vpor %ymm0, %ymm7, %ymm0
-; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %xmm7
-; AVX512-FCP-NEXT: vprold $16, %xmm7, %xmm12
-; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm5
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[1,1,2,2]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm12[2],xmm6[3,4],xmm12[5],xmm6[6,7]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
-; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm5
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
-; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm5[0,1,2,3],zmm0[0,1,2,3]
-; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm14, %ymm5
-; AVX512-FCP-NEXT: vpermd %ymm14, %ymm16, %ymm6
-; AVX512-FCP-NEXT: vpandn %ymm6, %ymm15, %ymm6
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 | (zmm0 & zmm15)
+; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %ymm8
+; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm8, %ymm8
+; AVX512-FCP-NEXT: vpor %ymm0, %ymm8, %ymm0
+; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %xmm8
+; AVX512-FCP-NEXT: vprold $16, %xmm8, %xmm7
+; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm14
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm14[1,1,2,2]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1],xmm7[2],xmm9[3,4],xmm7[5],xmm9[6,7]
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm14[0],xmm8[0],xmm14[1],xmm8[1],xmm14[2],xmm8[2],xmm14[3],xmm8[3]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm26, %xmm14
+; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm8, %xmm8
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7
+; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm7[0,1,2,3],zmm0[0,1,2,3]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm7
+; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm12, %ymm7
+; AVX512-FCP-NEXT: vpermd %ymm12, %ymm19, %ymm8
+; AVX512-FCP-NEXT: vpandnq %ymm8, %ymm17, %ymm8
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 | (zmm0 & zmm17)
; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm0
-; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm6
-; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm6, %ymm6
-; AVX512-FCP-NEXT: vpor %ymm0, %ymm6, %ymm0
-; AVX512-FCP-NEXT: vmovdqa 48(%rdi), %xmm6
-; AVX512-FCP-NEXT: vmovdqa 48(%rsi), %xmm7
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
+; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0
+; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm8
+; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm8, %ymm8
+; AVX512-FCP-NEXT: vpor %ymm0, %ymm8, %ymm0
+; AVX512-FCP-NEXT: vmovdqa 48(%rdi), %xmm8
+; AVX512-FCP-NEXT: vmovdqa 48(%rsi), %xmm9
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm12, %xmm12
-; AVX512-FCP-NEXT: vprold $16, %xmm7, %xmm7
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,2]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2],xmm6[3,4],xmm7[5],xmm6[6,7]
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm12, %ymm6, %ymm6
-; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm6[0,1,2,3]
-; AVX512-FCP-NEXT: vpermd %ymm8, %ymm18, %ymm6
-; AVX512-FCP-NEXT: vpandnq %ymm6, %ymm22, %ymm6
-; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm8, %ymm7
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 | (zmm0 & zmm19)
-; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm2
-; AVX512-FCP-NEXT: vprold $16, %xmm24, %xmm0
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[1,1,2,2]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm0[2],xmm7[3,4],xmm0[5],xmm7[6,7]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
-; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm2
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
-; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm4
-; AVX512-FCP-NEXT: vmovdqa64 %xmm21, %xmm1
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
-; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm2
-; AVX512-FCP-NEXT: vprold $16, %xmm20, %xmm4
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm21[1,1,2,2]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2],xmm1[3,4],xmm4[5],xmm1[6,7]
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512-FCP-NEXT: vprold $16, %xmm9, %xmm9
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,2,2]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2],xmm8[3,4],xmm9[5],xmm8[6,7]
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm12, %ymm8, %ymm8
+; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm8[0,1,2,3]
+; AVX512-FCP-NEXT: vpermd %zmm20, %zmm22, %zmm8
+; AVX512-FCP-NEXT: vpermd %zmm20, %zmm23, %zmm9
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm13 & (zmm8 ^ zmm0))
+; AVX512-FCP-NEXT: vprold $16, %xmm4, %xmm0
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[1,1,2,2]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0,1],xmm0[2],xmm9[3,4],xmm0[5],xmm9[6,7]
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
+; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm4, %xmm4
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0
+; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm1
+; AVX512-FCP-NEXT: vmovdqa64 %xmm25, %xmm2
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm4, %xmm4
+; AVX512-FCP-NEXT: vprold $16, %xmm24, %xmm1
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm25[1,1,2,2]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7]
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1
; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[0,1,2,3]
-; AVX512-FCP-NEXT: vpermd (%rdx), %zmm18, %zmm1
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm22 & (zmm1 ^ zmm0))
+; AVX512-FCP-NEXT: vpermd %ymm5, %ymm16, %ymm1
+; AVX512-FCP-NEXT: vpermd %zmm20, %zmm18, %zmm2
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm21 & (zmm1 ^ zmm0))
; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 64(%rcx)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 128(%rcx)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 192(%rcx)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 320(%rcx)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 128(%rcx)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 192(%rcx)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm15, 320(%rcx)
; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 256(%rcx)
; AVX512-FCP-NEXT: vmovdqa64 %zmm3, (%rcx)
; AVX512-FCP-NEXT: vzeroupper
@@ -3357,143 +3385,151 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
;
; AVX512DQ-LABEL: store_i16_stride3_vf64:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128]
-; AVX512DQ-NEXT: vpshufb %ymm6, %ymm0, %ymm0
-; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm1
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21]
-; AVX512DQ-NEXT: vpshufb %ymm7, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm3
+; AVX512DQ-NEXT: vmovdqa64 (%rdx), %zmm20
+; AVX512DQ-NEXT: vmovdqa64 64(%rdx), %zmm15
+; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128]
+; AVX512DQ-NEXT: vpshufb %ymm8, %ymm1, %ymm1
+; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm2
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21]
+; AVX512DQ-NEXT: vpshufb %ymm9, %ymm2, %ymm2
+; AVX512DQ-NEXT: vpor %ymm1, %ymm2, %ymm3
; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm5
-; AVX512DQ-NEXT: vmovdqa64 16(%rsi), %xmm20
-; AVX512DQ-NEXT: vmovdqa64 32(%rsi), %xmm24
-; AVX512DQ-NEXT: vprold $16, %xmm5, %xmm8
-; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm9
-; AVX512DQ-NEXT: vmovdqa64 16(%rdi), %xmm21
-; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm4
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[1,1,2,2]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1],xmm8[2],xmm10[3,4],xmm8[5],xmm10[6,7]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm9, %xmm9
-; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8
-; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm8[0,1,2,3],zmm3[0,1,2,3]
-; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm3
-; AVX512DQ-NEXT: vmovdqa 32(%rdx), %ymm8
-; AVX512DQ-NEXT: vmovdqa 64(%rdx), %ymm14
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128]
-; AVX512DQ-NEXT: vpshufb %ymm9, %ymm3, %ymm11
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm16 = [0,0,0,0,1,1,0,2]
-; AVX512DQ-NEXT: vpermd %ymm3, %ymm16, %ymm3
-; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512DQ-NEXT: vpandn %ymm3, %ymm15, %ymm3
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm3, %zmm3
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 | (zmm10 & zmm15)
+; AVX512DQ-NEXT: vmovdqa64 16(%rsi), %xmm24
+; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm4
+; AVX512DQ-NEXT: vprold $16, %xmm5, %xmm7
+; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm10
+; AVX512DQ-NEXT: vmovdqa64 16(%rdi), %xmm25
+; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm6
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[1,1,2,2]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm7[2],xmm11[3,4],xmm7[5],xmm11[6,7]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512DQ-NEXT: vpshufb %xmm0, %xmm5, %xmm5
+; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm5, %ymm5
+; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm5[0,1,2,3],zmm3[0,1,2,3]
+; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm5
+; AVX512DQ-NEXT: vmovdqa 64(%rdx), %ymm12
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128]
+; AVX512DQ-NEXT: vpshufb %ymm1, %ymm5, %ymm3
+; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm27
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm19 = [0,0,0,0,1,1,0,2]
+; AVX512DQ-NEXT: vpermd %ymm5, %ymm19, %ymm11
+; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512DQ-NEXT: vpandnq %ymm11, %ymm17, %ymm11
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm11, %zmm3
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 | (zmm10 & zmm17)
; AVX512DQ-NEXT: vmovdqa 96(%rsi), %xmm10
; AVX512DQ-NEXT: vprold $16, %xmm10, %xmm11
-; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm12
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[1,1,2,2]
+; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm14
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm13 = xmm14[1,1,2,2]
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0,1],xmm11[2],xmm13[3,4],xmm11[5],xmm13[6,7]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3]
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm10, %xmm10
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3]
+; AVX512DQ-NEXT: vpshufb %xmm0, %xmm10, %xmm10
+; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm26
; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10
-; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm12
-; AVX512DQ-NEXT: vmovdqa 80(%rsi), %xmm13
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
+; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm13
+; AVX512DQ-NEXT: vmovdqa 80(%rsi), %xmm14
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
; AVX512DQ-NEXT: vpshufb %xmm11, %xmm0, %xmm0
-; AVX512DQ-NEXT: vprold $16, %xmm13, %xmm13
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,1,2,2]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm13[2],xmm12[3,4],xmm13[5],xmm12[6,7]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm12, %ymm0
+; AVX512DQ-NEXT: vprold $16, %xmm14, %xmm14
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[1,1,2,2]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm14[2],xmm13[3,4],xmm14[5],xmm13[6,7]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm13, %ymm0
; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm10[0,1,2,3]
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [5,5,0,6,6,0,7,7,0,8,8,0,9,9,0,10]
-; AVX512DQ-NEXT: vpermd 64(%rdx), %zmm18, %zmm10
-; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm22 & (zmm10 ^ zmm0))
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm16 = [5,5,0,6,6,0,7,7]
+; AVX512DQ-NEXT: vpermd %ymm12, %ymm16, %ymm10
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm18 = [0,8,8,0,9,9,0,10]
+; AVX512DQ-NEXT: vpermd %zmm15, %zmm18, %zmm13
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm13, %zmm10, %zmm10
+; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm21 & (zmm10 ^ zmm0))
; AVX512DQ-NEXT: vmovdqa 96(%rdi), %ymm0
-; AVX512DQ-NEXT: vmovdqa %ymm6, %ymm2
-; AVX512DQ-NEXT: vpshufb %ymm6, %ymm0, %ymm0
-; AVX512DQ-NEXT: vmovdqa 96(%rsi), %ymm5
-; AVX512DQ-NEXT: vmovdqa %ymm7, %ymm13
-; AVX512DQ-NEXT: vpshufb %ymm7, %ymm5, %ymm5
-; AVX512DQ-NEXT: vpor %ymm0, %ymm5, %ymm0
-; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm5
-; AVX512DQ-NEXT: vmovdqa 112(%rsi), %xmm12
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7]
+; AVX512DQ-NEXT: vmovdqa %ymm8, %ymm1
+; AVX512DQ-NEXT: vpshufb %ymm8, %ymm0, %ymm0
+; AVX512DQ-NEXT: vmovdqa 96(%rsi), %ymm13
+; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm2
+; AVX512DQ-NEXT: vpshufb %ymm9, %ymm13, %ymm13
+; AVX512DQ-NEXT: vpor %ymm0, %ymm13, %ymm0
+; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm13
+; AVX512DQ-NEXT: vmovdqa 112(%rsi), %xmm14
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
; AVX512DQ-NEXT: vpshufb %xmm11, %xmm7, %xmm7
-; AVX512DQ-NEXT: vprold $16, %xmm12, %xmm12
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm12[2],xmm5[3,4],xmm12[5],xmm5[6,7]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5
-; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm5[0,1,2,3]
-; AVX512DQ-NEXT: vmovdqa 96(%rdx), %ymm5
-; AVX512DQ-NEXT: vpermd %ymm5, %ymm18, %ymm7
-; AVX512DQ-NEXT: vpandnq %ymm7, %ymm22, %ymm7
-; AVX512DQ-NEXT: vpshufb %ymm9, %ymm5, %ymm5
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm17
-; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 | (zmm0 & zmm19)
+; AVX512DQ-NEXT: vprold $16, %xmm14, %xmm14
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[1,1,2,2]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm14[2],xmm13[3,4],xmm14[5],xmm13[6,7]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm13, %ymm7
+; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm7[0,1,2,3]
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm22 = [10,0,11,11,0,12,12,0]
+; AVX512DQ-NEXT: vpermd %zmm15, %zmm22, %zmm13
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm23 = [13,13,0,14,14,0,15,15]
+; AVX512DQ-NEXT: vpermd %zmm15, %zmm23, %zmm15
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm15, %zmm13, %zmm15
+; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (zmm13 & (zmm15 ^ zmm0))
; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm0
-; AVX512DQ-NEXT: vpshufb %ymm6, %ymm0, %ymm0
-; AVX512DQ-NEXT: vmovdqa 64(%rsi), %ymm7
-; AVX512DQ-NEXT: vpshufb %ymm13, %ymm7, %ymm7
-; AVX512DQ-NEXT: vpor %ymm0, %ymm7, %ymm0
-; AVX512DQ-NEXT: vmovdqa 64(%rsi), %xmm7
-; AVX512DQ-NEXT: vprold $16, %xmm7, %xmm12
-; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm5
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[1,1,2,2]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm12[2],xmm6[3,4],xmm12[5],xmm6[6,7]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm5, %xmm5
-; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
-; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm5[0,1,2,3],zmm0[0,1,2,3]
-; AVX512DQ-NEXT: vpshufb %ymm9, %ymm14, %ymm5
-; AVX512DQ-NEXT: vpermd %ymm14, %ymm16, %ymm6
-; AVX512DQ-NEXT: vpandn %ymm6, %ymm15, %ymm6
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 | (zmm0 & zmm15)
+; AVX512DQ-NEXT: vpshufb %ymm8, %ymm0, %ymm0
+; AVX512DQ-NEXT: vmovdqa 64(%rsi), %ymm8
+; AVX512DQ-NEXT: vpshufb %ymm9, %ymm8, %ymm8
+; AVX512DQ-NEXT: vpor %ymm0, %ymm8, %ymm0
+; AVX512DQ-NEXT: vmovdqa 64(%rsi), %xmm8
+; AVX512DQ-NEXT: vprold $16, %xmm8, %xmm7
+; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm14
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm14[1,1,2,2]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1],xmm7[2],xmm9[3,4],xmm7[5],xmm9[6,7]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm14[0],xmm8[0],xmm14[1],xmm8[1],xmm14[2],xmm8[2],xmm14[3],xmm8[3]
+; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm14
+; AVX512DQ-NEXT: vpshufb %xmm14, %xmm8, %xmm8
+; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7
+; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm7[0,1,2,3],zmm0[0,1,2,3]
+; AVX512DQ-NEXT: vmovdqa64 %ymm27, %ymm7
+; AVX512DQ-NEXT: vpshufb %ymm7, %ymm12, %ymm7
+; AVX512DQ-NEXT: vpermd %ymm12, %ymm19, %ymm8
+; AVX512DQ-NEXT: vpandnq %ymm8, %ymm17, %ymm8
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 | (zmm0 & zmm17)
; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm0
-; AVX512DQ-NEXT: vpshufb %ymm2, %ymm0, %ymm0
-; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm6
-; AVX512DQ-NEXT: vpshufb %ymm13, %ymm6, %ymm6
-; AVX512DQ-NEXT: vpor %ymm0, %ymm6, %ymm0
-; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm6
-; AVX512DQ-NEXT: vmovdqa 48(%rsi), %xmm7
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
+; AVX512DQ-NEXT: vpshufb %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm8
+; AVX512DQ-NEXT: vpshufb %ymm2, %ymm8, %ymm8
+; AVX512DQ-NEXT: vpor %ymm0, %ymm8, %ymm0
+; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm8
+; AVX512DQ-NEXT: vmovdqa 48(%rsi), %xmm9
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
; AVX512DQ-NEXT: vpshufb %xmm11, %xmm12, %xmm12
-; AVX512DQ-NEXT: vprold $16, %xmm7, %xmm7
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,2]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2],xmm6[3,4],xmm7[5],xmm6[6,7]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm12, %ymm6, %ymm6
-; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm6[0,1,2,3]
-; AVX512DQ-NEXT: vpermd %ymm8, %ymm18, %ymm6
-; AVX512DQ-NEXT: vpandnq %ymm6, %ymm22, %ymm6
-; AVX512DQ-NEXT: vpshufb %ymm9, %ymm8, %ymm7
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 | (zmm0 & zmm19)
-; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm2
-; AVX512DQ-NEXT: vprold $16, %xmm24, %xmm0
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[1,1,2,2]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm0[2],xmm7[3,4],xmm0[5],xmm7[6,7]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm2, %xmm2
-; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
-; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm4
-; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm1
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
-; AVX512DQ-NEXT: vpshufb %xmm11, %xmm2, %xmm2
-; AVX512DQ-NEXT: vprold $16, %xmm20, %xmm4
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm21[1,1,2,2]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2],xmm1[3,4],xmm4[5],xmm1[6,7]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512DQ-NEXT: vprold $16, %xmm9, %xmm9
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,2,2]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2],xmm8[3,4],xmm9[5],xmm8[6,7]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm12, %ymm8, %ymm8
+; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm8[0,1,2,3]
+; AVX512DQ-NEXT: vpermd %zmm20, %zmm22, %zmm8
+; AVX512DQ-NEXT: vpermd %zmm20, %zmm23, %zmm9
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm13 & (zmm8 ^ zmm0))
+; AVX512DQ-NEXT: vprold $16, %xmm4, %xmm0
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[1,1,2,2]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0,1],xmm0[2],xmm9[3,4],xmm0[5],xmm9[6,7]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
+; AVX512DQ-NEXT: vpshufb %xmm14, %xmm4, %xmm4
+; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0
+; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm1
+; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm2
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; AVX512DQ-NEXT: vpshufb %xmm11, %xmm4, %xmm4
+; AVX512DQ-NEXT: vprold $16, %xmm24, %xmm1
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm25[1,1,2,2]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1
; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[0,1,2,3]
-; AVX512DQ-NEXT: vpermd (%rdx), %zmm18, %zmm1
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm22 & (zmm1 ^ zmm0))
+; AVX512DQ-NEXT: vpermd %ymm5, %ymm16, %ymm1
+; AVX512DQ-NEXT: vpermd %zmm20, %zmm18, %zmm2
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm21 & (zmm1 ^ zmm0))
; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%rcx)
-; AVX512DQ-NEXT: vmovdqa64 %zmm6, 128(%rcx)
-; AVX512DQ-NEXT: vmovdqa64 %zmm5, 192(%rcx)
-; AVX512DQ-NEXT: vmovdqa64 %zmm17, 320(%rcx)
+; AVX512DQ-NEXT: vmovdqa64 %zmm8, 128(%rcx)
+; AVX512DQ-NEXT: vmovdqa64 %zmm7, 192(%rcx)
+; AVX512DQ-NEXT: vmovdqa64 %zmm15, 320(%rcx)
; AVX512DQ-NEXT: vmovdqa64 %zmm10, 256(%rcx)
; AVX512DQ-NEXT: vmovdqa64 %zmm3, (%rcx)
; AVX512DQ-NEXT: vzeroupper
@@ -3501,143 +3537,151 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
;
; AVX512DQ-FCP-LABEL: store_i16_stride3_vf64:
; AVX512DQ-FCP: # %bb.0:
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm1
-; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm3
+; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdx), %zmm20
+; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdx), %zmm15
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128,20,21,128,128]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm2
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [10,11,0,1,128,128,12,13,2,3,128,128,14,15,4,5,128,128,16,17,28,29,128,128,18,19,18,19,128,128,20,21]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT: vpor %ymm1, %ymm2, %ymm3
; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm5
-; AVX512DQ-FCP-NEXT: vmovdqa64 16(%rsi), %xmm20
-; AVX512DQ-FCP-NEXT: vmovdqa64 32(%rsi), %xmm24
-; AVX512DQ-FCP-NEXT: vprold $16, %xmm5, %xmm8
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm9
-; AVX512DQ-FCP-NEXT: vmovdqa64 16(%rdi), %xmm21
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm4
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm9[1,1,2,2]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1],xmm8[2],xmm10[3,4],xmm8[5],xmm10[6,7]
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm9, %xmm9
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8
-; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm8[0,1,2,3],zmm3[0,1,2,3]
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm3
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm8
-; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %ymm14
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm11
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [0,0,0,0,1,1,0,2]
-; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm16, %ymm3
-; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
-; AVX512DQ-FCP-NEXT: vpandn %ymm3, %ymm15, %ymm3
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm3, %zmm3
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 | (zmm10 & zmm15)
+; AVX512DQ-FCP-NEXT: vmovdqa64 16(%rsi), %xmm24
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm4
+; AVX512DQ-FCP-NEXT: vprold $16, %xmm5, %xmm7
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm10
+; AVX512DQ-FCP-NEXT: vmovdqa64 16(%rdi), %xmm25
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm6
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[1,1,2,2]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm7[2],xmm11[3,4],xmm7[5],xmm11[6,7]
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm5
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm11, %ymm5, %ymm5
+; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm5[0,1,2,3],zmm3[0,1,2,3]
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm5
+; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %ymm12
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm5, %ymm3
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm27
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [0,0,0,0,1,1,0,2]
+; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm19, %ymm11
+; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
+; AVX512DQ-FCP-NEXT: vpandnq %ymm11, %ymm17, %ymm11
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm11, %zmm3
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 | (zmm10 & zmm17)
; AVX512DQ-FCP-NEXT: vmovdqa 96(%rsi), %xmm10
; AVX512DQ-FCP-NEXT: vprold $16, %xmm10, %xmm11
-; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm12
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[1,1,2,2]
+; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm14
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm14[1,1,2,2]
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm13[0,1],xmm11[2],xmm13[3,4],xmm11[5],xmm13[6,7]
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm10
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm10, %xmm10
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm26
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10
-; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm12
-; AVX512DQ-FCP-NEXT: vmovdqa 80(%rsi), %xmm13
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
+; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm13
+; AVX512DQ-FCP-NEXT: vmovdqa 80(%rsi), %xmm14
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm0, %xmm0
-; AVX512DQ-FCP-NEXT: vprold $16, %xmm13, %xmm13
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,1,2,2]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm13[2],xmm12[3,4],xmm13[5],xmm12[6,7]
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm12, %ymm0
+; AVX512DQ-FCP-NEXT: vprold $16, %xmm14, %xmm14
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[1,1,2,2]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm14[2],xmm13[3,4],xmm14[5],xmm13[6,7]
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm13, %ymm0
; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm10[0,1,2,3]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [5,5,0,6,6,0,7,7,0,8,8,0,9,9,0,10]
-; AVX512DQ-FCP-NEXT: vpermd 64(%rdx), %zmm18, %zmm10
-; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm22 & (zmm10 ^ zmm0))
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [5,5,0,6,6,0,7,7]
+; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm16, %ymm10
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [0,8,8,0,9,9,0,10]
+; AVX512DQ-FCP-NEXT: vpermd %zmm15, %zmm18, %zmm13
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm10, %zmm10
+; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm21 & (zmm10 ^ zmm0))
; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm2
-; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa 96(%rsi), %ymm5
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, %ymm13
-; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm5, %ymm5
-; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm5, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm5
-; AVX512DQ-FCP-NEXT: vmovdqa 112(%rsi), %xmm12
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7]
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, %ymm1
+; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa 96(%rsi), %ymm13
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm2
+; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm13, %ymm13
+; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm13, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm13
+; AVX512DQ-FCP-NEXT: vmovdqa 112(%rsi), %xmm14
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm7, %xmm7
-; AVX512DQ-FCP-NEXT: vprold $16, %xmm12, %xmm12
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm12[2],xmm5[3,4],xmm12[5],xmm5[6,7]
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5
-; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm5[0,1,2,3]
-; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdx), %ymm5
-; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm18, %ymm7
-; AVX512DQ-FCP-NEXT: vpandnq %ymm7, %ymm22, %ymm7
-; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm5, %ymm5
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm17
-; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 | (zmm0 & zmm19)
+; AVX512DQ-FCP-NEXT: vprold $16, %xmm14, %xmm14
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[1,1,2,2]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm14[2],xmm13[3,4],xmm14[5],xmm13[6,7]
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm13, %ymm7
+; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm7[0,1,2,3]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm22 = [10,0,11,11,0,12,12,0]
+; AVX512DQ-FCP-NEXT: vpermd %zmm15, %zmm22, %zmm13
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm23 = [13,13,0,14,14,0,15,15]
+; AVX512DQ-FCP-NEXT: vpermd %zmm15, %zmm23, %zmm15
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm13, %zmm15
+; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (zmm13 & (zmm15 ^ zmm0))
; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %ymm7
-; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm7, %ymm7
-; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm7, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %xmm7
-; AVX512DQ-FCP-NEXT: vprold $16, %xmm7, %xmm12
-; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm5
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[1,1,2,2]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm12[2],xmm6[3,4],xmm12[5],xmm6[6,7]
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm5
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
-; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm5[0,1,2,3],zmm0[0,1,2,3]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm14, %ymm5
-; AVX512DQ-FCP-NEXT: vpermd %ymm14, %ymm16, %ymm6
-; AVX512DQ-FCP-NEXT: vpandn %ymm6, %ymm15, %ymm6
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 | (zmm0 & zmm15)
+; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %ymm8
+; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm8, %ymm8
+; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm8, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %xmm8
+; AVX512DQ-FCP-NEXT: vprold $16, %xmm8, %xmm7
+; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm14
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm14[1,1,2,2]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm9[0,1],xmm7[2],xmm9[3,4],xmm7[5],xmm9[6,7]
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm14[0],xmm8[0],xmm14[1],xmm8[1],xmm14[2],xmm8[2],xmm14[3],xmm8[3]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm26, %xmm14
+; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm8, %xmm8
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7
+; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm7[0,1,2,3],zmm0[0,1,2,3]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm7
+; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm12, %ymm7
+; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm19, %ymm8
+; AVX512DQ-FCP-NEXT: vpandnq %ymm8, %ymm17, %ymm8
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 | (zmm0 & zmm17)
; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm6
-; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm6, %ymm6
-; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm6, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa 48(%rdi), %xmm6
-; AVX512DQ-FCP-NEXT: vmovdqa 48(%rsi), %xmm7
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm8
+; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm8, %ymm8
+; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm8, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa 48(%rdi), %xmm8
+; AVX512DQ-FCP-NEXT: vmovdqa 48(%rsi), %xmm9
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm12, %xmm12
-; AVX512DQ-FCP-NEXT: vprold $16, %xmm7, %xmm7
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,2]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2],xmm6[3,4],xmm7[5],xmm6[6,7]
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm12, %ymm6, %ymm6
-; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm6[0,1,2,3]
-; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm18, %ymm6
-; AVX512DQ-FCP-NEXT: vpandnq %ymm6, %ymm22, %ymm6
-; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm8, %ymm7
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 | (zmm0 & zmm19)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm24, %xmm2
-; AVX512DQ-FCP-NEXT: vprold $16, %xmm24, %xmm0
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[1,1,2,2]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm0[2],xmm7[3,4],xmm0[5],xmm7[6,7]
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm4
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm21, %xmm1
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT: vprold $16, %xmm20, %xmm4
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm21[1,1,2,2]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2],xmm1[3,4],xmm4[5],xmm1[6,7]
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT: vprold $16, %xmm9, %xmm9
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,2,2]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2],xmm8[3,4],xmm9[5],xmm8[6,7]
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm12, %ymm8, %ymm8
+; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm8[0,1,2,3]
+; AVX512DQ-FCP-NEXT: vpermd %zmm20, %zmm22, %zmm8
+; AVX512DQ-FCP-NEXT: vpermd %zmm20, %zmm23, %zmm9
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm13 & (zmm8 ^ zmm0))
+; AVX512DQ-FCP-NEXT: vprold $16, %xmm4, %xmm0
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[1,1,2,2]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm9[0,1],xmm0[2],xmm9[3,4],xmm0[5],xmm9[6,7]
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm4, %xmm4
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm4, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm24, %xmm1
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm25, %xmm2
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm4, %xmm4
+; AVX512DQ-FCP-NEXT: vprold $16, %xmm24, %xmm1
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm25[1,1,2,2]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6,7]
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1
; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[0,1,2,3]
-; AVX512DQ-FCP-NEXT: vpermd (%rdx), %zmm18, %zmm1
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm22 & (zmm1 ^ zmm0))
+; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm16, %ymm1
+; AVX512DQ-FCP-NEXT: vpermd %zmm20, %zmm18, %zmm2
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm21 & (zmm1 ^ zmm0))
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 64(%rcx)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 128(%rcx)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 192(%rcx)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 320(%rcx)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 128(%rcx)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 192(%rcx)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, 320(%rcx)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 256(%rcx)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%rcx)
; AVX512DQ-FCP-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll
index 322d606538c54..a68684d948aac 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll
@@ -349,27 +349,27 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2: # %bb.0:
; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2
+; AVX2-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX2-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX2-NEXT: vpbroadcastq %xmm3, %ymm3
; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,8,9,0,1,8,9,u,u,2,3,10,11,2,3,10,11,u,u,4,5,12,13,4,5,12,13,u,u,6,7]
-; AVX2-NEXT: vpshufb %ymm4, %ymm2, %ymm5
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
-; AVX2-NEXT: vpshufb %ymm4, %ymm2, %ymm2
-; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5,6],ymm2[7],ymm5[8,9],ymm2[10,11],ymm5[12,13,14],ymm2[15]
+; AVX2-NEXT: vpshufb %ymm4, %ymm1, %ymm5
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
+; AVX2-NEXT: vpshufb %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3],ymm5[4,5,6],ymm1[7],ymm5[8,9],ymm1[10,11],ymm5[12,13,14],ymm1[15]
; AVX2-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535]
-; AVX2-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
-; AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,1,3,4,5,6,7]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX2-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,1,3,4,5,6,7]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6,7]
; AVX2-NEXT: vmovq %xmm0, 32(%r9)
-; AVX2-NEXT: vmovdqa %ymm2, (%r9)
+; AVX2-NEXT: vmovdqa %ymm1, (%r9)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -377,26 +377,26 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-FP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX2-FP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; AVX2-FP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2
+; AVX2-FP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX2-FP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX2-FP-NEXT: vpbroadcastq %xmm3, %ymm3
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,8,9,0,1,8,9,u,u,2,3,10,11,2,3,10,11,u,u,4,5,12,13,4,5,12,13,u,u,6,7]
-; AVX2-FP-NEXT: vpshufb %ymm4, %ymm2, %ymm5
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
-; AVX2-FP-NEXT: vpshufb %ymm4, %ymm2, %ymm2
-; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5,6],ymm2[7],ymm5[8,9],ymm2[10,11],ymm5[12,13,14],ymm2[15]
+; AVX2-FP-NEXT: vpshufb %ymm4, %ymm1, %ymm5
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
+; AVX2-FP-NEXT: vpshufb %ymm4, %ymm1, %ymm1
+; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3],ymm5[4,5,6],ymm1[7],ymm5[8,9],ymm1[10,11],ymm5[12,13,14],ymm1[15]
; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535]
-; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
-; AVX2-FP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,6,7,14,15,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,6,7,14,15,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
+; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6,7]
; AVX2-FP-NEXT: vmovq %xmm0, 32(%r9)
-; AVX2-FP-NEXT: vmovdqa %ymm2, (%r9)
+; AVX2-FP-NEXT: vmovdqa %ymm1, (%r9)
; AVX2-FP-NEXT: vzeroupper
; AVX2-FP-NEXT: retq
;
@@ -404,26 +404,26 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2
+; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX2-FCP-NEXT: vpbroadcastq %xmm3, %ymm3
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,8,9,0,1,8,9,u,u,2,3,10,11,2,3,10,11,u,u,4,5,12,13,4,5,12,13,u,u,6,7]
-; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm5
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
-; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5,6],ymm2[7],ymm5[8,9],ymm2[10,11],ymm5[12,13,14],ymm2[15]
+; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm5
+; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
+; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3],ymm5[4,5,6],ymm1[7],ymm5[8,9],ymm1[10,11],ymm5[12,13,14],ymm1[15]
; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535,65535,65535,65535,0,65535]
-; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
-; AVX2-FCP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,6,7,14,15,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,6,7,14,15,u,u,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
+; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovq %xmm0, 32(%r9)
-; AVX2-FCP-NEXT: vmovdqa %ymm2, (%r9)
+; AVX2-FCP-NEXT: vmovdqa %ymm1, (%r9)
; AVX2-FCP-NEXT: vzeroupper
; AVX2-FCP-NEXT: retq
;
@@ -432,25 +432,25 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-NEXT: movq (%r8), %rax
; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2
+; AVX512-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX512-NEXT: vpbroadcastq %rax, %ymm3
; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,8,9,0,1,8,9,u,u,2,3,10,11,2,3,10,11,u,u,4,5,12,13,4,5,12,13,u,u,6,7]
-; AVX512-NEXT: vpshufb %ymm4, %ymm2, %ymm5
-; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
-; AVX512-NEXT: vpshufb %ymm4, %ymm2, %ymm2
-; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5,6],ymm2[7],ymm5[8,9],ymm2[10,11],ymm5[12,13,14],ymm2[15]
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm3 ^ (mem & (ymm2 ^ ymm3))
-; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,1,3,4,5,6,7]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX512-NEXT: vpshufb %ymm4, %ymm1, %ymm5
+; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
+; AVX512-NEXT: vpshufb %ymm4, %ymm1, %ymm1
+; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3],ymm5[4,5,6],ymm1[7],ymm5[8,9],ymm1[10,11],ymm5[12,13,14],ymm1[15]
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm3 ^ (mem & (ymm1 ^ ymm3))
+; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,1,3,4,5,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6,7]
; AVX512-NEXT: vmovq %xmm0, 32(%r9)
-; AVX512-NEXT: vmovdqa %ymm2, (%r9)
+; AVX512-NEXT: vmovdqa %ymm1, (%r9)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
@@ -459,24 +459,24 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-FCP-NEXT: movq (%r8), %rax
; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2
+; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX512-FCP-NEXT: vpbroadcastq %rax, %ymm3
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,8,9,0,1,8,9,u,u,2,3,10,11,2,3,10,11,u,u,4,5,12,13,4,5,12,13,u,u,6,7]
-; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm5
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
-; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5,6],ymm2[7],ymm5[8,9],ymm2[10,11],ymm5[12,13,14],ymm2[15]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm3 ^ (mem & (ymm2 ^ ymm3))
-; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,6,7,14,15,u,u,8,9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm5
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
+; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3],ymm5[4,5,6],ymm1[7],ymm5[8,9],ymm1[10,11],ymm5[12,13,14],ymm1[15]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm3 ^ (mem & (ymm1 ^ ymm3))
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,6,7,14,15,u,u,8,9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6,7]
; AVX512-FCP-NEXT: vmovq %xmm0, 32(%r9)
-; AVX512-FCP-NEXT: vmovdqa %ymm2, (%r9)
+; AVX512-FCP-NEXT: vmovdqa %ymm1, (%r9)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
@@ -485,25 +485,25 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-NEXT: movq (%r8), %rax
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512DQ-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2
+; AVX512DQ-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX512DQ-NEXT: vpbroadcastq %rax, %ymm3
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,8,9,0,1,8,9,u,u,2,3,10,11,2,3,10,11,u,u,4,5,12,13,4,5,12,13,u,u,6,7]
-; AVX512DQ-NEXT: vpshufb %ymm4, %ymm2, %ymm5
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
-; AVX512DQ-NEXT: vpshufb %ymm4, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5,6],ymm2[7],ymm5[8,9],ymm2[10,11],ymm5[12,13,14],ymm2[15]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm3 ^ (mem & (ymm2 ^ ymm3))
-; AVX512DQ-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,1,3,4,5,6,7]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpshufb %ymm4, %ymm1, %ymm5
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
+; AVX512DQ-NEXT: vpshufb %ymm4, %ymm1, %ymm1
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3],ymm5[4,5,6],ymm1[7],ymm5[8,9],ymm1[10,11],ymm5[12,13,14],ymm1[15]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm3 ^ (mem & (ymm1 ^ ymm3))
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,1,3,4,5,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6,7]
; AVX512DQ-NEXT: vmovq %xmm0, 32(%r9)
-; AVX512DQ-NEXT: vmovdqa %ymm2, (%r9)
+; AVX512DQ-NEXT: vmovdqa %ymm1, (%r9)
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
@@ -512,24 +512,24 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-FCP-NEXT: movq (%r8), %rax
; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2
+; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX512DQ-FCP-NEXT: vpbroadcastq %rax, %ymm3
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,8,9,0,1,8,9,u,u,2,3,10,11,2,3,10,11,u,u,4,5,12,13,4,5,12,13,u,u,6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm5
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5,6],ymm2[7],ymm5[8,9],ymm2[10,11],ymm5[12,13,14],ymm2[15]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm2 = ymm3 ^ (mem & (ymm2 ^ ymm3))
-; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,6,7,14,15,u,u,8,9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm5
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3],ymm5[4,5,6],ymm1[7],ymm5[8,9],ymm1[10,11],ymm5[12,13,14],ymm1[15]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm3 ^ (mem & (ymm1 ^ ymm3))
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,6,7,14,15,u,u,8,9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3],xmm0[4,5,6,7]
; AVX512DQ-FCP-NEXT: vmovq %xmm0, 32(%r9)
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, (%r9)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, (%r9)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll
index 25bad7578c111..4524146afde6b 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll
@@ -404,10 +404,9 @@ define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX2-NEXT: vpbroadcastq %xmm3, %ymm3
; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,6,7,14,15,8,9,10,11,12,13,14,15]
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7]
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15]
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
@@ -438,7 +437,7 @@ define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FP-NEXT: vpbroadcastq %xmm3, %ymm3
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,14,15,4,5,6,7,u,u,u,u,u,u,u,u]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,6,7,14,15,u,u,u,u,u,u,u,u]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,6,7,14,15,8,9,10,11,12,13,14,15]
; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
@@ -471,7 +470,7 @@ define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FCP-NEXT: vpbroadcastq %xmm3, %ymm3
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7]
; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,14,15,4,5,6,7,u,u,u,u,u,u,u,u]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,6,7,14,15,u,u,u,u,u,u,u,u]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,6,7,14,15,8,9,10,11,12,13,14,15]
; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
@@ -501,10 +500,9 @@ define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; AVX512-NEXT: vpbroadcastq %xmm2, %ymm2
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2],ymm5[3,4],ymm2[5],ymm5[6,7]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,6,7,14,15,8,9,10,11,12,13,14,15]
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7]
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15]
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
@@ -538,7 +536,7 @@ define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,8,3,4,9,6,7]
; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm5, %ymm3
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,14,15,4,5,6,7,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,6,7,14,15,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,6,7,14,15,8,9,10,11,12,13,14,15]
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
@@ -569,10 +567,9 @@ define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; AVX512DQ-NEXT: vpbroadcastq %xmm2, %ymm2
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2],ymm5[3,4],ymm2[5],ymm5[6,7]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,6,7,14,15,8,9,10,11,12,13,14,15]
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7]
; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
@@ -606,7 +603,7 @@ define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,1,8,3,4,9,6,7]
; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm5, %ymm3
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,7,14,15,4,5,6,7,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,6,7,14,15,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,14,15,6,7,14,15,8,9,10,11,12,13,14,15]
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3]
@@ -819,8 +816,7 @@ define void @store_i16_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm7[0,0,1,1]
; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm6[0,1,0,1]
; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3],xmm11[4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm9[0,1,0,1]
-; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3],xmm11[4,5],xmm10[6,7]
+; AVX-NEXT: vinsertps {{.*#+}} xmm10 = xmm10[0,1],xmm9[0],xmm10[3]
; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm10, %ymm8
; AVX-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,1,1]
@@ -1948,30 +1944,29 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512-NEXT: vmovdqa (%rcx), %ymm5
; AVX512-NEXT: vmovdqa (%r8), %ymm1
; AVX512-NEXT: vmovdqa (%r9), %ymm3
+; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15]
+; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15]
+; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm8 = [5,0,14,6,0,15,7,0]
+; AVX512-NEXT: vpermi2d %ymm6, %ymm7, %ymm8
+; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15]
+; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,13,2,3,14,5,6,15]
+; AVX512-NEXT: vpermi2d %ymm6, %ymm8, %ymm7
; AVX512-NEXT: vpsrldq {{.*#+}} ymm6 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vpsrldq {{.*#+}} ymm7 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[2,1,2,3,6,5,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[2,1,2,3,6,5,6,7]
+; AVX512-NEXT: vpsrldq {{.*#+}} ymm8 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm8[0],ymm6[0],ymm8[1],ymm6[1],ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[8],ymm6[8],ymm8[9],ymm6[9],ymm8[10],ymm6[10],ymm8[11],ymm6[11]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm8 = ymm2[2,1,2,3,6,5,6,7]
; AVX512-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
-; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11]
-; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,5,12,0,4,13,0,7]
-; AVX512-NEXT: vpermi2d %ymm6, %ymm7, %ymm8
+; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[2,1,2,3,6,5,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11]
+; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,5,12,0,4,13,0,7]
+; AVX512-NEXT: vpermi2d %ymm6, %ymm8, %ymm9
; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm3[1,2,2,3,5,6,6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[1,2,2,3,5,6,6,7]
-; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm8 = ymm1[1,2,2,3,5,6,6,7]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm8[0],ymm6[0],ymm8[1],ymm6[1],ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[8],ymm6[8],ymm8[9],ymm6[9],ymm8[10],ymm6[10],ymm8[11],ymm6[11]
; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm8[1,2],ymm6[3],ymm8[4,5],ymm6[6],ymm8[7]
-; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15]
-; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15]
-; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm9 = [5,0,14,6,0,15,7,0]
-; AVX512-NEXT: vpermi2d %ymm7, %ymm8, %ymm9
-; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm7
-; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm8 = [8,21,10,11,22,13,14,23]
-; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15]
-; AVX512-NEXT: vpermi2d %zmm9, %zmm7, %zmm8
-; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm16
+; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm9[1,2],ymm6[3],ymm9[4,5],ymm6[6],ymm9[7]
+; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm16
; AVX512-NEXT: vmovdqa (%rcx), %xmm6
; AVX512-NEXT: vmovdqa (%rdx), %xmm7
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
@@ -2039,44 +2034,44 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3]
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,8,0,1,9,0,2,10]
; AVX512-FCP-NEXT: vpermi2d %ymm9, %ymm11, %ymm12
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,1,16,3,4,17,6,7]
; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm9
; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm11
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [0,1,8,3,4,9,6,7]
-; AVX512-FCP-NEXT: vpermi2d %ymm13, %ymm12, %ymm14
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
+; AVX512-FCP-NEXT: vpermi2d %zmm14, %zmm12, %zmm13
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11]
-; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm8, %xmm13
+; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm8, %xmm14
; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm10, %xmm12
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
-; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm13 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7]
+; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm14 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm15 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3]
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,1,8,0,0,9,0,3]
-; AVX512-FCP-NEXT: vpermi2d %ymm13, %ymm12, %ymm15
+; AVX512-FCP-NEXT: vpermi2d %ymm14, %ymm12, %ymm15
; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm12
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [16,9,10,17,12,13,18,15]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [16,9,10,17,12,13,18,15]
; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm15 = xmm9[1,2,2,3]
; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[1,2,2,3]
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3]
-; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm12, %zmm13
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm14, %zmm0
+; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm12, %zmm14
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm13, %zmm0
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7]
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,0,10,2,0,11,3,0]
; AVX512-FCP-NEXT: vpermi2d %ymm8, %ymm6, %ymm7
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,9,2,3,10,5,6,11]
-; AVX512-FCP-NEXT: vpermi2d %ymm6, %ymm7, %ymm8
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,17,2,3,18,5,6,19]
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7]
+; AVX512-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm6
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11]
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11]
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [4,12,0,5,13,0,6,14]
-; AVX512-FCP-NEXT: vpermi2d %ymm6, %ymm7, %ymm9
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm6
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [8,9,20,11,12,21,14,15]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm10
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm10[0],ymm1[0],ymm10[1],ymm1[1],ymm10[2],ymm1[2],ymm10[3],ymm1[3],ymm10[8],ymm1[8],ymm10[9],ymm1[9],ymm10[10],ymm1[10],ymm10[11],ymm1[11]
-; AVX512-FCP-NEXT: vpermi2d %zmm9, %zmm6, %zmm7
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm6
+; AVX512-FCP-NEXT: vpermi2d %ymm7, %ymm8, %ymm9
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm7
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [8,9,20,11,12,21,14,15]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm11
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm11[0],ymm1[0],ymm11[1],ymm1[1],ymm11[2],ymm1[2],ymm11[3],ymm1[3],ymm11[8],ymm1[8],ymm11[9],ymm1[9],ymm11[10],ymm1[10],ymm11[11],ymm1[11]
+; AVX512-FCP-NEXT: vpermi2d %zmm9, %zmm7, %zmm8
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm3, %ymm8
; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm7
@@ -2086,20 +2081,20 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11]
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,5,12,0,4,13,0,7]
; AVX512-FCP-NEXT: vpermi2d %ymm8, %ymm7, %ymm9
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[1,2,2,3,5,6,6,7]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm16[1,2,2,3,5,6,6,7]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [12,1,2,13,4,5,14,7]
-; AVX512-FCP-NEXT: vpermi2d %ymm7, %ymm9, %ymm8
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [20,1,2,21,4,5,22,7]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm1[1,2,2,3,5,6,6,7]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm16[1,2,2,3,5,6,6,7]
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm10[0],ymm8[0],ymm10[1],ymm8[1],ymm10[2],ymm8[2],ymm10[3],ymm8[3],ymm10[8],ymm8[8],ymm10[9],ymm8[9],ymm10[10],ymm8[10],ymm10[11],ymm8[11]
+; AVX512-FCP-NEXT: vpermi2d %zmm8, %zmm9, %zmm7
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15]
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15]
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [5,0,14,6,0,15,7,0]
; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm3, %ymm4
; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm2
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [8,21,10,11,22,13,14,23]
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm10[4],ymm1[4],ymm10[5],ymm1[5],ymm10[6],ymm1[6],ymm10[7],ymm1[7],ymm10[12],ymm1[12],ymm10[13],ymm1[13],ymm10[14],ymm1[14],ymm10[15],ymm1[15]
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm11[4],ymm1[4],ymm11[5],ymm1[5],ymm11[6],ymm1[6],ymm11[7],ymm1[7],ymm11[12],ymm1[12],ymm11[13],ymm1[13],ymm11[14],ymm1[14],ymm11[15],ymm1[15]
; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm8, %zmm1
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm7, %zmm1
; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 128(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 64(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
@@ -2115,30 +2110,29 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm5
; AVX512DQ-NEXT: vmovdqa (%r8), %ymm1
; AVX512DQ-NEXT: vmovdqa (%r9), %ymm3
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15]
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15]
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm8 = [5,0,14,6,0,15,7,0]
+; AVX512DQ-NEXT: vpermi2d %ymm6, %ymm7, %ymm8
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15]
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,13,2,3,14,5,6,15]
+; AVX512DQ-NEXT: vpermi2d %ymm6, %ymm8, %ymm7
; AVX512DQ-NEXT: vpsrldq {{.*#+}} ymm6 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT: vpsrldq {{.*#+}} ymm7 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[2,1,2,3,6,5,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[2,1,2,3,6,5,6,7]
+; AVX512DQ-NEXT: vpsrldq {{.*#+}} ymm8 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm8[0],ymm6[0],ymm8[1],ymm6[1],ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[8],ymm6[8],ymm8[9],ymm6[9],ymm8[10],ymm6[10],ymm8[11],ymm6[11]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm8 = ymm2[2,1,2,3,6,5,6,7]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11]
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,5,12,0,4,13,0,7]
-; AVX512DQ-NEXT: vpermi2d %ymm6, %ymm7, %ymm8
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[2,1,2,3,6,5,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11]
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,5,12,0,4,13,0,7]
+; AVX512DQ-NEXT: vpermi2d %ymm6, %ymm8, %ymm9
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm3[1,2,2,3,5,6,6,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[1,2,2,3,5,6,6,7]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm8 = ymm1[1,2,2,3,5,6,6,7]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm8[0],ymm6[0],ymm8[1],ymm6[1],ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[8],ymm6[8],ymm8[9],ymm6[9],ymm8[10],ymm6[10],ymm8[11],ymm6[11]
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm8[1,2],ymm6[3],ymm8[4,5],ymm6[6],ymm8[7]
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15]
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15]
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm9 = [5,0,14,6,0,15,7,0]
-; AVX512DQ-NEXT: vpermi2d %ymm7, %ymm8, %ymm9
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm7
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm8 = [8,21,10,11,22,13,14,23]
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm9 = ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15]
-; AVX512DQ-NEXT: vpermi2d %zmm9, %zmm7, %zmm8
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm16
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm9[1,2],ymm6[3],ymm9[4,5],ymm6[6],ymm9[7]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm16
; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm6
; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm7
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
@@ -2206,44 +2200,44 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3]
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,8,0,1,9,0,2,10]
; AVX512DQ-FCP-NEXT: vpermi2d %ymm9, %ymm11, %ymm12
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,1,16,3,4,17,6,7]
; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm9
; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm11
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [0,1,8,3,4,9,6,7]
-; AVX512DQ-FCP-NEXT: vpermi2d %ymm13, %ymm12, %ymm14
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
+; AVX512DQ-FCP-NEXT: vpermi2d %zmm14, %zmm12, %zmm13
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm8, %xmm13
+; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm8, %xmm14
; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm10, %xmm12
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
-; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm13 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7]
+; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm14 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm15 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3]
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3]
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,1,8,0,0,9,0,3]
-; AVX512DQ-FCP-NEXT: vpermi2d %ymm13, %ymm12, %ymm15
+; AVX512DQ-FCP-NEXT: vpermi2d %ymm14, %ymm12, %ymm15
; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm12
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [16,9,10,17,12,13,18,15]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [16,9,10,17,12,13,18,15]
; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm15 = xmm9[1,2,2,3]
; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[1,2,2,3]
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3]
-; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm12, %zmm13
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm14, %zmm0
+; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm12, %zmm14
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm13, %zmm0
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7]
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,0,10,2,0,11,3,0]
; AVX512DQ-FCP-NEXT: vpermi2d %ymm8, %ymm6, %ymm7
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,9,2,3,10,5,6,11]
-; AVX512DQ-FCP-NEXT: vpermi2d %ymm6, %ymm7, %ymm8
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11]
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,17,2,3,18,5,6,19]
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7]
+; AVX512DQ-FCP-NEXT: vpermi2d %zmm8, %zmm7, %zmm6
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11]
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11]
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [4,12,0,5,13,0,6,14]
-; AVX512DQ-FCP-NEXT: vpermi2d %ymm6, %ymm7, %ymm9
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm6
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [8,9,20,11,12,21,14,15]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm10
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm10[0],ymm1[0],ymm10[1],ymm1[1],ymm10[2],ymm1[2],ymm10[3],ymm1[3],ymm10[8],ymm1[8],ymm10[9],ymm1[9],ymm10[10],ymm1[10],ymm10[11],ymm1[11]
-; AVX512DQ-FCP-NEXT: vpermi2d %zmm9, %zmm6, %zmm7
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm6
+; AVX512DQ-FCP-NEXT: vpermi2d %ymm7, %ymm8, %ymm9
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm7
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [8,9,20,11,12,21,14,15]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm11
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm11[0],ymm1[0],ymm11[1],ymm1[1],ymm11[2],ymm1[2],ymm11[3],ymm1[3],ymm11[8],ymm1[8],ymm11[9],ymm1[9],ymm11[10],ymm1[10],ymm11[11],ymm1[11]
+; AVX512DQ-FCP-NEXT: vpermi2d %zmm9, %zmm7, %zmm8
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm3, %ymm8
; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm7
@@ -2253,20 +2247,20 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11]
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,5,12,0,4,13,0,7]
; AVX512DQ-FCP-NEXT: vpermi2d %ymm8, %ymm7, %ymm9
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[1,2,2,3,5,6,6,7]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm16[1,2,2,3,5,6,6,7]
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [12,1,2,13,4,5,14,7]
-; AVX512DQ-FCP-NEXT: vpermi2d %ymm7, %ymm9, %ymm8
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [20,1,2,21,4,5,22,7]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm1[1,2,2,3,5,6,6,7]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm16[1,2,2,3,5,6,6,7]
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm10[0],ymm8[0],ymm10[1],ymm8[1],ymm10[2],ymm8[2],ymm10[3],ymm8[3],ymm10[8],ymm8[8],ymm10[9],ymm8[9],ymm10[10],ymm8[10],ymm10[11],ymm8[11]
+; AVX512DQ-FCP-NEXT: vpermi2d %zmm8, %zmm9, %zmm7
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15]
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15]
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [5,0,14,6,0,15,7,0]
; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm3, %ymm4
; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm2
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [8,21,10,11,22,13,14,23]
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm10[4],ymm1[4],ymm10[5],ymm1[5],ymm10[6],ymm1[6],ymm10[7],ymm1[7],ymm10[12],ymm1[12],ymm10[13],ymm1[13],ymm10[14],ymm1[14],ymm10[15],ymm1[15]
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm11[4],ymm1[4],ymm11[5],ymm1[5],ymm11[6],ymm1[6],ymm11[7],ymm1[7],ymm11[12],ymm1[12],ymm11[13],ymm1[13],ymm11[14],ymm1[14],ymm11[15],ymm1[15]
; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm3
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm8, %zmm1
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm7, %zmm1
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 128(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 64(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
@@ -4138,213 +4132,215 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
;
; AVX512-FCP-LABEL: store_i16_stride6_vf32:
; AVX512-FCP: # %bb.0:
-; AVX512-FCP-NEXT: subq $40, %rsp
+; AVX512-FCP-NEXT: subq $72, %rsp
; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %ymm1
; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm3
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15]
+; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm2
+; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15]
; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm2 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm22
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [4,5,4,5,4,5,4,5,21,22,21,22,21,22,23,23]
-; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm4, %zmm1
-; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm5
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm2
-; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm6
-; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm6, %ymm3
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm23
+; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [4,5,4,5,4,5,4,5,21,22,21,22,21,22,23,23]
+; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm2
+; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm4
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm0
+; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm5
+; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm5, %ymm3
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11]
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[12],ymm4[12],ymm5[13],ymm4[13],ymm5[14],ymm4[14],ymm5[15],ymm4[15]
; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm25
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [2,1,2,3,11,11,11,11]
-; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm2
+; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm26
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [2,1,2,3,11,11,11,11]
+; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm19, %zmm0
; AVX512-FCP-NEXT: movw $18724, %ax # imm = 0x4924
; AVX512-FCP-NEXT: kmovw %eax, %k1
-; AVX512-FCP-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1}
-; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm9
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm9[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [12,1,2,13,4,5,14,7]
-; AVX512-FCP-NEXT: vmovdqa %ymm2, %ymm3
-; AVX512-FCP-NEXT: vpermt2d %ymm1, %ymm6, %ymm3
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [8,21,10,11,20,13,14,23]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
-; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm9, %ymm8
-; AVX512-FCP-NEXT: vmovdqa64 %ymm9, %ymm24
-; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm7, %zmm2
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1}
+; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm5
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm5[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm21 = [12,1,2,13,4,5,14,7]
+; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm3
+; AVX512-FCP-NEXT: vpermt2d %ymm2, %ymm21, %ymm3
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [8,21,10,11,20,13,14,23]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
+; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm5, %ymm4
+; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm27
+; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm20, %zmm0
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0
+; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa 32(%r9), %ymm9
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31]
-; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm9, %ymm8
-; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25]
-; AVX512-FCP-NEXT: # ymm3 = mem[0,1,0,1]
-; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm9, %ymm10
-; AVX512-FCP-NEXT: vmovdqa64 %ymm9, %ymm27
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [2,2,2,3,10,9,10,11]
-; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm9, %zmm10
-; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm11
-; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm12
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm12[4],ymm11[4],ymm12[5],ymm11[5],ymm12[6],ymm11[6],ymm12[7],ymm11[7],ymm12[12],ymm11[12],ymm12[13],ymm11[13],ymm12[14],ymm11[14],ymm12[15],ymm11[15]
-; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm10 = ymm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm11[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT: vmovdqa64 %ymm11, %ymm28
-; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm11 = ymm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm12[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT: vmovdqa64 %ymm12, %ymm29
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11]
-; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm4, %zmm10
-; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm15
-; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm15, %ymm4
-; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm14
-; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm14, %ymm0
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[8],ymm4[8],ymm0[9],ymm4[9],ymm0[10],ymm4[10],ymm0[11],ymm4[11]
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm14[4],ymm15[4],ymm14[5],ymm15[5],ymm14[6],ymm15[6],ymm14[7],ymm15[7],ymm14[12],ymm15[12],ymm14[13],ymm15[13],ymm14[14],ymm15[14],ymm14[15],ymm15[15]
-; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm5, %zmm0
-; AVX512-FCP-NEXT: vmovdqa32 %zmm10, %zmm0 {%k1}
-; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm5
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm5[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpermi2d %ymm4, %ymm0, %ymm6
-; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm5, %ymm1
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31]
+; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm9, %ymm3
+; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25]
+; AVX512-FCP-NEXT: # ymm0 = mem[0,1,0,1]
+; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm9, %ymm4
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [2,2,2,3,10,9,10,11]
+; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm10, %zmm4
+; AVX512-FCP-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm5
+; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm6
+; AVX512-FCP-NEXT: vmovdqa 32(%r9), %xmm15
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm9[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm15, %xmm4
+; AVX512-FCP-NEXT: vmovdqa64 %xmm8, %xmm28
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,1,0,1,10,10,10,10]
+; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm22
+; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm3 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm4 = ymm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm6[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11]
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm29
; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm30
-; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm7, %zmm0
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm21
-; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm1
-; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm0
-; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm11
-; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm16
-; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm11
-; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %xmm2
-; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm4
-; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm1 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm31
-; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm18
-; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm0, %zmm3
-; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm13
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11]
-; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm13, %xmm1
-; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm12
-; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm12, %xmm4
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
+; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm18, %zmm3
+; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm5
+; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm5, %ymm4
+; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm6
+; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm6, %ymm1
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[8],ymm4[8],ymm1[9],ymm4[9],ymm1[10],ymm4[10],ymm1[11],ymm4[11]
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm16
+; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm17
+; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm19, %zmm1
+; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm1 {%k1}
+; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm4
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpermi2d %ymm3, %ymm1, %ymm21
+; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm2
+; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm31
+; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm20, %zmm1
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm21, %zmm24
+; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm2
+; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm1
+; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm14
+; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm23
+; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm14
+; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %xmm13
+; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm12
+; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm1 = xmm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17]
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
+; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm2
+; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm4
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11]
+; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm0
+; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm3
+; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm6
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,2,1,8,9,8,9]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
-; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm20, %zmm5
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm20, %zmm6
; AVX512-FCP-NEXT: movw $9362, %ax # imm = 0x2492
; AVX512-FCP-NEXT: kmovw %eax, %k2
-; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm5 {%k2}
-; AVX512-FCP-NEXT: vmovdqa %ymm5, %ymm1
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [16,9,10,17,12,13,18,15]
-; AVX512-FCP-NEXT: vmovdqa 32(%r8), %xmm10
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[2,1,3,3,4,5,6,7]
-; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm5
-; AVX512-FCP-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero
+; AVX512-FCP-NEXT: vmovdqa32 %zmm2, %zmm6 {%k2}
+; AVX512-FCP-NEXT: vmovdqa %ymm6, %ymm0
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm21 = [16,9,10,17,12,13,18,15]
+; AVX512-FCP-NEXT: vmovdqa 32(%r8), %xmm2
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[2,1,3,3,4,5,6,7]
+; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm21, %zmm6
+; AVX512-FCP-NEXT: vpmovzxwd {{.*#+}} xmm7 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [0,1,8,3,4,9,6,7]
-; AVX512-FCP-NEXT: vpermt2d %ymm3, %ymm19, %ymm1
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm17
-; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm8
-; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm7
-; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm1 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm3 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm0, %zmm3
+; AVX512-FCP-NEXT: vpermt2d %ymm7, %ymm19, %ymm0
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm18
+; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm7
+; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm6
+; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm11 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3]
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
+; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm11
; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm1
; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm5
-; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm2
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm20, %zmm4
-; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm4 {%k2}
-; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm3
-; AVX512-FCP-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
-; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm4, %ymm19
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,1,3,3,4,5,6,7]
-; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm4
+; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm8
+; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm5
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm20, %zmm9
+; AVX512-FCP-NEXT: vmovdqa32 %zmm11, %zmm9 {%k2}
+; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm8
+; AVX512-FCP-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero
+; AVX512-FCP-NEXT: vpermi2d %ymm5, %ymm9, %ymm19
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm8[2,1,3,3,4,5,6,7]
+; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm21, %zmm9
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9]
+; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm15, %xmm5
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm15[0,0,2,1,4,5,6,7]
; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,8,8,10,9]
-; AVX512-FCP-NEXT: vmovdqa 32(%r9), %xmm9
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9]
-; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm9, %xmm2
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[0,0,2,1,4,5,6,7]
-; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm20, %zmm5
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm19, %zmm19
-; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm2
-; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm6
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,0,2,1,4,5,6,7]
-; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm20, %zmm4
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[8],ymm15[8],ymm14[9],ymm15[9],ymm14[10],ymm15[10],ymm14[11],ymm15[11]
+; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm20, %zmm11
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm19, %zmm19
+; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm5
+; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm10
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm9 = xmm5[0,0,2,1,4,5,6,7]
+; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm20, %zmm9
+; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm10
+; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm15
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm10[0],ymm15[0],ymm10[1],ymm15[1],ymm10[2],ymm15[2],ymm10[3],ymm15[3],ymm10[8],ymm15[8],ymm10[9],ymm15[9],ymm10[10],ymm15[10],ymm10[11],ymm15[11]
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [1,1,1,1,10,10,10,11]
+; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm4, %zmm3
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX512-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: # ymm12 = ymm12[0],mem[0],ymm12[1],mem[1],ymm12[2],mem[2],ymm12[3],mem[3],ymm12[8],mem[8],ymm12[9],mem[9],ymm12[10],mem[10],ymm12[11],mem[11]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6]
+; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm13, %zmm12
+; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm12 {%k1}
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
+; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm21 = [0,9,2,3,8,5,6,11]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm12, %ymm20
+; AVX512-FCP-NEXT: vpermt2d %ymm2, %ymm21, %ymm20
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [8,9,20,11,12,21,14,15]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm10
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm2, %zmm12
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm20, %zmm10
+; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm12
+; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm15
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm12[0],ymm15[0],ymm12[1],ymm15[1],ymm12[2],ymm15[2],ymm12[3],ymm15[3],ymm12[8],ymm15[8],ymm12[9],ymm15[9],ymm12[10],ymm15[10],ymm12[11],ymm15[11]
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [1,1,1,1,10,10,10,11]
-; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm26, %zmm0
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm6
-; AVX512-FCP-NEXT: vmovdqa64 %ymm29, %ymm7
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6]
-; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm7, %zmm6
-; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm6 {%k1}
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
-; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm1
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,9,2,3,8,5,6,11]
-; AVX512-FCP-NEXT: vmovdqa %ymm6, %ymm8
-; AVX512-FCP-NEXT: vpermt2d %ymm1, %ymm3, %ymm8
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [8,9,20,11,12,21,14,15]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm1
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm15, %zmm6
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm8, %zmm6
-; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm1
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm2
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,1,0,1,10,10,10,10]
-; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm20, %zmm2
-; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm8
-; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm14
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm8[0],ymm14[0],ymm8[1],ymm14[1],ymm8[2],ymm14[2],ymm8[3],ymm14[3],ymm8[8],ymm14[8],ymm8[9],ymm14[9],ymm8[10],ymm14[10],ymm8[11],ymm14[11]
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
-; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm26, %zmm12
-; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm8
-; AVX512-FCP-NEXT: vmovdqa64 %xmm18, %xmm13
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm13[4],xmm8[5],xmm13[5],xmm8[6],xmm13[6],xmm8[7],xmm13[7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm14
-; AVX512-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm13 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm13 = ymm14[0],mem[0],ymm14[1],mem[1],ymm14[2],mem[2],ymm14[3],mem[3],ymm14[8],mem[8],ymm14[9],mem[9],ymm14[10],mem[10],ymm14[11],mem[11]
-; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm7, %zmm13
-; AVX512-FCP-NEXT: vmovdqa32 %zmm12, %zmm13 {%k1}
-; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm10, %xmm0
-; AVX512-FCP-NEXT: vpermi2d %ymm0, %ymm13, %ymm3
-; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm0
+; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm4, %zmm0
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm29, %ymm4
+; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm6
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm6[0],ymm4[1],ymm6[1],ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[8],ymm6[8],ymm4[9],ymm6[9],ymm4[10],ymm6[10],ymm4[11],ymm6[11]
+; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm13, %zmm4
+; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1}
+; AVX512-FCP-NEXT: vmovdqa64 %ymm31, %ymm0
; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm15, %zmm13
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm3, %zmm0
-; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm9, %xmm1
-; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm3
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm20, %zmm1
+; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm4, %zmm2
+; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm0
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
+; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm1
+; AVX512-FCP-NEXT: vpermt2d %ymm1, %ymm21, %ymm4
+; AVX512-FCP-NEXT: vmovdqa64 %xmm28, %xmm1
+; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm1
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm3 & (zmm1 ^ zmm0))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm3 & (zmm22 ^ zmm10))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm1 ^ (ymm3 & (ymm4 ^ ymm1))
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (zmm1 & (zmm9 ^ zmm19))
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 256(%rax)
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm3 & (zmm2 ^ zmm6))
-; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax)
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm0 & (zmm4 ^ zmm19))
-; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rax)
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm0 & (zmm5 ^ zmm17))
-; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 192(%rax)
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm0 & (zmm11 ^ zmm21))
-; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 128(%rax)
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload
-; AVX512-FCP-NEXT: # zmm1 = zmm1 ^ (zmm0 & (zmm1 ^ mem))
-; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 320(%rax)
-; AVX512-FCP-NEXT: addq $40, %rsp
+; AVX512-FCP-NEXT: vmovdqa64 %zmm9, (%rax)
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm1 & (zmm11 ^ zmm18))
+; AVX512-FCP-NEXT: vmovdqa64 %zmm11, 192(%rax)
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm1 & (zmm14 ^ zmm24))
+; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 128(%rax)
+; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload
+; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm4 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: # zmm4 = zmm4 ^ (zmm1 & (zmm4 ^ mem))
+; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 320(%rax)
+; AVX512-FCP-NEXT: vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm2))
+; AVX512-FCP-NEXT: vmovdqa %ymm3, 64(%rax)
+; AVX512-FCP-NEXT: vmovdqa %ymm0, 96(%rax)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm22, 256(%rax)
+; AVX512-FCP-NEXT: addq $72, %rsp
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
@@ -4574,213 +4570,215 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
;
; AVX512DQ-FCP-LABEL: store_i16_stride6_vf32:
; AVX512DQ-FCP: # %bb.0:
-; AVX512DQ-FCP-NEXT: subq $40, %rsp
+; AVX512DQ-FCP-NEXT: subq $72, %rsp
; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %ymm1
; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm3
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm3[4],ymm1[4],ymm3[5],ymm1[5],ymm3[6],ymm1[6],ymm3[7],ymm1[7],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15]
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm2
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15]
; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm2 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm22
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [4,5,4,5,4,5,4,5,21,22,21,22,21,22,23,23]
-; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm4, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm5
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm6
-; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm6, %ymm3
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm23
+; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [4,5,4,5,4,5,4,5,21,22,21,22,21,22,23,23]
+; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm2
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm4
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm5
+; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm5, %ymm3
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11]
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[12],ymm4[12],ymm5[13],ymm4[13],ymm5[14],ymm4[14],ymm5[15],ymm4[15]
; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm25
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [2,1,2,3,11,11,11,11]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm5, %zmm2
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm26
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [2,1,2,3,11,11,11,11]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm19, %zmm0
; AVX512DQ-FCP-NEXT: movw $18724, %ax # imm = 0x4924
; AVX512DQ-FCP-NEXT: kmovw %eax, %k1
-; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm9
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm9[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [12,1,2,13,4,5,14,7]
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, %ymm3
-; AVX512DQ-FCP-NEXT: vpermt2d %ymm1, %ymm6, %ymm3
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [8,21,10,11,20,13,14,23]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm9, %ymm8
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm9, %ymm24
-; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm7, %zmm2
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm5
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm5[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm21 = [12,1,2,13,4,5,14,7]
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm3
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm2, %ymm21, %ymm3
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [8,21,10,11,20,13,14,23]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm5, %ymm4
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm27
+; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm20, %zmm0
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %ymm9
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm9, %ymm8
-; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25]
-; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm9, %ymm10
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm9, %ymm27
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [2,2,2,3,10,9,10,11]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm9, %zmm10
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm11
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm12
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm12[4],ymm11[4],ymm12[5],ymm11[5],ymm12[6],ymm11[6],ymm12[7],ymm11[7],ymm12[12],ymm11[12],ymm12[13],ymm11[13],ymm12[14],ymm11[14],ymm12[15],ymm11[15]
-; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm10 = ymm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm11[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm11, %ymm28
-; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm11 = ymm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm12[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm12, %ymm29
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11]
-; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm4, %zmm10
-; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm15
-; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm15, %ymm4
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm14
-; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm14, %ymm0
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[8],ymm4[8],ymm0[9],ymm4[9],ymm0[10],ymm4[10],ymm0[11],ymm4[11]
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm14[4],ymm15[4],ymm14[5],ymm15[5],ymm14[6],ymm15[6],ymm14[7],ymm15[7],ymm14[12],ymm15[12],ymm14[13],ymm15[13],ymm14[14],ymm15[14],ymm14[15],ymm15[15]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm5, %zmm0
-; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm10, %zmm0 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm5
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm5[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpermi2d %ymm4, %ymm0, %ymm6
-; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm5, %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm9, %ymm3
+; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25]
+; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm9, %ymm4
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [2,2,2,3,10,9,10,11]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm10, %zmm4
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, (%rsp) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm5
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm6
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %xmm15
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm9[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm15, %xmm4
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm8, %xmm28
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,1,0,1,10,10,10,10]
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm22
+; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm3 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm4 = ymm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm6[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11]
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm29
; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm30
-; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm7, %zmm0
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm21
-; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm1
-; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm11
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm16
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm9, %zmm11
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %xmm2
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm4
-; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm1 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17]
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm31
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm18
-; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm0, %zmm3
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm13
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm13, %xmm1
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm12
-; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm12, %xmm4
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
+; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm18, %zmm3
+; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm5
+; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm5, %ymm4
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm6
+; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm6, %ymm1
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[8],ymm4[8],ymm1[9],ymm4[9],ymm1[10],ymm4[10],ymm1[11],ymm4[11]
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm16
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm17
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm19, %zmm1
+; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm3, %zmm1 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm4
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpermi2d %ymm3, %ymm1, %ymm21
+; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm2
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm31
+; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm20, %zmm1
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm21, %zmm24
+; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm2
+; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm1
+; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm14
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm23
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm14
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %xmm13
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm12
+; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm1 = xmm12[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17]
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
+; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm2
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm4
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm0
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm3
+; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm6
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,2,1,8,9,8,9]
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm20, %zmm5
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm20, %zmm6
; AVX512DQ-FCP-NEXT: movw $9362, %ax # imm = 0x2492
; AVX512DQ-FCP-NEXT: kmovw %eax, %k2
-; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm3, %zmm5 {%k2}
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, %ymm1
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [16,9,10,17,12,13,18,15]
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %xmm10
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[2,1,3,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm6, %zmm5
-; AVX512DQ-FCP-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero
+; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm2, %zmm6 {%k2}
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm0
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm21 = [16,9,10,17,12,13,18,15]
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %xmm2
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[2,1,3,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm21, %zmm6
+; AVX512DQ-FCP-NEXT: vpmovzxwd {{.*#+}} xmm7 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [0,1,8,3,4,9,6,7]
-; AVX512DQ-FCP-NEXT: vpermt2d %ymm3, %ymm19, %ymm1
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm17
-; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm8
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm7
-; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm1 = xmm8[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm3 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
-; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm0, %zmm3
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm7, %ymm19, %ymm0
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm18
+; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm7
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm6
+; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm11 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3]
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
+; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm11
; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm1
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm5
-; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm2
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm20, %zmm4
-; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm3, %zmm4 {%k2}
-; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm3
-; AVX512DQ-FCP-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
-; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm4, %ymm19
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,1,3,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm6, %zmm4
+; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm8
+; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm5
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm20, %zmm9
+; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm11, %zmm9 {%k2}
+; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm8
+; AVX512DQ-FCP-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero
+; AVX512DQ-FCP-NEXT: vpermi2d %ymm5, %ymm9, %ymm19
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm8[2,1,3,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm21, %zmm9
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm15, %xmm5
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm15[0,0,2,1,4,5,6,7]
; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,0,0,0,8,8,10,9]
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %xmm9
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm9, %xmm2
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[0,0,2,1,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm20, %zmm5
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm19, %zmm19
-; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm2
-; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm6
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,0,2,1,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm20, %zmm4
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[8],ymm15[8],ymm14[9],ymm15[9],ymm14[10],ymm15[10],ymm14[11],ymm15[11]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm20, %zmm11
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm19, %zmm19
+; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm5
+; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm10
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm9 = xmm5[0,0,2,1,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm20, %zmm9
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm10
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm15
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm10[0],ymm15[0],ymm10[1],ymm15[1],ymm10[2],ymm15[2],ymm10[3],ymm15[3],ymm10[8],ymm15[8],ymm10[9],ymm15[9],ymm10[10],ymm15[10],ymm10[11],ymm15[11]
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [1,1,1,1,10,10,10,11]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm4, %zmm3
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
+; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # ymm12 = ymm12[0],mem[0],ymm12[1],mem[1],ymm12[2],mem[2],ymm12[3],mem[3],ymm12[8],mem[8],ymm12[9],mem[9],ymm12[10],mem[10],ymm12[11],mem[11]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6]
+; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm13, %zmm12
+; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm3, %zmm12 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm21 = [0,9,2,3,8,5,6,11]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm12, %ymm20
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm2, %ymm21, %ymm20
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [8,9,20,11,12,21,14,15]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm10
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm2, %zmm12
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm20, %zmm10
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm12
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm15
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm12[0],ymm15[0],ymm12[1],ymm15[1],ymm12[2],ymm15[2],ymm12[3],ymm15[3],ymm12[8],ymm15[8],ymm12[9],ymm15[9],ymm12[10],ymm15[10],ymm12[11],ymm15[11]
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [1,1,1,1,10,10,10,11]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm26, %zmm0
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm6
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm7
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[8],ymm6[8],ymm7[9],ymm6[9],ymm7[10],ymm6[10],ymm7[11],ymm6[11]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6]
-; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm7, %zmm6
-; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm6 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm1
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,9,2,3,8,5,6,11]
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm8
-; AVX512DQ-FCP-NEXT: vpermt2d %ymm1, %ymm3, %ymm8
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [8,9,20,11,12,21,14,15]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm1
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm15, %zmm6
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm8, %zmm6
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm1
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm1[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [0,1,0,1,10,10,10,10]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm20, %zmm2
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm8
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm14
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm8[0],ymm14[0],ymm8[1],ymm14[1],ymm8[2],ymm14[2],ymm8[3],ymm14[3],ymm8[8],ymm14[8],ymm8[9],ymm14[9],ymm8[10],ymm14[10],ymm8[11],ymm14[11]
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm26, %zmm12
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm8
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm18, %xmm13
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm13[4],xmm8[5],xmm13[5],xmm8[6],xmm13[6],xmm8[7],xmm13[7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm14
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm13 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm13 = ymm14[0],mem[0],ymm14[1],mem[1],ymm14[2],mem[2],ymm14[3],mem[3],ymm14[8],mem[8],ymm14[9],mem[9],ymm14[10],mem[10],ymm14[11],mem[11]
-; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm7, %zmm13
-; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm12, %zmm13 {%k1}
-; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm10, %xmm0
-; AVX512DQ-FCP-NEXT: vpermi2d %ymm0, %ymm13, %ymm3
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm0
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm4, %zmm0
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm4
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm6
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm6[0],ymm4[1],ymm6[1],ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[8],ymm6[8],ymm4[9],ymm6[9],ymm4[10],ymm6[10],ymm4[11],ymm6[11]
+; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm13, %zmm4
+; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm4 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm0
; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm15, %zmm13
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm3, %zmm0
-; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm9, %xmm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm3
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm20, %zmm1
+; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm4, %zmm2
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm0
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm1
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm1, %ymm21, %ymm4
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm28, %xmm1
+; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm1
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm3 & (zmm1 ^ zmm0))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 ^ (zmm3 & (zmm22 ^ zmm10))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm1 ^ (ymm3 & (ymm4 ^ ymm1))
+; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (zmm1 & (zmm9 ^ zmm19))
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 256(%rax)
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm3 & (zmm2 ^ zmm6))
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax)
-; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm0 & (zmm4 ^ zmm19))
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%rax)
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm0 & (zmm5 ^ zmm17))
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 192(%rax)
-; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm0 & (zmm11 ^ zmm21))
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 128(%rax)
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # zmm1 = zmm1 ^ (zmm0 & (zmm1 ^ mem))
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 320(%rax)
-; AVX512DQ-FCP-NEXT: addq $40, %rsp
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, (%rax)
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm1 & (zmm11 ^ zmm18))
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, 192(%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm1 & (zmm14 ^ zmm24))
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 128(%rax)
+; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm4 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # zmm4 = zmm4 ^ (zmm1 & (zmm4 ^ mem))
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 320(%rax)
+; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm2))
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, 64(%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, 96(%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 256(%rax)
+; AVX512DQ-FCP-NEXT: addq $72, %rsp
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
@@ -8749,417 +8747,426 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
;
; AVX512-FCP-LABEL: store_i16_stride6_vf64:
; AVX512-FCP: # %bb.0:
-; AVX512-FCP-NEXT: subq $1320, %rsp # imm = 0x528
-; AVX512-FCP-NEXT: vmovdqa 96(%rcx), %ymm1
-; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vmovdqa 96(%rdx), %ymm2
-; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15]
-; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
+; AVX512-FCP-NEXT: subq $1448, %rsp # imm = 0x5A8
+; AVX512-FCP-NEXT: vmovdqa 96(%rcx), %ymm4
+; AVX512-FCP-NEXT: vmovdqa 96(%rdx), %ymm13
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm13[4],ymm4[4],ymm13[5],ymm4[5],ymm13[6],ymm4[6],ymm13[7],ymm4[7],ymm13[12],ymm4[12],ymm13[13],ymm4[13],ymm13[14],ymm4[14],ymm13[15],ymm4[15]
+; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm1 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm2 = ymm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm13[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = [4,5,4,5,4,5,4,5,21,22,21,22,21,22,23,23]
-; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm22, %zmm1
-; AVX512-FCP-NEXT: vmovdqa 96(%rsi), %ymm4
-; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm2
-; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm5
+; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm22, %zmm2
+; AVX512-FCP-NEXT: vmovdqa 96(%rsi), %ymm5
; AVX512-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm3
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[12],ymm4[12],ymm5[13],ymm4[13],ymm5[14],ymm4[14],ymm5[15],ymm4[15]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm5, %ymm0
+; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm6
+; AVX512-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm6, %ymm3
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11]
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15]
; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [2,1,2,3,11,11,11,11]
-; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm2
+; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm0
; AVX512-FCP-NEXT: movw $18724, %ax # imm = 0x4924
; AVX512-FCP-NEXT: kmovw %eax, %k1
-; AVX512-FCP-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1}
-; AVX512-FCP-NEXT: vmovdqa 96(%r8), %ymm4
-; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm4[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX512-FCP-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1}
+; AVX512-FCP-NEXT: vmovdqa 96(%r8), %ymm5
+; AVX512-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm5[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm21 = [12,1,2,13,4,5,14,7]
-; AVX512-FCP-NEXT: vmovdqa %ymm2, %ymm3
-; AVX512-FCP-NEXT: vpermt2d %ymm1, %ymm21, %ymm3
+; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm3
+; AVX512-FCP-NEXT: vpermt2d %ymm2, %ymm21, %ymm3
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm24 = [8,21,10,11,20,13,14,23]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
-; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm1
-; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm24, %zmm2
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm1
-; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
+; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm5, %ymm2
+; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm24, %zmm0
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0
+; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa 96(%r9), %ymm2
; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31]
-; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm1
-; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25]
-; AVX512-FCP-NEXT: # ymm15 = mem[0,1,0,1]
-; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm2, %ymm2
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31]
+; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm0
+; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25]
+; AVX512-FCP-NEXT: # ymm10 = mem[0,1,0,1]
+; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm2
; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = [2,2,2,3,10,9,10,11]
-; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm2
+; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm25, %zmm2
; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa 64(%rcx), %ymm2
; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %ymm3
; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15]
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15]
; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm3 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
-; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm22, %zmm2
-; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %ymm4
-; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm1
-; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm5
+; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm22, %zmm2
+; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %ymm5
; AVX512-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm3
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11]
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[12],ymm4[12],ymm5[13],ymm4[13],ymm5[14],ymm4[14],ymm5[15],ymm4[15]
-; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm1
-; AVX512-FCP-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1}
-; AVX512-FCP-NEXT: vmovdqa %ymm1, %ymm2
-; AVX512-FCP-NEXT: vmovdqa 64(%r8), %ymm4
-; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm5, %ymm0
+; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm6
+; AVX512-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm6, %ymm3
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11]
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15]
+; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm0
+; AVX512-FCP-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1}
+; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm2
+; AVX512-FCP-NEXT: vmovdqa 64(%r8), %ymm5
+; AVX512-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm5[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
; AVX512-FCP-NEXT: vpermt2d %ymm3, %ymm21, %ymm2
-; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm3
-; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm24, %zmm1
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
-; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm5, %ymm3
+; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm24, %zmm0
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa 64(%r9), %ymm2
; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm1
-; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm2, %ymm2
-; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm2
+; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm0
+; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm2
+; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm25, %zmm2
; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa 96(%rcx), %xmm3
; AVX512-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vmovdqa 96(%rdx), %xmm4
-; AVX512-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm1 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm2 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512-FCP-NEXT: vmovdqa 96(%rdx), %xmm5
+; AVX512-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm2 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm26 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm26, %zmm2
-; AVX512-FCP-NEXT: vmovdqa 96(%rsi), %xmm4
-; AVX512-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11]
-; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm4, %xmm1
-; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm5
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
+; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm26, %zmm2
+; AVX512-FCP-NEXT: vmovdqa 96(%rsi), %xmm5
; AVX512-FCP-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill
-; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm3
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11]
+; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm5, %xmm0
+; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm6
+; AVX512-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm3
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,2,1,8,9,8,9]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm27, %zmm3
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm27, %zmm3
; AVX512-FCP-NEXT: movw $9362, %ax # imm = 0x2492
; AVX512-FCP-NEXT: kmovw %eax, %k2
; AVX512-FCP-NEXT: vmovdqa32 %zmm2, %zmm3 {%k2}
-; AVX512-FCP-NEXT: vmovdqa %ymm3, %ymm1
+; AVX512-FCP-NEXT: vmovdqa %ymm3, %ymm0
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm28 = [16,9,10,17,12,13,18,15]
-; AVX512-FCP-NEXT: vmovdqa 96(%r8), %xmm4
-; AVX512-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[2,1,3,3,4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa 96(%r8), %xmm5
+; AVX512-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[2,1,3,3,4,5,6,7]
; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm28, %zmm3
-; AVX512-FCP-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
+; AVX512-FCP-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm29 = [0,1,8,3,4,9,6,7]
-; AVX512-FCP-NEXT: vpermt2d %ymm2, %ymm29, %ymm1
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
-; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpermt2d %ymm2, %ymm29, %ymm0
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,0,0,0,8,8,10,9]
; AVX512-FCP-NEXT: vmovdqa 96(%r9), %xmm2
; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9]
-; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm1
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9]
+; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm0
; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,1,4,5,6,7]
-; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm2
+; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm2
; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa 64(%rcx), %xmm3
; AVX512-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %xmm4
-; AVX512-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm1 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm2 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm26, %zmm2
-; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %xmm4
-; AVX512-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm4, %xmm1
-; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm5
+; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %xmm5
+; AVX512-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm2 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
+; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm26, %zmm2
+; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %xmm5
; AVX512-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm3
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm27, %zmm3
+; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm5, %xmm0
+; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm6
+; AVX512-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm3
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm27, %zmm3
; AVX512-FCP-NEXT: vmovdqa32 %zmm2, %zmm3 {%k2}
-; AVX512-FCP-NEXT: vmovdqa %ymm3, %ymm1
-; AVX512-FCP-NEXT: vmovdqa 64(%r8), %xmm4
-; AVX512-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[2,1,3,3,4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa %ymm3, %ymm0
+; AVX512-FCP-NEXT: vmovdqa 64(%r8), %xmm5
+; AVX512-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[2,1,3,3,4,5,6,7]
; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm28, %zmm3
-; AVX512-FCP-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
-; AVX512-FCP-NEXT: vpermt2d %ymm2, %ymm29, %ymm1
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
-; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
+; AVX512-FCP-NEXT: vpermt2d %ymm2, %ymm29, %ymm0
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa 64(%r9), %xmm2
; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm1
+; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm0
; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,1,4,5,6,7]
-; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm2
+; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm2
; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %ymm3
; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm4
-; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm1 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm2 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15]
-; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm22, %zmm1
-; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm4
-; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm2
-; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm5
+; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm5
; AVX512-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm3
+; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm0 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm2 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm5[4],ymm3[4],ymm5[5],ymm3[5],ymm5[6],ymm3[6],ymm5[7],ymm3[7],ymm5[12],ymm3[12],ymm5[13],ymm3[13],ymm5[14],ymm3[14],ymm5[15],ymm3[15]
+; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm22, %zmm0
+; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm5
+; AVX512-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm5, %ymm2
+; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm6
+; AVX512-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm6, %ymm3
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[12],ymm4[12],ymm5[13],ymm4[13],ymm5[14],ymm4[14],ymm5[15],ymm4[15]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm31
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15]
; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm2
-; AVX512-FCP-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1}
-; AVX512-FCP-NEXT: vmovdqa %ymm2, %ymm1
-; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm4
-; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpermt2d %ymm3, %ymm21, %ymm1
-; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm3
+; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1}
+; AVX512-FCP-NEXT: vmovdqa %ymm2, %ymm0
+; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm5
+; AVX512-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm5[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpermt2d %ymm3, %ymm21, %ymm0
+; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm5, %ymm3
; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm24, %zmm2
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
-; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa 32(%r9), %ymm2
; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm1
-; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm2, %ymm2
-; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm2
+; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm0
+; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm2
+; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm25, %zmm2
+; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm3
+; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm5
+; AVX512-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm0 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm2 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm5[4],ymm3[4],ymm5[5],ymm3[5],ymm5[6],ymm3[6],ymm5[7],ymm3[7],ymm5[12],ymm3[12],ymm5[13],ymm3[13],ymm5[14],ymm3[14],ymm5[15],ymm3[15]
+; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm22, %zmm0
+; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm3
+; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm2
+; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm5
+; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm5, %ymm1
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm5[4],ymm3[4],ymm5[5],ymm3[5],ymm5[6],ymm3[6],ymm5[7],ymm3[7],ymm5[12],ymm3[12],ymm5[13],ymm3[13],ymm5[14],ymm3[14],ymm5[15],ymm3[15]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm20
+; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm31
+; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm23, %zmm1
+; AVX512-FCP-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
+; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm2
+; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm21
+; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm0
+; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm1
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm21, %zmm0
+; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm1
+; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm0
+; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm2
+; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm21
+; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm25, %zmm2
; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm14
-; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm13
-; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm1 = ymm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm14[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT: vpsrldq {{.*#+}} ymm2 = ymm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm13[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm13[4],ymm14[4],ymm13[5],ymm14[5],ymm13[6],ymm14[6],ymm13[7],ymm14[7],ymm13[12],ymm14[12],ymm13[13],ymm14[13],ymm13[14],ymm14[14],ymm13[15],ymm14[15]
-; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm22, %zmm1
-; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm10
-; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm10, %ymm2
-; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm9
-; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm9, %ymm0
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11]
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm9[4],ymm10[4],ymm9[5],ymm10[5],ymm9[6],ymm10[6],ymm9[7],ymm10[7],ymm9[12],ymm10[12],ymm9[13],ymm10[13],ymm9[14],ymm10[14],ymm9[15],ymm10[15]
-; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm23, %zmm0
-; AVX512-FCP-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
-; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm12
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm12[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm21
-; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm12, %ymm1
-; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm24, %zmm0
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm21, %zmm21
-; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm11
-; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm11, %ymm0
-; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm11, %ymm1
-; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm25, %zmm1
-; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %xmm2
; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm3
; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm1 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm20
+; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm18
; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm19
; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm26, %zmm1
-; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm3
-; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm4
-; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm0
-; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm4, %xmm2
+; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm15
+; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm12
+; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm15, %xmm0
+; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm12, %xmm2
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm18
-; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm17
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3]
; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm27, %zmm2
; AVX512-FCP-NEXT: vmovdqa32 %zmm1, %zmm2 {%k2}
; AVX512-FCP-NEXT: vmovdqa %ymm2, %ymm0
-; AVX512-FCP-NEXT: vmovdqa 32(%r8), %xmm3
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[2,1,3,3,4,5,6,7]
+; AVX512-FCP-NEXT: vmovdqa 32(%r8), %xmm11
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm11[2,1,3,3,4,5,6,7]
; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm28, %zmm2
-; AVX512-FCP-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
-; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm16
+; AVX512-FCP-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero
; AVX512-FCP-NEXT: vpermt2d %ymm1, %ymm29, %ymm0
; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm22
-; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm6
-; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm5
-; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm1 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm2
+; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm3
+; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT: vpsrldq {{.*#+}} xmm1 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm26, %zmm2
-; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm4
-; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm1
-; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm4, %xmm3
-; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm7
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm27, %zmm7
-; AVX512-FCP-NEXT: vmovdqa32 %zmm2, %zmm7 {%k2}
-; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm2
-; AVX512-FCP-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; AVX512-FCP-NEXT: vpermi2d %ymm3, %ymm7, %ymm29
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,1,3,3,4,5,6,7]
-; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm28, %zmm7
-; AVX512-FCP-NEXT: vmovdqa 32(%r9), %xmm15
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9]
-; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm15, %xmm3
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm15[0,0,2,1,4,5,6,7]
-; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm30, %zmm8
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm29, %zmm23
-; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm3
-; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm16
+; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm17
+; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm26, %zmm1
+; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm9
+; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm8
+; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm9, %xmm0
+; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm8, %xmm2
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
+; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm27, %zmm2
+; AVX512-FCP-NEXT: vmovdqa32 %zmm1, %zmm2 {%k2}
+; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm14
+; AVX512-FCP-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm14[0],zero,xmm14[1],zero,xmm14[2],zero,xmm14[3],zero
+; AVX512-FCP-NEXT: vpermi2d %ymm0, %ymm2, %ymm29
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm14[2,1,3,3,4,5,6,7]
+; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm28, %zmm2
+; AVX512-FCP-NEXT: vmovdqa 32(%r9), %xmm3
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9]
+; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm3, %xmm0
; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[0,0,2,1,4,5,6,7]
; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm7
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[8],ymm10[8],ymm9[9],ymm10[9],ymm9[10],ymm10[10],ymm9[11],ymm10[11]
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [1,1,1,1,10,10,10,11]
-; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm1
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm13[0],ymm14[0],ymm13[1],ymm14[1],ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[8],ymm14[8],ymm13[9],ymm14[9],ymm13[10],ymm14[10],ymm13[11],ymm14[11]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm28 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6]
-; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm28, %zmm4
-; AVX512-FCP-NEXT: vmovdqa32 %zmm1, %zmm4 {%k1}
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
-; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm0
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,9,2,3,8,5,6,11]
-; AVX512-FCP-NEXT: vmovdqa %ymm4, %ymm1
-; AVX512-FCP-NEXT: vpermt2d %ymm0, %ymm5, %ymm1
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm12[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm25 = [8,9,20,11,12,21,14,15]
-; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm4
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm26
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm11[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm2
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,0,1,10,10,10,10]
-; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm29, %zmm23
+; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm10
+; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm0
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm10[0,0,2,1,4,5,6,7]
+; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm5
; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX512-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX512-FCP-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11]
-; AVX512-FCP-NEXT: vmovdqa (%rsp), %xmm3 # 16-byte Reload
-; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
-; AVX512-FCP-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
-; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm3
+; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-FCP-NEXT: vpunpckhwd (%rsp), %xmm1, %xmm1 # 16-byte Folded Reload
+; AVX512-FCP-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [1,1,1,1,10,10,10,11]
+; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm26, %zmm1
; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX512-FCP-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX512-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm6 = ymm6[0],mem[0],ymm6[1],mem[1],ymm6[2],mem[2],ymm6[3],mem[3],ymm6[8],mem[8],ymm6[9],mem[9],ymm6[10],mem[10],ymm6[11],mem[11]
-; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm28, %zmm6
-; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm6 {%k1}
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm13[0],ymm4[0],ymm13[1],ymm4[1],ymm13[2],ymm4[2],ymm13[3],ymm4[3],ymm13[8],ymm4[8],ymm13[9],ymm4[9],ymm13[10],ymm4[10],ymm13[11],ymm4[11]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm28 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6]
+; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm28, %zmm13
+; AVX512-FCP-NEXT: vmovdqa32 %zmm1, %zmm13 {%k1}
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm0
-; AVX512-FCP-NEXT: vmovdqa %ymm6, %ymm3
-; AVX512-FCP-NEXT: vpermt2d %ymm0, %ymm5, %ymm3
+; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm29
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm24 = [0,9,2,3,8,5,6,11]
+; AVX512-FCP-NEXT: vmovdqa %ymm13, %ymm1
+; AVX512-FCP-NEXT: vpermt2d %ymm0, %ymm24, %ymm1
; AVX512-FCP-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; AVX512-FCP-NEXT: # ymm0 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm6
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm3, %zmm27
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm27 = [8,9,20,11,12,21,14,15]
+; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm27, %zmm13
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm1, %zmm25
; AVX512-FCP-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; AVX512-FCP-NEXT: # ymm0 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm6
-; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm6
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15]
+; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm4
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,1,0,1,10,10,10,10]
+; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm4
; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX512-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX512-FCP-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11]
-; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
-; AVX512-FCP-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
-; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm3
+; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
+; AVX512-FCP-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
+; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm26, %zmm1
; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512-FCP-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX512-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm10 = ymm10[0],mem[0],ymm10[1],mem[1],ymm10[2],mem[2],ymm10[3],mem[3],ymm10[8],mem[8],ymm10[9],mem[9],ymm10[10],mem[10],ymm10[11],mem[11]
-; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm28, %zmm10
-; AVX512-FCP-NEXT: vmovdqa32 %zmm3, %zmm10 {%k1}
-; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm0
-; AVX512-FCP-NEXT: vmovdqa %ymm10, %ymm3
-; AVX512-FCP-NEXT: vpermt2d %ymm0, %ymm5, %ymm3
-; AVX512-FCP-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm0 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm10
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm3, %zmm3
-; AVX512-FCP-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm10 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
+; AVX512-FCP-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11]
+; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm28, %zmm0
+; AVX512-FCP-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
+; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %xmm29, %xmm13
+; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm1, %xmm1
+; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm2
+; AVX512-FCP-NEXT: vpermt2d %ymm1, %ymm24, %ymm2
+; AVX512-FCP-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: # ymm1 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm27, %zmm0
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm29
+; AVX512-FCP-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: # ymm1 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm0
-; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm0
-; AVX512-FCP-NEXT: vmovdqa64 %ymm31, %ymm11
-; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[8],ymm11[8],ymm10[9],ymm11[9],ymm10[10],ymm11[10],ymm10[11],ymm11[11]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm18, %xmm11
-; AVX512-FCP-NEXT: vmovdqa64 %xmm17, %xmm12
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
-; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm24, %zmm11
-; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm10
+; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm0
+; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm0
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[2],mem[2],ymm1[3],mem[3],ymm1[8],mem[8],ymm1[9],mem[9],ymm1[10],mem[10],ymm1[11],mem[11]
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7]
+; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm26, %zmm2
+; AVX512-FCP-NEXT: vmovdqa64 %xmm18, %xmm1
; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm12
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7]
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7]
; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
; AVX512-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
; AVX512-FCP-NEXT: # ymm12 = ymm12[0],mem[0],ymm12[1],mem[1],ymm12[2],mem[2],ymm12[3],mem[3],ymm12[8],mem[8],ymm12[9],mem[9],ymm12[10],mem[10],ymm12[11],mem[11]
-; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm28, %zmm12
-; AVX512-FCP-NEXT: vmovdqa32 %zmm11, %zmm12 {%k1}
-; AVX512-FCP-NEXT: vmovdqa64 %xmm16, %xmm10
-; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm10, %xmm9
-; AVX512-FCP-NEXT: vpermi2d %ymm9, %ymm12, %ymm5
-; AVX512-FCP-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm9 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm25, %zmm12
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm5, %zmm5
-; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm15, %xmm4
-; AVX512-FCP-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm9 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm4
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm1 & (zmm4 ^ zmm5))
+; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm28, %zmm12
+; AVX512-FCP-NEXT: vmovdqa32 %zmm2, %zmm12 {%k1}
+; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm11, %xmm1
+; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm2
+; AVX512-FCP-NEXT: vpermt2d %ymm1, %ymm24, %ymm2
+; AVX512-FCP-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: # ymm1 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm27, %zmm12
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm2, %zmm1
+; AVX512-FCP-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: # ymm2 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3
+; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm30, %zmm3
+; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm2
+; AVX512-FCP-NEXT: vmovdqa64 %ymm31, %ymm11
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm11[0],ymm2[1],ymm11[1],ymm2[2],ymm11[2],ymm2[3],ymm11[3],ymm2[8],ymm11[8],ymm2[9],ymm11[9],ymm2[10],ymm11[10],ymm2[11],ymm11[11]
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7]
+; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm26, %zmm8
+; AVX512-FCP-NEXT: vmovdqa64 %xmm16, %xmm2
+; AVX512-FCP-NEXT: vmovdqa64 %xmm17, %xmm9
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7]
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX512-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: # ymm9 = ymm9[0],mem[0],ymm9[1],mem[1],ymm9[2],mem[2],ymm9[3],mem[3],ymm9[8],mem[8],ymm9[9],mem[9],ymm9[10],mem[10],ymm9[11],mem[11]
+; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm28, %zmm9
+; AVX512-FCP-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1}
+; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm14, %xmm2
+; AVX512-FCP-NEXT: vpermi2d %ymm2, %ymm9, %ymm24
+; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm10, %xmm2
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1]
+; AVX512-FCP-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: # ymm6 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm27, %zmm9
+; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm6
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2]
+; AVX512-FCP-NEXT: vpternlogd {{.*#+}} ymm6 = ymm6 ^ (mem & (ymm6 ^ ymm9))
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm8 & (zmm3 ^ zmm1))
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 256(%rax)
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm1 & (zmm0 ^ zmm3))
+; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 256(%rax)
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm8 & (zmm0 ^ zmm29))
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 448(%rax)
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm1 & (zmm6 ^ zmm27))
-; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 640(%rax)
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm1 & (zmm2 ^ zmm26))
-; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax)
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm8 & (zmm4 ^ zmm25))
+; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 640(%rax)
; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm0 & (zmm7 ^ zmm23))
-; AVX512-FCP-NEXT: vmovdqa64 %zmm7, (%rax)
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm0 & (zmm8 ^ zmm22))
-; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 192(%rax)
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm0 & (zmm5 ^ zmm23))
+; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rax)
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm0 & (zmm7 ^ zmm22))
+; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 192(%rax)
; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0]
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm1 & (zmm2 ^ zmm21))
-; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 128(%rax)
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm2 # 64-byte Folded Reload
-; AVX512-FCP-NEXT: # zmm2 = zmm2 ^ (zmm1 & (zmm2 ^ mem))
-; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 320(%rax)
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512-FCP-NEXT: # zmm2 = zmm2 ^ (zmm0 & (zmm2 ^ mem))
-; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 384(%rax)
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512-FCP-NEXT: # zmm2 = zmm2 ^ (zmm0 & (zmm2 ^ mem))
-; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 576(%rax)
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: # zmm3 = zmm3 ^ (zmm1 & (zmm3 ^ mem))
+; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 128(%rax)
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: # zmm3 = zmm3 ^ (zmm1 & (zmm3 ^ mem))
+; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 320(%rax)
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: # zmm3 = zmm3 ^ (zmm0 & (zmm3 ^ mem))
+; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 384(%rax)
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: # zmm3 = zmm3 ^ (zmm0 & (zmm3 ^ mem))
+; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 576(%rax)
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; AVX512-FCP-NEXT: # zmm0 = zmm0 ^ (zmm1 & (zmm0 ^ mem))
@@ -9168,7 +9175,10 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; AVX512-FCP-NEXT: # zmm0 = zmm0 ^ (zmm1 & (zmm0 ^ mem))
; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 704(%rax)
-; AVX512-FCP-NEXT: addq $1320, %rsp # imm = 0x528
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm2 ^ (ymm8 & (ymm24 ^ ymm2))
+; AVX512-FCP-NEXT: vmovdqa %ymm6, 96(%rax)
+; AVX512-FCP-NEXT: vmovdqa %ymm8, 64(%rax)
+; AVX512-FCP-NEXT: addq $1448, %rsp # imm = 0x5A8
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
@@ -9658,417 +9668,426 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
;
; AVX512DQ-FCP-LABEL: store_i16_stride6_vf64:
; AVX512DQ-FCP: # %bb.0:
-; AVX512DQ-FCP-NEXT: subq $1320, %rsp # imm = 0x528
-; AVX512DQ-FCP-NEXT: vmovdqa 96(%rcx), %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdx), %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15]
-; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm1[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
+; AVX512DQ-FCP-NEXT: subq $1448, %rsp # imm = 0x5A8
+; AVX512DQ-FCP-NEXT: vmovdqa 96(%rcx), %ymm4
+; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdx), %ymm13
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm13[4],ymm4[4],ymm13[5],ymm4[5],ymm13[6],ymm4[6],ymm13[7],ymm4[7],ymm13[12],ymm4[12],ymm13[13],ymm4[13],ymm13[14],ymm4[14],ymm13[15],ymm4[15]
+; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm1 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm2 = ymm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm13[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = [4,5,4,5,4,5,4,5,21,22,21,22,21,22,23,23]
-; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm22, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqa 96(%rsi), %ymm4
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm5
+; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm22, %zmm2
+; AVX512DQ-FCP-NEXT: vmovdqa 96(%rsi), %ymm5
; AVX512DQ-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm3
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[12],ymm4[12],ymm5[13],ymm4[13],ymm5[14],ymm4[14],ymm5[15],ymm4[15]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm5, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm6
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm6, %ymm3
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11]
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15]
; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [2,1,2,3,11,11,11,11]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm2
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm0
; AVX512DQ-FCP-NEXT: movw $18724, %ax # imm = 0x4924
; AVX512DQ-FCP-NEXT: kmovw %eax, %k1
-; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqa 96(%r8), %ymm4
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm4[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqa 96(%r8), %ymm5
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm5[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm21 = [12,1,2,13,4,5,14,7]
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, %ymm3
-; AVX512DQ-FCP-NEXT: vpermt2d %ymm1, %ymm21, %ymm3
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm3
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm2, %ymm21, %ymm3
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm24 = [8,21,10,11,20,13,14,23]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm1
-; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm24, %zmm2
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm5, %ymm2
+; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm24, %zmm0
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa 96(%r9), %ymm2
; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm1
-; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25]
-; AVX512DQ-FCP-NEXT: # ymm15 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm0
+; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25]
+; AVX512DQ-FCP-NEXT: # ymm10 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm2
; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = [2,2,2,3,10,9,10,11]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm2
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm25, %zmm2
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa 64(%rcx), %ymm2
; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %ymm3
; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15]
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15]
; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm2 = ymm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm2[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm3 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
-; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm22, %zmm2
-; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %ymm4
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm5
+; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm22, %zmm2
+; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %ymm5
; AVX512DQ-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm3
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm3[0],ymm1[0],ymm3[1],ymm1[1],ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11]
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[12],ymm4[12],ymm5[13],ymm4[13],ymm5[14],ymm4[14],ymm5[15],ymm4[15]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm2, %zmm1 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqa 64(%r8), %ymm4
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm5, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm6
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm6, %ymm3
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11]
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm2, %zmm0 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm2
+; AVX512DQ-FCP-NEXT: vmovdqa 64(%r8), %ymm5
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm5[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
; AVX512DQ-FCP-NEXT: vpermt2d %ymm3, %ymm21, %ymm2
-; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm3
-; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm24, %zmm1
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm5, %ymm3
+; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm24, %zmm0
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa 64(%r9), %ymm2
; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm1
-; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm2
+; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm25, %zmm2
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa 96(%rcx), %xmm3
; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdx), %xmm4
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm1 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm2 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdx), %xmm5
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm2 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm26 = [1,0,2,2,1,0,2,2,16,17,16,17,16,17,16,17]
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm26, %zmm2
-; AVX512DQ-FCP-NEXT: vmovdqa 96(%rsi), %xmm4
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm4, %xmm1
-; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm5
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
+; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm26, %zmm2
+; AVX512DQ-FCP-NEXT: vmovdqa 96(%rsi), %xmm5
; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm3
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,6,7,8,9,6,7,4,5,10,11]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm5, %xmm0
+; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm6
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm3
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,2,1,8,9,8,9]
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm27, %zmm3
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm27, %zmm3
; AVX512DQ-FCP-NEXT: movw $9362, %ax # imm = 0x2492
; AVX512DQ-FCP-NEXT: kmovw %eax, %k2
; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm2, %zmm3 {%k2}
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, %ymm0
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm28 = [16,9,10,17,12,13,18,15]
-; AVX512DQ-FCP-NEXT: vmovdqa 96(%r8), %xmm4
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[2,1,3,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa 96(%r8), %xmm5
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[2,1,3,3,4,5,6,7]
; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm28, %zmm3
-; AVX512DQ-FCP-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
+; AVX512DQ-FCP-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm29 = [0,1,8,3,4,9,6,7]
-; AVX512DQ-FCP-NEXT: vpermt2d %ymm2, %ymm29, %ymm1
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm2, %ymm29, %ymm0
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,0,0,0,8,8,10,9]
; AVX512DQ-FCP-NEXT: vmovdqa 96(%r9), %xmm2
; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm1
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm0
; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,1,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm2
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm2
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa 64(%rcx), %xmm3
; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %xmm4
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm1 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm2 = xmm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm26, %zmm2
-; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %xmm4
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm4, %xmm1
-; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm5
+; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %xmm5
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm2 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
+; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm26, %zmm2
+; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %xmm5
; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm5, %xmm3
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm27, %zmm3
+; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm5, %xmm0
+; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm6
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm3
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm27, %zmm3
; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm2, %zmm3 {%k2}
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqa 64(%r8), %xmm4
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[2,1,3,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa 64(%r8), %xmm5
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[2,1,3,3,4,5,6,7]
; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm28, %zmm3
-; AVX512DQ-FCP-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
-; AVX512DQ-FCP-NEXT: vpermt2d %ymm2, %ymm29, %ymm1
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm2, %ymm29, %ymm0
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa 64(%r9), %xmm2
; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm1
+; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm2, %xmm0
; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,2,1,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm2
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm2
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %ymm3
; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm4
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm1 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm2 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15]
-; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm22, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm4
-; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm5
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm5
; AVX512DQ-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm3
+; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm0 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm2 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm5[4],ymm3[4],ymm5[5],ymm3[5],ymm5[6],ymm3[6],ymm5[7],ymm3[7],ymm5[12],ymm3[12],ymm5[13],ymm3[13],ymm5[14],ymm3[14],ymm5[15],ymm3[15]
+; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm22, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm5
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm5, %ymm2
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm6
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm6, %ymm3
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm5[4],ymm4[4],ymm5[5],ymm4[5],ymm5[6],ymm4[6],ymm5[7],ymm4[7],ymm5[12],ymm4[12],ymm5[13],ymm4[13],ymm5[14],ymm4[14],ymm5[15],ymm4[15]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm31
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15]
; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm2
-; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm4
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpermt2d %ymm3, %ymm21, %ymm1
-; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm3
+; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm5
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm5[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm3, %ymm21, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm5, %ymm3
; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm24, %zmm2
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %ymm2
; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm1
-; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm2
+; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm25, %zmm2
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm3
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm5
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm0 = ymm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm3[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm2 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm5[4],ymm3[4],ymm5[5],ymm3[5],ymm5[6],ymm3[6],ymm5[7],ymm3[7],ymm5[12],ymm3[12],ymm5[13],ymm3[13],ymm5[14],ymm3[14],ymm5[15],ymm3[15]
+; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm22, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm3
+; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm2
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm5
+; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm5, %ymm1
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm5[4],ymm3[4],ymm5[5],ymm3[5],ymm5[6],ymm3[6],ymm5[7],ymm3[7],ymm5[12],ymm3[12],ymm5[13],ymm3[13],ymm5[14],ymm3[14],ymm5[15],ymm3[15]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm20
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm31
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm23, %zmm1
+; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm2
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm2[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm21
+; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm0
+; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm24, %zmm1
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm21, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm1
+; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm2
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm21
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm25, %zmm2
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm14
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm13
-; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm1 = ymm14[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm14[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} ymm2 = ymm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm13[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm13[4],ymm14[4],ymm13[5],ymm14[5],ymm13[6],ymm14[6],ymm13[7],ymm14[7],ymm13[12],ymm14[12],ymm13[13],ymm14[13],ymm13[14],ymm14[14],ymm13[15],ymm14[15]
-; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm22, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm10
-; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm10, %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm9
-; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm9, %ymm0
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11]
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm9[4],ymm10[4],ymm9[5],ymm10[5],ymm9[6],ymm10[6],ymm9[7],ymm10[7],ymm9[12],ymm10[12],ymm9[13],ymm10[13],ymm9[14],ymm10[14],ymm9[15],ymm10[15]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm23, %zmm0
-; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm12
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm12[2,1,3,3,4,5,6,7,10,9,11,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm21
-; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm12, %ymm1
-; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm24, %zmm0
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm21, %zmm21
-; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm11
-; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm11, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm11, %ymm1
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm25, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %xmm2
; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm3
; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm1 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm20
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm18
; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm19
; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm26, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm3
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm4
-; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm0
-; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm4, %xmm2
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm15
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm12
+; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm15, %xmm0
+; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm12, %xmm2
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm18
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm17
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3]
; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm27, %zmm2
; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm1, %zmm2 {%k2}
; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %xmm3
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm3[2,1,3,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %xmm11
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm11[2,1,3,3,4,5,6,7]
; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm28, %zmm2
-; AVX512DQ-FCP-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm16
+; AVX512DQ-FCP-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero
; AVX512DQ-FCP-NEXT: vpermt2d %ymm1, %ymm29, %ymm0
; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm22
-; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm6
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm5
-; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm1 = xmm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm2
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm3
+; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm0 = xmm2[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vpsrldq {{.*#+}} xmm1 = xmm3[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm26, %zmm2
-; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm4
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm1
-; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm4, %xmm3
-; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm7
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm27, %zmm7
-; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm2, %zmm7 {%k2}
-; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm2
-; AVX512DQ-FCP-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; AVX512DQ-FCP-NEXT: vpermi2d %ymm3, %ymm7, %ymm29
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,1,3,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm28, %zmm7
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %xmm15
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm15, %xmm3
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm15[0,0,2,1,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm30, %zmm8
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm29, %zmm23
-; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm3
-; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm16
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm17
+; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm26, %zmm1
+; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm9
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm8
+; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm9, %xmm0
+; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm8, %xmm2
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm27, %zmm2
+; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm1, %zmm2 {%k2}
+; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm14
+; AVX512DQ-FCP-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm14[0],zero,xmm14[1],zero,xmm14[2],zero,xmm14[3],zero
+; AVX512DQ-FCP-NEXT: vpermi2d %ymm0, %ymm2, %ymm29
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm14[2,1,3,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm28, %zmm2
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %xmm3
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm3, %xmm0
; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[0,0,2,1,4,5,6,7]
; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm7
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm9[0],ymm10[0],ymm9[1],ymm10[1],ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[8],ymm10[8],ymm9[9],ymm10[9],ymm9[10],ymm10[10],ymm9[11],ymm10[11]
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [1,1,1,1,10,10,10,11]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm1
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm13[0],ymm14[0],ymm13[1],ymm14[1],ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[8],ymm14[8],ymm13[9],ymm14[9],ymm13[10],ymm14[10],ymm13[11],ymm14[11]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm28 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6]
-; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm28, %zmm4
-; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm1, %zmm4 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm2, %xmm0
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,9,2,3,8,5,6,11]
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, %ymm1
-; AVX512DQ-FCP-NEXT: vpermt2d %ymm0, %ymm5, %ymm1
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm12[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm25 = [8,9,20,11,12,21,14,15]
-; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm4
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm26
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm11[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm2
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,1,0,1,10,10,10,10]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm29, %zmm23
+; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm10
+; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm0
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm10[0,0,2,1,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm5
; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX512DQ-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX512DQ-FCP-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11]
-; AVX512DQ-FCP-NEXT: vmovdqa (%rsp), %xmm3 # 16-byte Reload
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm3
+; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512DQ-FCP-NEXT: vpunpckhwd (%rsp), %xmm1, %xmm1 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [1,1,1,1,10,10,10,11]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm26, %zmm1
; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX512DQ-FCP-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm6 = ymm6[0],mem[0],ymm6[1],mem[1],ymm6[2],mem[2],ymm6[3],mem[3],ymm6[8],mem[8],ymm6[9],mem[9],ymm6[10],mem[10],ymm6[11],mem[11]
-; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm28, %zmm6
-; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm3, %zmm6 {%k1}
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm13[0],ymm4[0],ymm13[1],ymm4[1],ymm13[2],ymm4[2],ymm13[3],ymm4[3],ymm13[8],ymm4[8],ymm13[9],ymm4[9],ymm13[10],ymm4[10],ymm13[11],ymm4[11]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm28 = [17,18,17,18,0,0,19,19,5,4,2,2,5,4,6,6]
+; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm28, %zmm13
+; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm1, %zmm13 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15]
; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm0
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, %ymm3
-; AVX512DQ-FCP-NEXT: vpermt2d %ymm0, %ymm5, %ymm3
+; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm29
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm24 = [0,9,2,3,8,5,6,11]
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm13, %ymm1
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm0, %ymm24, %ymm1
; AVX512DQ-FCP-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm6
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm3, %zmm27
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm27 = [8,9,20,11,12,21,14,15]
+; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm27, %zmm13
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm1, %zmm25
; AVX512DQ-FCP-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm3, %xmm6
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm6
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm4
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,1,0,1,10,10,10,10]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm4
; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX512DQ-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX512DQ-FCP-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11]
-; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm3
+; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm26, %zmm1
; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm10 = ymm10[0],mem[0],ymm10[1],mem[1],ymm10[2],mem[2],ymm10[3],mem[3],ymm10[8],mem[8],ymm10[9],mem[9],ymm10[10],mem[10],ymm10[11],mem[11]
-; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm28, %zmm10
-; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm3, %zmm10 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm0
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, %ymm3
-; AVX512DQ-FCP-NEXT: vpermt2d %ymm0, %ymm5, %ymm3
-; AVX512DQ-FCP-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm25, %zmm10
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm3, %zmm3
-; AVX512DQ-FCP-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm10 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[2],mem[2],ymm0[3],mem[3],ymm0[8],mem[8],ymm0[9],mem[9],ymm0[10],mem[10],ymm0[11],mem[11]
+; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm28, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
+; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm29, %xmm13
+; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm2
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm1, %ymm24, %ymm2
+; AVX512DQ-FCP-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # ymm1 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm27, %zmm0
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm29
+; AVX512DQ-FCP-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # ymm1 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm0
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm11
-; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[8],ymm11[8],ymm10[9],ymm11[9],ymm10[10],ymm11[10],ymm10[11],ymm11[11]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm18, %xmm11
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm17, %xmm12
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm24, %zmm11
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm10
+; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[2],mem[2],ymm1[3],mem[3],ymm1[8],mem[8],ymm1[9],mem[9],ymm1[10],mem[10],ymm1[11],mem[11]
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm26, %zmm2
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm18, %xmm1
; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm12
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7]
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7]
; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
; AVX512DQ-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload
; AVX512DQ-FCP-NEXT: # ymm12 = ymm12[0],mem[0],ymm12[1],mem[1],ymm12[2],mem[2],ymm12[3],mem[3],ymm12[8],mem[8],ymm12[9],mem[9],ymm12[10],mem[10],ymm12[11],mem[11]
-; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm28, %zmm12
-; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm11, %zmm12 {%k1}
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm16, %xmm10
-; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm10, %xmm9
-; AVX512DQ-FCP-NEXT: vpermi2d %ymm9, %ymm12, %ymm5
-; AVX512DQ-FCP-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm9 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm25, %zmm12
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm5, %zmm5
-; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm15, %xmm4
-; AVX512DQ-FCP-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm9 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm4
-; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm1 & (zmm4 ^ zmm5))
+; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm28, %zmm12
+; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm2, %zmm12 {%k1}
+; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm11, %xmm1
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm2
+; AVX512DQ-FCP-NEXT: vpermt2d %ymm1, %ymm24, %ymm2
+; AVX512DQ-FCP-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # ymm1 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm27, %zmm12
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm2, %zmm1
+; AVX512DQ-FCP-NEXT: vpshuflw $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm30, %zmm3
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm2
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm11
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm11[0],ymm2[1],ymm11[1],ymm2[2],ymm11[2],ymm2[3],ymm11[3],ymm2[8],ymm11[8],ymm2[9],ymm11[9],ymm2[10],ymm11[10],ymm2[11],ymm11[11]
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm26, %zmm8
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm16, %xmm2
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm17, %xmm9
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7]
+; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # ymm9 = ymm9[0],mem[0],ymm9[1],mem[1],ymm9[2],mem[2],ymm9[3],mem[3],ymm9[8],mem[8],ymm9[9],mem[9],ymm9[10],mem[10],ymm9[11],mem[11]
+; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm28, %zmm9
+; AVX512DQ-FCP-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1}
+; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm14, %xmm2
+; AVX512DQ-FCP-NEXT: vpermi2d %ymm2, %ymm9, %ymm24
+; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm10, %xmm2
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1]
+; AVX512DQ-FCP-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # ymm6 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm27, %zmm9
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm6
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2]
+; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} ymm6 = ymm6 ^ (mem & (ymm6 ^ ymm9))
+; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm8 & (zmm3 ^ zmm1))
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 256(%rax)
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm1 & (zmm0 ^ zmm3))
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 256(%rax)
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm8 & (zmm0 ^ zmm29))
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 448(%rax)
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (zmm1 & (zmm6 ^ zmm27))
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 640(%rax)
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm1 & (zmm2 ^ zmm26))
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax)
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm8 & (zmm4 ^ zmm25))
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 640(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm0 & (zmm7 ^ zmm23))
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, (%rax)
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm0 & (zmm8 ^ zmm22))
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 192(%rax)
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm0 & (zmm5 ^ zmm23))
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%rax)
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm0 & (zmm7 ^ zmm22))
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 192(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,0]
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm1 & (zmm2 ^ zmm21))
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 128(%rax)
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm2 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # zmm2 = zmm2 ^ (zmm1 & (zmm2 ^ mem))
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 320(%rax)
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # zmm2 = zmm2 ^ (zmm0 & (zmm2 ^ mem))
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 384(%rax)
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # zmm2 = zmm2 ^ (zmm0 & (zmm2 ^ mem))
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 576(%rax)
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # zmm3 = zmm3 ^ (zmm1 & (zmm3 ^ mem))
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 128(%rax)
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # zmm3 = zmm3 ^ (zmm1 & (zmm3 ^ mem))
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 320(%rax)
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # zmm3 = zmm3 ^ (zmm0 & (zmm3 ^ mem))
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 384(%rax)
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # zmm3 = zmm3 ^ (zmm0 & (zmm3 ^ mem))
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 576(%rax)
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; AVX512DQ-FCP-NEXT: # zmm0 = zmm0 ^ (zmm1 & (zmm0 ^ mem))
@@ -10077,7 +10096,10 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512DQ-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
; AVX512DQ-FCP-NEXT: # zmm0 = zmm0 ^ (zmm1 & (zmm0 ^ mem))
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 704(%rax)
-; AVX512DQ-FCP-NEXT: addq $1320, %rsp # imm = 0x528
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm2 ^ (ymm8 & (ymm24 ^ ymm2))
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, 96(%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, 64(%rax)
+; AVX512DQ-FCP-NEXT: addq $1448, %rsp # imm = 0x5A8
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
index 5aa7c055d408e..003dab11a85ba 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
@@ -441,48 +441,48 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[0,1,1,3]
-; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,1,2,1]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,0,2,3,4,5,6,7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3,4],xmm3[5,6],xmm4[7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[3,1,2,3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,1,2,0,4,5,6,7]
-; AVX-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0]
-; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm6[1,1,1,1]
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm7[0]
-; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3,4],xmm3[5,6,7]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,1,3,4,5,6,7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm6[3],xmm4[4,5,6,7]
-; AVX-NEXT: vpsrldq {{.*#+}} xmm5 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1,2,3,4,5,6,7]
-; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm1[0,1,8,9,u,u,u,u,u,u,u,u,2,3,2,3]
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[0,1,0,2,4,5,6,7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3],xmm5[4,5,6,7]
-; AVX-NEXT: vpxor %xmm7, %xmm7, %xmm7
-; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0],xmm7[1,2,3],xmm0[4],xmm7[5,6,7]
-; AVX-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero
-; AVX-NEXT: vpackusdw %xmm8, %xmm7, %xmm7
-; AVX-NEXT: vpackusdw %xmm7, %xmm7, %xmm7
-; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm7[4,5,6],xmm5[7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4,5,6,7]
-; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,2,3,6,7,u,u,u,u,u,u,4,5,12,13]
-; AVX-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; AVX-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
+; AVX-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm0[0],xmm1[0]
+; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[0,1,1,3]
+; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,7,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[3,1,2,1]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[2,0,2,3,4,5,6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3,4],xmm5[5,6],xmm6[7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[3,1,2,3]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[0,1,2,0,4,5,6,7]
+; AVX-NEXT: vmovddup {{.*#+}} xmm8 = mem[0,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm9 = xmm8[1,1,1,1]
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm9[0]
+; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3,4],xmm5[5,6,7]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,1,3,4,5,6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm8[3],xmm6[4,5,6,7]
+; AVX-NEXT: vpsrldq {{.*#+}} xmm7 = xmm3[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1,2,3,4,5,6,7]
+; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[0,1,8,9,u,u,u,u,u,u,u,u,2,3,2,3]
+; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm9 = xmm3[0,1,0,2,4,5,6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3],xmm7[4,5,6,7]
+; AVX-NEXT: vpxor %xmm9, %xmm9, %xmm9
+; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm9[1,2,3],xmm4[4],xmm9[5,6,7]
+; AVX-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero
+; AVX-NEXT: vpackusdw %xmm9, %xmm4, %xmm4
+; AVX-NEXT: vpackusdw %xmm4, %xmm4, %xmm4
+; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2,3],xmm4[4,5,6],xmm7[7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2],xmm3[3],xmm2[4,5,6,7]
+; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[10,11,2,3,6,7,u,u,u,u,u,u,4,5,12,13]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm6[5],xmm0[6,7]
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4,5],xmm1[6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm8[5],xmm0[6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4,5],xmm2[6,7]
; AVX-NEXT: vmovdqa %xmm0, 16(%rax)
-; AVX-NEXT: vmovdqa %xmm5, (%rax)
-; AVX-NEXT: vmovq %xmm4, 48(%rax)
-; AVX-NEXT: vmovdqa %xmm3, 32(%rax)
+; AVX-NEXT: vmovdqa %xmm4, (%rax)
+; AVX-NEXT: vmovq %xmm6, 48(%rax)
+; AVX-NEXT: vmovdqa %xmm5, 32(%rax)
; AVX-NEXT: retq
;
; AVX2-LABEL: store_i16_stride7_vf4:
@@ -2197,8 +2197,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[0,1,0,1]
; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5],xmm2[6,7]
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm4[0,1,0,1]
-; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm12[0,0,0,0]
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3]
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm12[0]
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
@@ -2222,8 +2221,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,2,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
-; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm12[2,2,3,3]
-; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6,7]
+; AVX-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm12[2],xmm3[2,3]
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535]
; AVX-NEXT: vandnps %ymm1, %ymm3, %ymm1
@@ -4283,21 +4281,22 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
;
; AVX-LABEL: store_i16_stride7_vf32:
; AVX: # %bb.0:
-; AVX-NEXT: subq $584, %rsp # imm = 0x248
+; AVX-NEXT: subq $568, %rsp # imm = 0x238
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: vmovdqa 16(%rsi), %xmm12
+; AVX-NEXT: vmovdqa 16(%rsi), %xmm9
+; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovdqa 48(%rsi), %xmm1
+; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovdqa 16(%rdi), %xmm12
; AVX-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovdqa 48(%rsi), %xmm9
-; AVX-NEXT: vmovdqa 16(%rdi), %xmm10
-; AVX-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovdqa 48(%rdi), %xmm0
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
; AVX-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX-NEXT: vmovaps {{.*#+}} ymm11 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535]
-; AVX-NEXT: vandnps %ymm1, %ymm11, %ymm2
+; AVX-NEXT: vmovaps {{.*#+}} ymm10 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535]
+; AVX-NEXT: vandnps %ymm1, %ymm10, %ymm2
; AVX-NEXT: vmovdqa 48(%rdx), %xmm4
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[2,2,2,2]
; AVX-NEXT: vmovdqa 48(%rcx), %xmm3
@@ -4307,14 +4306,14 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; AVX-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9]
; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
-; AVX-NEXT: vandps %ymm5, %ymm11, %ymm5
+; AVX-NEXT: vandps %ymm5, %ymm10, %ymm5
; AVX-NEXT: vorps %ymm2, %ymm5, %ymm6
; AVX-NEXT: vextractf128 $1, %ymm6, %xmm5
; AVX-NEXT: vmovdqa 48(%r8), %xmm2
; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[2,2,2,2]
; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm5[1,2,3,4,5,6],xmm7[7]
-; AVX-NEXT: vmovdqa 48(%r9), %xmm13
-; AVX-NEXT: vpsrldq {{.*#+}} xmm8 = xmm13[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
+; AVX-NEXT: vmovdqa 48(%r9), %xmm11
+; AVX-NEXT: vpsrldq {{.*#+}} xmm8 = xmm11[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm8[1],xmm7[2,3,4,5,6,7]
; AVX-NEXT: vmovdqa 48(%rax), %xmm5
; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[2,2,3,3]
@@ -4322,17 +4321,17 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[1,1,1,1]
; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3,4,5,6,7]
-; AVX-NEXT: vpsrld $16, %xmm13, %xmm7
+; AVX-NEXT: vpsrld $16, %xmm11, %xmm7
; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3],xmm6[4,5,6,7]
; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpsrlq $48, %xmm3, %xmm6
; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm4[1],xmm6[1]
; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1
-; AVX-NEXT: vandnps %ymm1, %ymm11, %ymm1
+; AVX-NEXT: vandnps %ymm1, %ymm10, %ymm1
; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,2,3,3]
; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm0
-; AVX-NEXT: vandps %ymm0, %ymm11, %ymm0
+; AVX-NEXT: vandps %ymm0, %ymm10, %ymm0
; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,2,3,3]
@@ -4342,13 +4341,13 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0],xmm1[1,2,3,4,5,6],xmm6[7]
; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm2[6,7]
-; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,5,6,6,7]
+; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,5,6,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,2]
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6],xmm1[7]
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[2,3,2,3]
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7]
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7]
; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
; AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
@@ -4365,21 +4364,21 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9]
; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1
-; AVX-NEXT: vandnps %ymm0, %ymm11, %ymm0
-; AVX-NEXT: vandps %ymm1, %ymm11, %ymm1
+; AVX-NEXT: vandnps %ymm0, %ymm10, %ymm0
+; AVX-NEXT: vandps %ymm1, %ymm10, %ymm1
; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0
; AVX-NEXT: vmovdqa 16(%r8), %xmm7
; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovdqa 16(%r9), %xmm11
-; AVX-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7]
+; AVX-NEXT: vmovdqa 16(%r9), %xmm10
+; AVX-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
; AVX-NEXT: vmovdqa 16(%rax), %xmm8
; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm8[2,2,3,3]
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3],xmm1[4,5,6,7]
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3]
; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpsrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm8[3],xmm6[4,5,6,7]
@@ -4389,65 +4388,65 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vandnps %ymm1, %ymm6, %ymm1
; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovdqa 32(%rsi), %xmm11
+; AVX-NEXT: vmovdqa 32(%rsi), %xmm10
; AVX-NEXT: vmovdqa 32(%rdi), %xmm12
-; AVX-NEXT: vpsrld $16, %xmm11, %xmm0
+; AVX-NEXT: vpsrld $16, %xmm10, %xmm0
; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm12[0],xmm0[0],xmm12[1],xmm0[1]
-; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7]
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7]
; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT: vmovdqa 32(%rcx), %xmm7
; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[3,3,3,3,4,5,6,7]
; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
-; AVX-NEXT: vmovdqa 32(%rdx), %xmm10
-; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm10[2,2,2,2]
+; AVX-NEXT: vmovdqa 32(%rdx), %xmm8
+; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm8[2,2,2,2]
; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,5],xmm6[6],xmm1[7]
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
; AVX-NEXT: vmovdqa %xmm7, %xmm1
; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpshufhw {{.*#+}} xmm7 = xmm8[0,1,2,3,4,5,6,6]
-; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3]
-; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6
+; AVX-NEXT: vpshufhw {{.*#+}} xmm15 = xmm9[0,1,2,3,4,5,6,6]
+; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,1,2,3]
+; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm15, %ymm6
; AVX-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535]
; AVX-NEXT: vandnps %ymm0, %ymm7, %ymm0
; AVX-NEXT: vandps %ymm7, %ymm6, %ymm6
-; AVX-NEXT: vorps %ymm0, %ymm6, %ymm7
-; AVX-NEXT: vmovdqa 32(%r8), %xmm14
-; AVX-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vorps %ymm0, %ymm6, %ymm15
+; AVX-NEXT: vmovdqa 32(%r8), %xmm7
+; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovdqa 32(%r9), %xmm0
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovdqa 32(%rax), %xmm6
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3]
-; AVX-NEXT: vpsrldq {{.*#+}} xmm15 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1,2],xmm6[3],xmm15[4,5,6,7]
-; AVX-NEXT: vshufps {{.*#+}} xmm14 = xmm0[0,2],xmm6[1,3]
-; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14
-; AVX-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
-; AVX-NEXT: vandps %ymm7, %ymm15, %ymm7
-; AVX-NEXT: vandnps %ymm14, %ymm15, %ymm14
-; AVX-NEXT: vorps %ymm7, %ymm14, %ymm7
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3]
+; AVX-NEXT: vpsrldq {{.*#+}} xmm14 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm6[3],xmm14[4,5,6,7]
+; AVX-NEXT: vshufps {{.*#+}} xmm13 = xmm0[0,2],xmm6[1,3]
+; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm13, %ymm13
+; AVX-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
+; AVX-NEXT: vandps %ymm7, %ymm15, %ymm14
+; AVX-NEXT: vandnps %ymm13, %ymm7, %ymm13
+; AVX-NEXT: vorps %ymm13, %ymm14, %ymm7
; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vpsrld $16, %xmm3, %xmm7
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm4[0],xmm7[0],xmm4[1],xmm7[1]
+; AVX-NEXT: vpsrld $16, %xmm3, %xmm13
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm4[0],xmm13[0],xmm4[1],xmm13[1]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,6]
; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3]
-; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm7, %ymm3
-; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpsrld $16, %xmm9, %xmm4
+; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm13, %ymm3
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[2,2,2,2,4,5,6,7]
-; AVX-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,5,4]
-; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm4
+; AVX-NEXT: vpsrld $16, %xmm7, %xmm4
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm13[0],xmm4[0],xmm13[1],xmm4[1]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[2,2,2,2,4,5,6,7]
+; AVX-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,5,4]
+; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm13, %ymm4
; AVX-NEXT: vmovaps {{.*#+}} ymm7 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0]
; AVX-NEXT: vandnps %ymm3, %ymm7, %ymm3
; AVX-NEXT: vandps %ymm7, %ymm4, %ymm4
; AVX-NEXT: vorps %ymm3, %ymm4, %ymm3
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3]
; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -4461,11 +4460,12 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vorps %ymm2, %ymm3, %ymm2
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vpsrld $16, %xmm1, %xmm2
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm10[0],xmm2[0],xmm10[1],xmm2[1]
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[0,0,1,1]
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm8[0],xmm2[0],xmm8[1],xmm2[1]
+; AVX-NEXT: vmovdqa %xmm8, %xmm11
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[0,0,1,1]
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,2,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,2,1]
; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7]
@@ -4479,72 +4479,69 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[0,1,0,1]
; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5],xmm2[6,7]
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm6[0,0,0,0]
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3]
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm6[0]
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
; AVX-NEXT: vandps %ymm4, %ymm1, %ymm1
; AVX-NEXT: vandnps %ymm0, %ymm4, %ymm0
; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovdqa (%rdx), %xmm9
-; AVX-NEXT: vmovdqa (%rcx), %xmm7
-; AVX-NEXT: vpsrld $16, %xmm7, %xmm0
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1]
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3]
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,0,1,1]
-; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm2
-; AVX-NEXT: vmovdqa (%rsi), %xmm0
+; AVX-NEXT: vmovdqa (%rdx), %xmm7
+; AVX-NEXT: vmovdqa (%rcx), %xmm3
+; AVX-NEXT: vpsrld $16, %xmm3, %xmm0
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
+; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[0,0,1,1]
+; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm10, %ymm0
+; AVX-NEXT: vmovdqa (%rsi), %xmm13
; AVX-NEXT: vmovdqa (%rdi), %xmm14
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,2,2,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,2,1]
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,2,2,4,5,6,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,2,1]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[2,2,2,2,4,5,6,7]
; AVX-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,4]
-; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm11, %ymm11
-; AVX-NEXT: vandnps %ymm2, %ymm5, %ymm2
-; AVX-NEXT: vandps %ymm5, %ymm11, %ymm5
-; AVX-NEXT: vorps %ymm2, %ymm5, %ymm15
-; AVX-NEXT: vmovdqa (%r8), %xmm1
-; AVX-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill
+; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm10, %ymm10
+; AVX-NEXT: vandnps %ymm0, %ymm5, %ymm0
+; AVX-NEXT: vandps %ymm5, %ymm10, %ymm5
+; AVX-NEXT: vorps %ymm0, %ymm5, %ymm15
+; AVX-NEXT: vmovdqa (%r8), %xmm0
+; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovdqa (%r9), %xmm5
; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovdqa (%rax), %xmm2
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
-; AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm5[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
+; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,xmm5[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[0,1,0,1]
-; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm12[5],xmm1[6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm12[5],xmm0[6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[0,1,0,1]
-; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm2[0,0,0,0]
-; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3,4,5],xmm11[6,7]
-; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm11, %ymm1
-; AVX-NEXT: vandps %ymm4, %ymm15, %ymm11
-; AVX-NEXT: vandnps %ymm1, %ymm4, %ymm1
-; AVX-NEXT: vorps %ymm1, %ymm11, %ymm1
-; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm14[0],xmm1[0],xmm14[1],xmm1[1]
-; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7]
-; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[3,3,3,3,4,5,6,7]
-; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
-; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm9[2,2,2,2]
-; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm11[6],xmm1[7]
-; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,6]
-; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,2,3]
-; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
-; AVX-NEXT: vmovaps {{.*#+}} ymm12 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535]
-; AVX-NEXT: vandnps %ymm0, %ymm12, %ymm0
-; AVX-NEXT: vandps %ymm1, %ymm12, %ymm1
+; AVX-NEXT: vinsertps {{.*#+}} xmm12 = xmm12[0,1,2],xmm2[0]
+; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm0
+; AVX-NEXT: vandps %ymm4, %ymm15, %ymm12
+; AVX-NEXT: vandnps %ymm0, %ymm4, %ymm0
+; AVX-NEXT: vorps %ymm0, %ymm12, %ymm0
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vpsrld $16, %xmm13, %xmm0
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1]
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[0,1,0,1]
+; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm0
+; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm3[3,3,3,3,4,5,6,7]
+; AVX-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,4,4,4]
+; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm7[2,2,2,2]
+; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5],xmm13[6],xmm12[7]
+; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,6]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
+; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm1, %ymm1
+; AVX-NEXT: vmovaps {{.*#+}} ymm14 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535]
+; AVX-NEXT: vandnps %ymm0, %ymm14, %ymm0
+; AVX-NEXT: vandps %ymm1, %ymm14, %ymm1
; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0
; AVX-NEXT: vpsrldq {{.*#+}} xmm1 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3],xmm1[4,5,6,7]
-; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm5[0,2],xmm2[1,3]
-; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
+; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,2],xmm2[1,3]
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1
; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX-NEXT: vandnps %ymm1, %ymm2, %ymm1
@@ -4552,194 +4549,190 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX-NEXT: vpsrld $16, %xmm2, %xmm0
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
-; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,6,6]
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3]
+; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,4,5,6,6]
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; AVX-NEXT: vpsrld $16, %xmm9, %xmm1
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX-NEXT: vpsrld $16, %xmm5, %xmm1
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7]
-; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4]
-; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
-; AVX-NEXT: vmovaps {{.*#+}} ymm3 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0]
-; AVX-NEXT: vandnps %ymm0, %ymm3, %ymm0
-; AVX-NEXT: vandps %ymm3, %ymm1, %ymm1
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7]
+; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,5,4]
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1
+; AVX-NEXT: vmovaps {{.*#+}} ymm5 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0]
+; AVX-NEXT: vandnps %ymm0, %ymm5, %ymm0
+; AVX-NEXT: vandps %ymm5, %ymm1, %ymm1
; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm5[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm13[0,1,0,1]
-; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm3[5],xmm1[6,7]
-; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm5[0,2],xmm13[1,3]
-; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; AVX-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535]
-; AVX-NEXT: vandps %ymm3, %ymm0, %ymm0
-; AVX-NEXT: vandnps %ymm1, %ymm3, %ymm1
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm9[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm10[0,1,0,1]
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm5[5],xmm1[6,7]
+; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm9[0,2],xmm10[1,3]
+; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
+; AVX-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535]
+; AVX-NEXT: vandps %ymm5, %ymm0, %ymm0
+; AVX-NEXT: vandnps %ymm1, %ymm5, %ymm1
; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vpsrlq $48, %xmm2, %xmm0
-; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm4[1],xmm0[1]
+; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm8[1],xmm0[1]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3]
-; AVX-NEXT: vpsrldq {{.*#+}} xmm3 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX-NEXT: vpsrldq {{.*#+}} xmm5 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535]
; AVX-NEXT: vandnps %ymm0, %ymm2, %ymm0
; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX-NEXT: vpsrld $16, %xmm2, %xmm1
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7]
-; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4]
-; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm13[2,3,2,3]
-; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1],xmm3[2,3,4,5,6,7]
-; AVX-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7]
-; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[8,9,8,9,8,9,8,9,12,13,6,7,10,11,12,13]
-; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3
-; AVX-NEXT: vmovaps {{.*#+}} ymm5 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0]
-; AVX-NEXT: vandps %ymm5, %ymm0, %ymm0
-; AVX-NEXT: vandnps %ymm3, %ymm5, %ymm3
-; AVX-NEXT: vorps %ymm3, %ymm0, %ymm0
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm5[2],xmm1[2],xmm5[3],xmm1[3]
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,2,2,2,4,5,6,7]
+; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,5,4]
+; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[2,3,2,3]
+; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm5[0],xmm8[1],xmm5[2,3,4,5,6,7]
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7]
+; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,8,9,8,9,12,13,6,7,10,11,12,13]
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm8, %ymm1
+; AVX-NEXT: vmovaps {{.*#+}} ymm8 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0]
+; AVX-NEXT: vandps %ymm0, %ymm8, %ymm0
+; AVX-NEXT: vandnps %ymm1, %ymm8, %ymm1
+; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovdqa %xmm10, %xmm4
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
+; AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9]
+; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm9[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[2,2,3,3]
+; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm1, %ymm1
+; AVX-NEXT: vandnps %ymm0, %ymm14, %ymm0
+; AVX-NEXT: vandps %ymm1, %ymm14, %ymm1
+; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7]
-; AVX-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9]
-; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; AVX-NEXT: vpslldq {{.*#+}} xmm3 = zero,zero,xmm10[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
-; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm10[2,2,3,3]
-; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3
-; AVX-NEXT: vandnps %ymm0, %ymm12, %ymm0
-; AVX-NEXT: vandps %ymm3, %ymm12, %ymm3
-; AVX-NEXT: vorps %ymm0, %ymm3, %ymm0
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7]
-; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,4]
-; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,3,2,3]
-; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm5[1],xmm3[2,3,4,5,6,7]
-; AVX-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm11[4],xmm1[4],xmm11[5],xmm1[5],xmm11[6],xmm1[6],xmm11[7],xmm1[7]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,2,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,2,1]
-; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[2,2,3,3]
-; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm8[2,3],xmm5[4,5,6,7]
-; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7]
+; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4]
+; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[2,3,2,3]
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm8[1],xmm1[2,3,4,5,6,7]
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm13[4],xmm2[4],xmm13[5],xmm2[5],xmm13[6],xmm2[6],xmm13[7],xmm2[7]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,2,2,4,5,6,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,2,1]
+; AVX-NEXT: vinsertps {{.*#+}} xmm8 = xmm8[0],xmm6[2],xmm8[2,3]
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm8, %ymm1
; AVX-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535]
; AVX-NEXT: vandnps %ymm0, %ymm15, %ymm0
-; AVX-NEXT: vandps %ymm3, %ymm15, %ymm3
-; AVX-NEXT: vorps %ymm0, %ymm3, %ymm0
+; AVX-NEXT: vandps %ymm1, %ymm15, %ymm1
+; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vpsrlq $48, %xmm2, %xmm0
-; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm4[1],xmm0[1]
-; AVX-NEXT: vpermilps $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
-; AVX-NEXT: # xmm3 = mem[0,0,1,1]
-; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload
-; AVX-NEXT: # xmm3 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
-; AVX-NEXT: vpsrldq {{.*#+}} xmm5 = xmm10[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,2,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,1]
-; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3
+; AVX-NEXT: vpsrlq $48, %xmm12, %xmm0
+; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm11[1],xmm0[1]
+; AVX-NEXT: vpermilps $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
+; AVX-NEXT: # xmm8 = mem[0,0,1,1]
+; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
+; AVX-NEXT: # xmm8 = xmm8[0],mem[0],xmm8[1],mem[1],xmm8[2],mem[2],xmm8[3],mem[3]
+; AVX-NEXT: vpsrldq {{.*#+}} xmm9 = xmm9[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,2,2,4,5,6,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,2,1]
+; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm9
; AVX-NEXT: vmovaps {{.*#+}} ymm8 = [65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
; AVX-NEXT: vandnps %ymm0, %ymm8, %ymm0
-; AVX-NEXT: vandps %ymm3, %ymm8, %ymm3
-; AVX-NEXT: vorps %ymm0, %ymm3, %ymm3
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX-NEXT: vpblendw $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm14 # 16-byte Folded Reload
-; AVX-NEXT: # xmm14 = mem[0,1,2],xmm2[3],mem[4,5,6,7]
-; AVX-NEXT: vpshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
-; AVX-NEXT: # xmm5 = mem[0,1,0,1]
-; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm2[0,0,0,0]
-; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm10[6,7]
-; AVX-NEXT: vpsrld $16, %xmm1, %xmm10
-; AVX-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm11[2],xmm10[2],xmm11[3],xmm10[3]
-; AVX-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7]
+; AVX-NEXT: vandps %ymm8, %ymm9, %ymm9
+; AVX-NEXT: vorps %ymm0, %ymm9, %ymm11
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; AVX-NEXT: vpblendw $247, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm5 # 16-byte Folded Reload
+; AVX-NEXT: # xmm5 = mem[0,1,2],xmm12[3],mem[4,5,6,7]
+; AVX-NEXT: vpshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
+; AVX-NEXT: # xmm9 = mem[0,1,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,0,0,0]
+; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5],xmm12[6,7]
+; AVX-NEXT: vpsrld $16, %xmm2, %xmm12
+; AVX-NEXT: vpunpckhdq {{.*#+}} xmm12 = xmm13[2],xmm12[2],xmm13[3],xmm12[3]
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7]
; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[8,9,8,9,8,9,8,9,12,13,6,7,10,11,12,13]
-; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5
-; AVX-NEXT: vmovaps {{.*#+}} ymm6 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
-; AVX-NEXT: vandps %ymm6, %ymm3, %ymm3
-; AVX-NEXT: vandnps %ymm5, %ymm6, %ymm5
-; AVX-NEXT: vorps %ymm5, %ymm3, %ymm5
-; AVX-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7]
-; AVX-NEXT: vpslldq {{.*#+}} xmm10 = zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9]
-; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm10, %ymm3
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX-NEXT: vpslldq {{.*#+}} xmm10 = zero,zero,xmm4[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
-; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm4[2,2,3,3]
-; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm10, %ymm10
-; AVX-NEXT: vandnps %ymm3, %ymm12, %ymm3
-; AVX-NEXT: vandps %ymm12, %ymm10, %ymm10
-; AVX-NEXT: vorps %ymm3, %ymm10, %ymm3
-; AVX-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[2,2,2,2,4,5,6,7]
-; AVX-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,5,4]
+; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm6, %ymm6
+; AVX-NEXT: vmovaps {{.*#+}} ymm9 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
+; AVX-NEXT: vandps %ymm9, %ymm11, %ymm11
+; AVX-NEXT: vandnps %ymm6, %ymm9, %ymm6
+; AVX-NEXT: vorps %ymm6, %ymm11, %ymm6
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
+; AVX-NEXT: vpslldq {{.*#+}} xmm12 = zero,zero,zero,zero,zero,zero,xmm11[0,1,2,3,4,5,6,7,8,9]
+; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm12, %ymm11
+; AVX-NEXT: vpslldq {{.*#+}} xmm12 = zero,zero,xmm4[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm4[2,2,3,3]
+; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm12, %ymm12
+; AVX-NEXT: vandnps %ymm11, %ymm14, %ymm11
+; AVX-NEXT: vandps %ymm14, %ymm12, %ymm12
+; AVX-NEXT: vorps %ymm11, %ymm12, %ymm11
+; AVX-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[2,3,2,3]
-; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3,4,5,6,7]
-; AVX-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,2,2,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,2,1]
-; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[2,2,3,3]
-; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm12[2,3],xmm11[4,5,6,7]
-; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10
-; AVX-NEXT: vandnps %ymm3, %ymm15, %ymm3
-; AVX-NEXT: vandps %ymm15, %ymm10, %ymm10
-; AVX-NEXT: vorps %ymm3, %ymm10, %ymm3
-; AVX-NEXT: vpsrlq $48, %xmm7, %xmm7
-; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm9[1],xmm7[1]
-; AVX-NEXT: vpermilps $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
-; AVX-NEXT: # xmm9 = mem[0,0,1,1]
-; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm7, %ymm7
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
-; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
-; AVX-NEXT: # xmm9 = xmm9[0],mem[0],xmm9[1],mem[1],xmm9[2],mem[2],xmm9[3],mem[3]
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[2,2,2,2,4,5,6,7]
+; AVX-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,4]
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm1[2,3,2,3]
+; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3,4,5,6,7]
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,2,2,4,5,6,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,1]
+; AVX-NEXT: vinsertps {{.*#+}} xmm13 = xmm13[0],xmm1[2],xmm13[2,3]
+; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm12
+; AVX-NEXT: vandnps %ymm11, %ymm15, %ymm11
+; AVX-NEXT: vandps %ymm15, %ymm12, %ymm12
+; AVX-NEXT: vorps %ymm11, %ymm12, %ymm11
+; AVX-NEXT: vpsrlq $48, %xmm3, %xmm3
+; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm7[1],xmm3[1]
+; AVX-NEXT: vpermilps $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
+; AVX-NEXT: # xmm7 = mem[0,0,1,1]
+; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm3, %ymm3
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload
+; AVX-NEXT: # xmm7 = xmm7[0],mem[0],xmm7[1],mem[1],xmm7[2],mem[2],xmm7[3],mem[3]
; AVX-NEXT: vpsrldq {{.*#+}} xmm4 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,2,2,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,1]
-; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm4, %ymm4
-; AVX-NEXT: vandnps %ymm7, %ymm8, %ymm7
+; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,2,2,4,5,6,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1]
+; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm4, %ymm4
+; AVX-NEXT: vandnps %ymm3, %ymm8, %ymm3
; AVX-NEXT: vandps %ymm4, %ymm8, %ymm4
-; AVX-NEXT: vorps %ymm7, %ymm4, %ymm4
-; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
-; AVX-NEXT: # xmm7 = mem[0,1,0,1]
-; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm13[0,0,0,0]
-; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3]
-; AVX-NEXT: vpsrld $16, %xmm2, %xmm8
-; AVX-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm1[2],xmm8[2],xmm1[3],xmm8[3]
-; AVX-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
-; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[8,9,8,9,8,9,8,9,12,13,6,7,10,11,12,13]
-; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1
-; AVX-NEXT: vandps %ymm6, %ymm4, %ymm2
-; AVX-NEXT: vandnps %ymm1, %ymm6, %ymm1
-; AVX-NEXT: vorps %ymm1, %ymm2, %ymm1
+; AVX-NEXT: vorps %ymm3, %ymm4, %ymm3
+; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
+; AVX-NEXT: # xmm4 = mem[0,1,0,1]
+; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm10[0,0,0,0]
+; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[3]
+; AVX-NEXT: vpsrld $16, %xmm0, %xmm7
+; AVX-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm2[2],xmm7[2],xmm2[3],xmm7[3]
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
+; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,8,9,8,9,8,9,12,13,6,7,10,11,12,13]
+; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX-NEXT: vandps %ymm3, %ymm9, %ymm3
+; AVX-NEXT: vandnps %ymm2, %ymm9, %ymm2
+; AVX-NEXT: vorps %ymm2, %ymm3, %ymm2
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: vmovaps %ymm1, 96(%rax)
-; AVX-NEXT: vmovaps %ymm3, 64(%rax)
-; AVX-NEXT: vmovaps %ymm5, 320(%rax)
+; AVX-NEXT: vmovaps %ymm2, 96(%rax)
+; AVX-NEXT: vmovaps %ymm11, 64(%rax)
+; AVX-NEXT: vmovaps %ymm6, 320(%rax)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm0, 288(%rax)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm0, 192(%rax)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm0, 128(%rax)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm1, 32(%rax)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm0, 32(%rax)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm1, (%rax)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
@@ -4754,10 +4747,10 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vmovaps %xmm1, 416(%rax)
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT: vmovaps %xmm1, 432(%rax)
-; AVX-NEXT: vmovdqa %xmm14, 384(%rax)
+; AVX-NEXT: vmovdqa %xmm5, 384(%rax)
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovaps %xmm0, 400(%rax)
-; AVX-NEXT: addq $584, %rsp # imm = 0x248
+; AVX-NEXT: addq $568, %rsp # imm = 0x238
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
@@ -5859,1245 +5852,1205 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
;
; AVX512-LABEL: store_i16_stride7_vf32:
; AVX512: # %bb.0:
-; AVX512-NEXT: subq $680, %rsp # imm = 0x2A8
-; AVX512-NEXT: vmovdqa (%rcx), %ymm1
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
-; AVX512-NEXT: vpshufb %ymm13, %ymm1, %ymm0
-; AVX512-NEXT: vmovdqa64 %ymm1, %ymm27
-; AVX512-NEXT: vmovdqa (%rdx), %ymm8
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpshufb %ymm14, %ymm8, %ymm1
-; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0
-; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vmovdqa (%rsi), %ymm9
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX512-NEXT: vpshufb %ymm12, %ymm9, %ymm0
-; AVX512-NEXT: vmovdqa (%rdi), %ymm11
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm15 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19]
-; AVX512-NEXT: vpshufb %ymm15, %ymm11, %ymm1
-; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0
-; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vmovdqa (%r9), %ymm1
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
-; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm2
-; AVX512-NEXT: vmovdqa64 %ymm1, %ymm16
-; AVX512-NEXT: vmovdqa (%r8), %ymm4
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
-; AVX512-NEXT: vpshufb %ymm1, %ymm4, %ymm3
-; AVX512-NEXT: vmovdqa64 %ymm4, %ymm17
-; AVX512-NEXT: vpor %ymm2, %ymm3, %ymm2
-; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vmovdqa 32(%r9), %xmm2
-; AVX512-NEXT: vmovdqa 32(%r8), %xmm10
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3]
-; AVX512-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,6]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,18,19,0,19,19,0,0,0,1,0,1,2,0,0,3]
-; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm4
-; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa 32(%rdi), %ymm2
-; AVX512-NEXT: vmovdqa 32(%rsi), %ymm10
-; AVX512-NEXT: vpshufb %ymm12, %ymm10, %ymm3
-; AVX512-NEXT: vpshufb %ymm15, %ymm2, %ymm4
-; AVX512-NEXT: vpor %ymm3, %ymm4, %ymm3
-; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa 32(%rcx), %ymm12
-; AVX512-NEXT: vmovdqa 32(%rdx), %ymm15
-; AVX512-NEXT: vpshufb %ymm13, %ymm12, %ymm3
-; AVX512-NEXT: vpshufb %ymm14, %ymm15, %ymm4
-; AVX512-NEXT: vpor %ymm3, %ymm4, %ymm3
-; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa 32(%r9), %ymm13
-; AVX512-NEXT: vmovdqa 32(%r8), %ymm14
-; AVX512-NEXT: vpshufb %ymm0, %ymm13, %ymm0
-; AVX512-NEXT: vpshufb %ymm1, %ymm14, %ymm1
-; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0
+; AVX512-NEXT: subq $632, %rsp # imm = 0x278
+; AVX512-NEXT: vmovdqa 32(%rsi), %ymm6
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-NEXT: vpshufb %ymm8, %ymm6, %ymm0
+; AVX512-NEXT: vmovdqa 32(%rdi), %ymm7
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19]
+; AVX512-NEXT: vpshufb %ymm1, %ymm7, %ymm2
+; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa (%rcx), %xmm0
-; AVX512-NEXT: vmovdqa (%rdx), %xmm3
-; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm1 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
-; AVX512-NEXT: vpshufb %xmm1, %xmm0, %xmm4
-; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,2,2]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3],xmm4[4],xmm5[5,6],xmm4[7]
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,2,4,5,6,7]
-; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,16,0,0,17,17,0,0,0,0,0,1,2,0,0,3]
-; AVX512-NEXT: vpermi2d %zmm5, %zmm4, %zmm26
-; AVX512-NEXT: vmovdqa (%r9), %xmm4
-; AVX512-NEXT: vmovdqa (%r8), %xmm5
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,5,7,6]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,2,4,5,6,7]
-; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,0,0,1,0,1,1,0,0,18,19,0,19,19,0,0]
-; AVX512-NEXT: vpermi2d %zmm7, %zmm6, %zmm25
-; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: vpbroadcastd 8(%rax), %ymm0
-; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX512-NEXT: vmovdqa (%rax), %ymm3
-; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
-; AVX512-NEXT: vpshufb %ymm7, %ymm3, %ymm6
-; AVX512-NEXT: vmovdqa64 %ymm7, %ymm21
-; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0
+; AVX512-NEXT: vmovdqa 32(%rcx), %ymm4
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
+; AVX512-NEXT: vpshufb %ymm3, %ymm4, %ymm0
+; AVX512-NEXT: vmovdqa 32(%rdx), %ymm9
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpshufb %ymm5, %ymm9, %ymm2
+; AVX512-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[2,2,2,2,6,6,6,6]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15]
-; AVX512-NEXT: vmovdqa64 %ymm3, %ymm28
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm12[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[3,3,3,3,7,7,7,7]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15]
-; AVX512-NEXT: vmovdqa64 %ymm3, %ymm29
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm10[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[2,2,2,2,6,6,6,6]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15]
-; AVX512-NEXT: vmovdqa64 %ymm3, %ymm30
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm10[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[3,3,3,3,7,7,7,7]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8,9,10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15]
-; AVX512-NEXT: vmovdqa64 %ymm3, %ymm31
-; AVX512-NEXT: vprold $16, %ymm13, %ymm4
-; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm14[1,2,2,3,5,6,6,7]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15]
+; AVX512-NEXT: vmovdqa 32(%r9), %ymm14
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
+; AVX512-NEXT: vpshufb %ymm2, %ymm14, %ymm12
+; AVX512-NEXT: vmovdqa 32(%r8), %ymm10
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
+; AVX512-NEXT: vpshufb %ymm0, %ymm10, %ymm13
+; AVX512-NEXT: vpor %ymm12, %ymm13, %ymm12
+; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa (%rcx), %ymm15
+; AVX512-NEXT: vpshufb %ymm3, %ymm15, %ymm3
+; AVX512-NEXT: vmovdqa (%rdx), %ymm12
+; AVX512-NEXT: vpshufb %ymm5, %ymm12, %ymm5
+; AVX512-NEXT: vmovdqa64 %ymm12, %ymm29
+; AVX512-NEXT: vpor %ymm3, %ymm5, %ymm3
; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm13[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,3,6,6,6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm14[3,3,3,3,7,7,7,7]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15]
+; AVX512-NEXT: vmovdqa (%rsi), %ymm3
; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vmovdqa64 %ymm27, %ymm3
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm3[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512-NEXT: vpshufb %ymm8, %ymm3, %ymm3
+; AVX512-NEXT: vmovdqa (%rdi), %ymm13
+; AVX512-NEXT: vpshufb %ymm1, %ymm13, %ymm1
+; AVX512-NEXT: vpor %ymm3, %ymm1, %ymm1
+; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vmovdqa (%r9), %ymm3
+; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm1
+; AVX512-NEXT: vmovdqa64 %ymm3, %ymm27
+; AVX512-NEXT: vmovdqa (%r8), %ymm2
+; AVX512-NEXT: vpshufb %ymm0, %ymm2, %ymm0
+; AVX512-NEXT: vmovdqa64 %ymm2, %ymm17
+; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vmovdqa 32(%r9), %xmm2
+; AVX512-NEXT: vmovdqa 32(%r8), %xmm0
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,6]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,18,19,0,19,19,0,0,0,1,0,1,2,0,0,3]
+; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa (%rcx), %xmm1
+; AVX512-NEXT: vmovdqa (%rdx), %xmm0
+; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,2]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6],xmm2[7]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,2,4,5,6,7]
+; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,16,0,0,17,17,0,0,0,0,0,1,2,0,0,3]
+; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm5
+; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa (%r9), %xmm2
+; AVX512-NEXT: vmovdqa (%r8), %xmm3
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; AVX512-NEXT: vpshufhw {{.*#+}} xmm8 = xmm5[0,1,2,3,4,5,7,6]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,2,4,5,6,7]
+; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,0,1,0,1,1,0,0,18,19,0,19,19,0,0]
+; AVX512-NEXT: vpermi2d %zmm8, %zmm5, %zmm28
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm5 = ymm6[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm8 = ymm7[1,1,1,1,5,5,5,5]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm8[2],ymm5[3,4],ymm8[5],ymm5[6,7,8,9],ymm8[10],ymm5[11,12],ymm8[13],ymm5[14,15]
+; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
+; AVX512-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm5 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm8 = ymm9[0,1,1,3,4,5,5,7]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0,1],ymm5[2],ymm8[3,4],ymm5[5],ymm8[6,7,8,9],ymm5[10],ymm8[11,12],ymm5[13],ymm8[14,15]
+; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,2]
+; AVX512-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm5 = ymm14[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm8 = ymm10[0,0,2,1,4,4,6,5]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm8[3],ymm5[4,5],ymm8[6],ymm5[7,8,9,10],ymm8[11],ymm5[12,13],ymm8[14],ymm5[15]
+; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3]
+; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm8 = ymm9[2,2,2,2,6,6,6,6]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm8[2],ymm5[3,4],ymm8[5],ymm5[6,7,8,9],ymm8[10],ymm5[11,12],ymm8[13],ymm5[14,15]
+; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
+; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm9[3,3,3,3,7,7,7,7]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm8[3,3,3,3,7,7,7,7]
; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15]
-; AVX512-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm11[3,3,3,3,7,7,7,7]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8,9,10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15]
+; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vmovdqa64 %ymm16, %ymm4
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,3,6,6,6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm17[3,3,3,3,7,7,7,7]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15]
-; AVX512-NEXT: vmovdqa64 %ymm4, %ymm19
-; AVX512-NEXT: vmovdqa 32(%rdi), %xmm4
-; AVX512-NEXT: vmovdqa 32(%rsi), %xmm5
-; AVX512-NEXT: vprold $16, %xmm5, %xmm6
-; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[1,1,2,3]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2],xmm7[3,4],xmm6[5],xmm7[6,7]
-; AVX512-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; AVX512-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vmovdqa 32(%rcx), %xmm4
-; AVX512-NEXT: vpshufb %xmm1, %xmm4, %xmm1
-; AVX512-NEXT: vmovdqa 32(%rdx), %xmm5
-; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,2,2]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0],xmm1[1],xmm7[2,3],xmm1[4],xmm7[5,6],xmm1[7]
-; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512-NEXT: vmovdqa64 %xmm1, %xmm18
-; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[1,1,1,1,5,5,5,5]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm10[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15]
-; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm15[0,1,1,3,4,5,5,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm12[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm6[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[2,2,2,2,6,6,6,6]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15]
+; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
+; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm7[3,3,3,3,7,7,7,7]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm5 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,2,2,2,6,6,6,6]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8,9,10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15]
+; AVX512-NEXT: vprold $16, %ymm14, %ymm5
+; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm10[1,2,2,3,5,6,6,7]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7,8,9],ymm5[10],ymm6[11,12],ymm5[13],ymm6[14,15]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm10[3,3,3,3,7,7,7,7]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm7 = ymm14[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,2,2,3,6,6,6,7]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm10 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7,8,9],ymm6[10],ymm7[11,12],ymm6[13],ymm7[14,15]
+; AVX512-NEXT: vmovdqa64 %ymm10, %ymm16
+; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; AVX512-NEXT: vpbroadcastd 8(%rax), %ymm1
+; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512-NEXT: vmovdqa (%rax), %ymm3
+; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm3[12,13,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[14,15,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[16,17,u,u]
+; AVX512-NEXT: vmovdqa64 %ymm3, %ymm30
+; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm12
+; AVX512-NEXT: vpermq {{.*#+}} ymm18 = ymm4[2,1,3,3]
+; AVX512-NEXT: vmovdqa %ymm15, %ymm6
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm15[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm29[3,3,3,3,7,7,7,7]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15]
+; AVX512-NEXT: vpermq {{.*#+}} ymm26 = ymm5[2,1,3,2]
+; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm13[3,3,3,3,7,7,7,7]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
+; AVX512-NEXT: vmovdqa64 %ymm27, %ymm9
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,3,6,6,6,7]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm17[3,3,3,3,7,7,7,7]
; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
-; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[0,0,2,1,4,4,6,5]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm13[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
-; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,5,0,0,0,6,0,0,6,0,0,0,7,0,0,7]
-; AVX512-NEXT: vmovdqa 32(%rax), %ymm2
-; AVX512-NEXT: vpermd %zmm2, %zmm1, %zmm27
-; AVX512-NEXT: vmovdqa64 %ymm21, %ymm1
-; AVX512-NEXT: vpshufb %ymm1, %ymm2, %ymm1
-; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,5,7]
-; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm20
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512-NEXT: vpshufb %xmm4, %xmm0, %xmm5
-; AVX512-NEXT: vpshufb %xmm4, %xmm6, %xmm14
-; AVX512-NEXT: vmovdqa (%rdi), %xmm1
-; AVX512-NEXT: vmovdqa (%rsi), %xmm2
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512-NEXT: vpshufb %xmm4, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa64 %ymm0, %ymm21
-; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; AVX512-NEXT: vprold $16, %xmm2, %xmm2
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7]
-; AVX512-NEXT: vmovdqa64 %ymm1, %ymm24
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm1 = ymm3[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[0,1,1,3,4,5,5,7]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15]
; AVX512-NEXT: vmovdqa64 %ymm1, %ymm23
-; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[2,2,2,2,6,6,6,6]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
-; AVX512-NEXT: vmovdqa64 %ymm1, %ymm22
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm1 = ymm9[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[1,1,1,1,5,5,5,5]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm13 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[2,2,2,2,6,6,6,6]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm9[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6,7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14,15]
-; AVX512-NEXT: vmovdqa64 %ymm16, %ymm4
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm1 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm17[0,0,2,1,4,4,6,5]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15]
-; AVX512-NEXT: vprold $16, %ymm16, %ymm1
-; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm17[1,2,2,3,5,6,6,7]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm10[0,1],ymm1[2],ymm10[3,4],ymm1[5],ymm10[6,7,8,9],ymm1[10],ymm10[11,12],ymm1[13],ymm10[14,15]
-; AVX512-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm10 = mem[0,2,3,3,4,5,6,7]
-; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1]
+; AVX512-NEXT: vmovdqa 32(%rdi), %xmm1
+; AVX512-NEXT: vmovdqa 32(%rsi), %xmm2
+; AVX512-NEXT: vprold $16, %xmm2, %xmm3
+; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,2,3]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
+; AVX512-NEXT: vmovdqa64 %ymm3, %ymm27
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX512-NEXT: vmovdqa64 %xmm1, %xmm24
+; AVX512-NEXT: vmovdqa 32(%rcx), %xmm1
+; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
+; AVX512-NEXT: vmovdqa 32(%rdx), %xmm4
+; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[1,1,2,2]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3],xmm2[4],xmm5[5,6],xmm2[7]
+; AVX512-NEXT: vmovdqa64 %ymm2, %ymm31
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
+; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512-NEXT: vpshufb %xmm5, %xmm0, %xmm1
+; AVX512-NEXT: vpshufb %xmm5, %xmm3, %xmm10
+; AVX512-NEXT: vmovdqa (%rdi), %xmm3
+; AVX512-NEXT: vmovdqa (%rsi), %xmm4
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; AVX512-NEXT: vpshufb %xmm5, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa64 %ymm0, %ymm25
+; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; AVX512-NEXT: vprold $16, %xmm4, %xmm4
+; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7]
+; AVX512-NEXT: vmovdqa64 %ymm3, %ymm22
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm6[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm29[0,1,1,3,4,5,5,7]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15]
+; AVX512-NEXT: vmovdqa64 %ymm3, %ymm21
+; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm29[2,2,2,2,6,6,6,6]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
+; AVX512-NEXT: vmovdqa64 %ymm3, %ymm29
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm7[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm13[1,1,1,1,5,5,5,5]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
+; AVX512-NEXT: vmovdqa64 %ymm3, %ymm20
+; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm13[2,2,2,2,6,6,6,6]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm7[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6,7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14,15]
+; AVX512-NEXT: vmovdqa64 %ymm3, %ymm19
+; AVX512-NEXT: vmovdqa %ymm9, %ymm5
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm9[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm17[0,0,2,1,4,4,6,5]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm9 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8,9,10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15]
+; AVX512-NEXT: vprold $16, %ymm5, %ymm3
+; AVX512-NEXT: vpshufd {{.*#+}} ymm13 = ymm17[1,2,2,3,5,6,6,7]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm13[0,1],ymm3[2],ymm13[3,4],ymm3[5],ymm13[6,7,8,9],ymm3[10],ymm13[11,12],ymm3[13],ymm13[14,15]
+; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm16[2,2,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,2,3,3,4,5,6,7]
+; AVX512-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1]
; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,4]
-; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,1,3]
-; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm5[0,0,1,1]
-; AVX512-NEXT: vpermq {{.*#+}} ymm28 = ymm28[2,2,2,3]
-; AVX512-NEXT: vpermq {{.*#+}} ymm29 = ymm29[0,2,2,3]
-; AVX512-NEXT: vpermq {{.*#+}} ymm30 = ymm30[0,2,2,3]
-; AVX512-NEXT: vpermq {{.*#+}} ymm31 = ymm31[2,1,3,3]
-; AVX512-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
-; AVX512-NEXT: # ymm6 = mem[2,1,3,2]
-; AVX512-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
-; AVX512-NEXT: # ymm5 = mem[2,2,2,3]
-; AVX512-NEXT: vpermq $232, (%rsp), %ymm1 # 32-byte Folded Reload
-; AVX512-NEXT: # ymm1 = mem[0,2,2,3]
-; AVX512-NEXT: vmovdqa64 %xmm18, %xmm0
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm12 = xmm0[0,1,3,2,4,5,6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,0,1,1]
-; AVX512-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
-; AVX512-NEXT: # ymm0 = mem[2,1,3,3]
-; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm14[0,0,1,1]
-; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm19[2,2,2,3]
-; AVX512-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm15 = mem[0,1,3,2,4,5,6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,1,3]
-; AVX512-NEXT: vinserti64x4 $1, %ymm29, %zmm28, %zmm28
-; AVX512-NEXT: vinserti64x4 $1, %ymm31, %zmm30, %zmm29
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm30 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm29 = zmm29 ^ (zmm30 & (zmm29 ^ zmm28))
-; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,1,3]
-; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm1, %zmm1
-; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm9
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm9 = zmm1 ^ (zmm30 & (zmm9 ^ zmm1))
-; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm0 # 32-byte Folded Reload
-; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm1 # 32-byte Folded Reload
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm2 & (zmm1 ^ zmm0))
-; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm0 # 32-byte Folded Reload
-; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = (zmm0 & mem) | zmm3
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
-; AVX512-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX512-NEXT: # ymm1 = mem[2,3,3,3,6,7,7,7]
-; AVX512-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
-; AVX512-NEXT: # ymm3 = mem[0,0,2,1]
-; AVX512-NEXT: vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm10 = mem[2,1,2,3,4,5,6,7]
+; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3]
+; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,0,1,1]
+; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm11[0,2,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,1,3,2,4,5,6,7]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,0,1,1]
+; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm8[2,1,3,3]
+; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
+; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm8 # 32-byte Folded Reload
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm1))
+; AVX512-NEXT: vmovdqa64 (%rax), %zmm2
+; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [11,0,0,11,0,0,0,12]
+; AVX512-NEXT: vpermd %zmm2, %zmm1, %zmm1
+; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm17 = [0,0,12,13,0,0,13,15]
+; AVX512-NEXT: vpermd %zmm2, %zmm17, %zmm17
+; AVX512-NEXT: vinserti64x4 $1, %ymm17, %zmm1, %zmm17
+; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512-NEXT: vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload
+; AVX512-NEXT: # zmm1 = zmm1[0,1,2,3],mem[0,1,2,3]
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm17 ^ (mem & (zmm1 ^ zmm17))
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm8))
+; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 32-byte Folded Reload
+; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512-NEXT: vinserti64x4 $1, %ymm18, %zmm11, %zmm17
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 ^ (zmm18 & (zmm17 ^ zmm8))
+; AVX512-NEXT: vinserti64x4 $1, %ymm13, %zmm26, %zmm13
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (mem & (zmm13 ^ zmm17))
+; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,13,0,0,0,14,0,0,14,0,0,0,15,0,0,15]
+; AVX512-NEXT: vpermd %zmm2, %zmm8, %zmm8
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm13))
+; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm13 # 32-byte Folded Reload
+; AVX512-NEXT: vpermq {{.*#+}} ymm14 = ymm10[0,0,1,1]
+; AVX512-NEXT: vpermq {{.*#+}} ymm16 = ymm23[2,2,2,3]
+; AVX512-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm11 = mem[0,1,3,2,4,5,6,7]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,1,3]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm17 = ymm30[2,3,3,3,6,7,7,7]
+; AVX512-NEXT: vpermq {{.*#+}} ymm23 = ymm27[0,0,2,1]
+; AVX512-NEXT: vmovdqa64 %xmm24, %xmm10
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[2,1,2,3,4,5,6,7]
; AVX512-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,5,4]
; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,3]
-; AVX512-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
-; AVX512-NEXT: # ymm12 = mem[0,0,1,1]
-; AVX512-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm14 = mem[0,2,3,3,4,5,6,7]
-; AVX512-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1]
-; AVX512-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload
-; AVX512-NEXT: # ymm17 = mem[2,2,2,3]
-; AVX512-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload
-; AVX512-NEXT: # ymm18 = mem[2,1,3,2]
-; AVX512-NEXT: vpermq $250, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload
-; AVX512-NEXT: # ymm19 = mem[2,2,3,3]
-; AVX512-NEXT: vpermq {{.*#+}} ymm21 = ymm21[0,0,1,1]
-; AVX512-NEXT: vpermq {{.*#+}} ymm28 = ymm24[0,0,2,1]
-; AVX512-NEXT: vpermq {{.*#+}} ymm30 = ymm23[2,1,3,2]
-; AVX512-NEXT: vpermq {{.*#+}} ymm31 = ymm22[2,2,2,3]
-; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3]
-; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3]
-; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3]
+; AVX512-NEXT: vpermq {{.*#+}} ymm24 = ymm31[0,0,1,1]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,2,3,3,4,5,6,7]
+; AVX512-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,2,1]
+; AVX512-NEXT: vpermq {{.*#+}} ymm25 = ymm25[0,0,1,1]
+; AVX512-NEXT: vpermq {{.*#+}} ymm26 = ymm22[0,0,2,1]
+; AVX512-NEXT: vpermq {{.*#+}} ymm27 = ymm21[2,1,3,2]
+; AVX512-NEXT: vpermq {{.*#+}} ymm29 = ymm29[2,2,2,3]
+; AVX512-NEXT: vpermq {{.*#+}} ymm30 = ymm20[2,2,2,3]
+; AVX512-NEXT: vpermq {{.*#+}} ymm31 = ymm19[0,2,2,3]
+; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3]
; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,2]
-; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm29))
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm27 = zmm27 ^ (mem & (zmm27 ^ zmm5))
-; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm15[0,0,0,1]
-; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4
-; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2]
+; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm13 ^ (zmm21 & (zmm0 ^ zmm13))
+; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 | (zmm3 & mem)
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm0))
+; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm4[0,1,1,3]
+; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm0
+; AVX512-NEXT: vinserti64x4 $1, %ymm14, %zmm5, %zmm3
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm0 ^ (zmm18 & (zmm3 ^ zmm0))
+; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm11[0,0,0,1]
+; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm16, %zmm0
+; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm17[2,1,3,2]
; AVX512-NEXT: vpbroadcastd 32(%rax), %ymm5
-; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm4))
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm9))
-; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm3, %zmm3
-; AVX512-NEXT: vinserti64x4 $1, %ymm14, %zmm12, %zmm4
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm3 ^ (zmm2 & (zmm4 ^ zmm3))
-; AVX512-NEXT: vpbroadcastd 36(%rax), %ymm2
-; AVX512-NEXT: vpbroadcastd 40(%rax), %ymm3
-; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
-; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm3))
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm4))
-; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512-NEXT: vinserti64x4 $1, %ymm17, %zmm3, %zmm3
-; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512-NEXT: vinserti64x4 $1, %ymm18, %zmm4, %zmm4
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm3))
-; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm19[0,1,2,3]
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 | (zmm3 & mem)
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (mem & (zmm20 ^ zmm4))
-; AVX512-NEXT: vinserti64x4 $1, %ymm28, %zmm21, %zmm3
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm26 ^ (mem & (zmm3 ^ zmm26))
-; AVX512-NEXT: vpbroadcastd (%rax), %ymm4
-; AVX512-NEXT: vpbroadcastd 4(%rax), %ymm5
; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm25))
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm0))
; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm3))
-; AVX512-NEXT: vinserti64x4 $1, %ymm31, %zmm30, %zmm3
-; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm5
+; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm23, %zmm0
+; AVX512-NEXT: vinserti64x4 $1, %ymm15, %zmm24, %zmm3
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm0 ^ (zmm21 & (zmm3 ^ zmm0))
+; AVX512-NEXT: vpbroadcastd 36(%rax), %ymm0
+; AVX512-NEXT: vpbroadcastd 40(%rax), %ymm5
+; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0
+; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm5))
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm3))
+; AVX512-NEXT: vinserti64x4 $1, %ymm26, %zmm25, %zmm3
+; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm5 ^ (mem & (zmm3 ^ zmm5))
+; AVX512-NEXT: vpbroadcastd (%rax), %ymm5
+; AVX512-NEXT: vpbroadcastd 4(%rax), %ymm10
+; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm5, %zmm5
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm28))
; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm3))
-; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm3
-; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0]
-; AVX512-NEXT: vpermd (%rax), %zmm6, %zmm6
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm3))
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm5))
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: vmovdqa64 %zmm6, 128(%rax)
-; AVX512-NEXT: vmovdqa64 %zmm4, (%rax)
-; AVX512-NEXT: vmovdqa64 %zmm20, 320(%rax)
-; AVX512-NEXT: vmovdqa64 %zmm2, 256(%rax)
-; AVX512-NEXT: vmovdqa64 %zmm1, 192(%rax)
-; AVX512-NEXT: vmovdqa64 %zmm0, 64(%rax)
-; AVX512-NEXT: vmovdqa64 %zmm27, 384(%rax)
-; AVX512-NEXT: addq $680, %rsp # imm = 0x2A8
+; AVX512-NEXT: vinserti64x4 $1, %ymm29, %zmm27, %zmm3
+; AVX512-NEXT: vinserti64x4 $1, %ymm31, %zmm30, %zmm10
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (mem & (zmm10 ^ zmm3))
+; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm9, %zmm3
+; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0]
+; AVX512-NEXT: vpermd %zmm2, %zmm7, %zmm2
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm3))
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm10))
+; AVX512-NEXT: vmovdqa64 %zmm2, 128(%rcx)
+; AVX512-NEXT: vmovdqa64 %zmm5, (%rcx)
+; AVX512-NEXT: vmovdqa64 %zmm1, 320(%rcx)
+; AVX512-NEXT: vmovdqa64 %zmm0, 256(%rcx)
+; AVX512-NEXT: vmovdqa64 %zmm4, 192(%rcx)
+; AVX512-NEXT: vmovdqa64 %zmm12, 64(%rcx)
+; AVX512-NEXT: vmovdqa64 %zmm8, 384(%rcx)
+; AVX512-NEXT: addq $632, %rsp # imm = 0x278
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: store_i16_stride7_vf32:
; AVX512-FCP: # %bb.0:
-; AVX512-FCP-NEXT: subq $264, %rsp # imm = 0x108
-; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm12
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
-; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm12, %ymm2
-; AVX512-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm4
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm3
-; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm16
-; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: subq $184, %rsp
+; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm8
+; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm11
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm11, %ymm2
+; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm12
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19]
+; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm12, %ymm3
; AVX512-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2
-; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm11
-; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm10
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm11, %ymm2
-; AVX512-FCP-NEXT: vmovdqu %ymm11, (%rsp) # 32-byte Spill
-; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm15
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19]
-; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm15, %ymm5
-; AVX512-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpor %ymm2, %ymm5, %ymm2
-; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm5
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
-; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm8
-; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm2
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm9
-; AVX512-FCP-NEXT: vpor %ymm8, %ymm9, %ymm8
+; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %ymm14
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
+; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm14, %ymm2
+; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm6
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm6, %ymm3
+; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm23
+; AVX512-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2
+; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa 32(%r9), %ymm3
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
+; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm6
+; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm30
+; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm9
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm9, %ymm7
+; AVX512-FCP-NEXT: vmovdqa64 %ymm9, %ymm22
+; AVX512-FCP-NEXT: vporq %ymm6, %ymm7, %ymm25
+; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm7
+; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm7, %ymm4
+; AVX512-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm6
+; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm6, %ymm5
+; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm21
+; AVX512-FCP-NEXT: vpor %ymm4, %ymm5, %ymm4
+; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm8, %ymm0
; AVX512-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm10, %ymm3
-; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm9
-; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm4
-; AVX512-FCP-NEXT: vpor %ymm3, %ymm4, %ymm3
-; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %ymm13
-; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm13, %ymm0
-; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm14
-; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm14, %ymm1
+; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm4
+; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm1
+; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm24
; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0
-; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa 32(%r9), %ymm1
-; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm0
-; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm19
-; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm3
-; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm3, %ymm1
-; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm25
-; AVX512-FCP-NEXT: vporq %ymm0, %ymm1, %ymm20
+; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm15
+; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm15, %ymm0
+; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm4
+; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm4, %ymm1
+; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0
+; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
-; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm10, %ymm1
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5],ymm1[6],ymm3[7,8,9,10],ymm1[11],ymm3[12,13],ymm1[14],ymm3[15]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[2,2,2,2,6,6,6,6]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14,15]
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,2,2,3,10,0,11,0]
-; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm3, %zmm24
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm16[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6,7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13,14,15]
-; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %xmm4
-; AVX512-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm3
-; AVX512-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5]
-; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,2,2,3,8,0,9,0]
-; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm23
-; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm11, %ymm0
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm15[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
+; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm11, %ymm1
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm11[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm12[2,2,2,2,6,6,6,6]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15]
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,2,2,3,10,0,11,0]
+; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm17
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15]
+; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %xmm7
+; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm2
+; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5]
+; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm27
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,2,2,3,8,0,9,0]
+; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm18
+; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm8, %ymm0
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm24[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm1
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm8
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3]
-; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm3
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [2,1,3,3,8,8,9,9]
-; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm21
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm2[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7,8,9],ymm3[10],ymm0[11,12],ymm3[13],ymm0[14,15]
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [2,2,2,3,0,8,8,9]
-; AVX512-FCP-NEXT: vmovdqa 32(%r9), %xmm3
-; AVX512-FCP-NEXT: vmovdqa 32(%r8), %xmm7
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
-; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm15
-; AVX512-FCP-NEXT: vpermi2q %zmm15, %zmm11, %zmm22
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7]
-; AVX512-FCP-NEXT: vprold $16, %xmm8, %xmm8
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm0
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm3
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [2,1,3,3,8,8,9,9]
+; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm19
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15]
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [2,2,2,3,0,8,8,9]
+; AVX512-FCP-NEXT: vmovdqa 32(%r9), %xmm2
+; AVX512-FCP-NEXT: vmovdqa 32(%r8), %xmm5
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm13
+; AVX512-FCP-NEXT: vpermi2q %zmm13, %zmm10, %zmm20
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX512-FCP-NEXT: vprold $16, %xmm0, %xmm0
; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm8[2],xmm1[3,4],xmm8[5],xmm1[6,7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
-; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm11, %xmm8
-; AVX512-FCP-NEXT: vmovdqa %xmm15, %xmm11
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
+; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm10, %xmm1
; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,0,1,8,8,9,0]
-; AVX512-FCP-NEXT: vpermi2q %zmm8, %zmm1, %zmm26
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
-; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm1
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
-; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,1,1,0,8,8,9,9]
-; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm28
+; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm26
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
+; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm0
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
+; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm1
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,1,1,0,8,8,9,9]
+; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm29
; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm1
; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm0
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[1,1,2,2]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3],xmm7[4],xmm8[5,6],xmm7[7]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm8, %xmm6
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,2]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6],xmm2[7]
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm8
+; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm3
; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,1,0,8,8,9,9]
-; AVX512-FCP-NEXT: vpermi2q %zmm7, %zmm6, %zmm27
-; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm6
-; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm15
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3]
-; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm3
-; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm7, %xmm4
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,0,0,1,8,9,9,0]
-; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm29
-; AVX512-FCP-NEXT: vprold $16, %ymm5, %ymm3
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm2[1,2,2,3,5,6,6,7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15]
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
-; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm5, %ymm5
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5],ymm2[6],ymm5[7,8,9,10],ymm2[11],ymm5[12,13],ymm2[14],ymm5[15]
+; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm27
+; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm3
+; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm2
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm10, %xmm5
+; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm10, %xmm6
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,0,0,1,8,9,9,0]
+; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm28
+; AVX512-FCP-NEXT: vprold $16, %ymm15, %ymm5
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm4[1,2,2,3,5,6,6,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7,8,9],ymm5[10],ymm6[11,12],ymm5[13],ymm6[14,15]
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
+; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm15, %ymm10
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm10[0,1,2],ymm4[3],ymm10[4,5],ymm4[6],ymm10[7,8,9,10],ymm4[11],ymm10[12,13],ymm4[14],ymm10[15]
; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [2,2,3,3,10,0,11,10]
-; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm31
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm18
-; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm5
-; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm3
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
-; AVX512-FCP-NEXT: vpshufb %xmm11, %xmm0, %xmm0
-; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm17
+; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm31
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm12[1,1,1,1,5,5,5,5]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm16
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm14[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm23[0,1,1,3,4,5,5,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm12
+; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm12, %ymm4
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm22[0,0,2,1,4,4,6,5]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8,9,10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15]
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,1,0,3,10,10,11,11]
+; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm25, %zmm30
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm4
+; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm1
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm10
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT: vpbroadcastd 8(%rax), %ymm0
; AVX512-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX512-FCP-NEXT: vmovdqa (%rax), %ymm8
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
-; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm8, %ymm1
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm30
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm15[4],xmm6[4],xmm15[5],xmm6[5],xmm15[6],xmm6[6],xmm15[7],xmm6[7]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm9[1,1,1,1,5,5,5,5]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm16
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[0,1,1,3,4,5,5,7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm12
-; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm12, %ymm0
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm25[0,0,2,1,4,4,6,5]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8,9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,0,3,10,10,11,11]
-; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm20, %zmm9
-; AVX512-FCP-NEXT: vmovdqa 32(%rax), %ymm2
-; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm0
-; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,5,4,0,0,6,5,0,0,5,4,0,0,6,5,0]
-; AVX512-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vpermd %ymm2, %ymm19, %ymm4
-; AVX512-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm13[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm14[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm0[1],ymm7[2,3],ymm0[4],ymm7[5,6,7,8],ymm0[9],ymm7[10,11],ymm0[12],ymm7[13,14,15]
-; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
-; AVX512-FCP-NEXT: # ymm10 = mem[0,1,0,1]
-; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm13, %ymm0
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm14[2,2,2,2,6,6,6,6]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4],ymm15[5],ymm0[6,7,8,9],ymm15[10],ymm0[11,12],ymm15[13],ymm0[14,15]
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [2,2,2,3,8,10,10,11]
-; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm20, %zmm0
-; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm1
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm25[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm12[2],ymm7[3,4],ymm12[5],ymm7[6,7,8,9],ymm12[10],ymm7[11,12],ymm12[13],ymm7[14,15]
-; AVX512-FCP-NEXT: vprold $16, %ymm1, %ymm12
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm25[1,2,2,3,5,6,6,7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0,1],ymm12[2],ymm15[3,4],ymm12[5],ymm15[6,7,8,9],ymm12[10],ymm15[11,12],ymm12[13],ymm15[14,15]
-; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm1[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
-; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[1,1,2,2]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0],xmm12[1],xmm14[2,3],xmm12[4],xmm14[5,6],xmm12[7]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7]
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,1,1,8,8,10,9]
-; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm12
-; AVX512-FCP-NEXT: vprold $16, %xmm3, %xmm1
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm5[1,1,2,3]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0,1],xmm1[2],xmm13[3,4],xmm1[5],xmm13[6,7]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm6, %xmm5
-; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm3, %xmm3
-; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm3
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [2,1,3,2,10,10,10,11]
-; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm15
-; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm13, %ymm6
-; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm10[2,2,2,2,6,6,6,6]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,1,1,3,4,5,5,7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm10[0,1],ymm7[2],ymm10[3,4],ymm7[5],ymm10[6,7,8,9],ymm7[10],ymm10[11,12],ymm7[13],ymm10[14,15]
-; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm7
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm24 ^ (zmm1 & (zmm0 ^ zmm24))
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (mem & (zmm15 ^ zmm0))
-; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [6,5,0,0,7,6,0,7,6,5,0,0,7,6,0,7]
-; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vpermd %zmm2, %zmm0, %zmm2
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm15))
-; AVX512-FCP-NEXT: vpermd %ymm8, %ymm0, %ymm0
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm26 ^ (zmm6 & (zmm12 ^ zmm26))
-; AVX512-FCP-NEXT: vpbroadcastd 36(%rax), %ymm8
-; AVX512-FCP-NEXT: vpbroadcastd 40(%rax), %ymm10
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm28))
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm12))
-; AVX512-FCP-NEXT: vmovdqa64 %xmm18, %xmm10
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,2,3,3,4,5,6,7]
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1]
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm17[0,0,1,3]
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1]
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm16[2,2,2,3]
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm27 = zmm27 ^ (mem & (zmm27 ^ zmm3))
-; AVX512-FCP-NEXT: vmovdqu (%rsp), %ymm15 # 32-byte Reload
+; AVX512-FCP-NEXT: vmovdqa (%rax), %ymm5
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[12,13,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm5[14,15,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm5[16,17,u,u]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm25
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm14[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm23[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6,7,8],ymm0[9],ymm3[10,11],ymm0[12],ymm3[13,14,15]
+; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
+; AVX512-FCP-NEXT: # ymm9 = mem[0,1,0,1]
+; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm14, %ymm0
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm23[2,2,2,2,6,6,6,6]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7,8,9],ymm3[10],ymm0[11,12],ymm3[13],ymm0[14,15]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
+; AVX512-FCP-NEXT: vmovdqa %ymm12, %ymm14
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm22[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm12[2],ymm3[3,4],ymm12[5],ymm3[6,7,8,9],ymm12[10],ymm3[11,12],ymm12[13],ymm3[14,15]
+; AVX512-FCP-NEXT: vprold $16, %ymm14, %ymm3
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm22[1,2,2,3,5,6,6,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm3[2],ymm12[3,4],ymm3[5],ymm12[6,7,8,9],ymm3[10],ymm12[11,12],ymm3[13],ymm12[14,15]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
+; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm14[4],xmm7[4],xmm14[5],xmm7[5],xmm14[6],xmm7[6],xmm14[7],xmm7[7]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[1,1,2,2]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0],xmm3[1],xmm15[2,3],xmm3[4],xmm15[5,6],xmm3[7]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,2,3,3,4,5,6,7]
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,1,1,8,8,10,9]
+; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm15, %zmm3
+; AVX512-FCP-NEXT: vprold $16, %xmm1, %xmm7
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm14 = xmm4[1,1,2,3]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1],xmm7[2],xmm14[3,4],xmm7[5],xmm14[6,7]
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm1
+; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm4, %xmm7
+; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm15, %zmm7
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [2,1,3,2,10,10,10,11]
+; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm12
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
+; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm14, %ymm2
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm21[2,2,2,2,6,6,6,6]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3,4],ymm6[5],ymm2[6,7,8,9],ymm6[10],ymm2[11,12],ymm6[13],ymm2[14,15]
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm14[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm21[0,1,1,3,4,5,5,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm6[2],ymm9[3,4],ymm6[5],ymm9[6,7,8,9],ymm6[10],ymm9[11,12],ymm6[13],ymm9[14,15]
+; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm9
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [2,2,2,3,8,10,10,11]
+; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm0
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm17 ^ (zmm4 & (zmm0 ^ zmm17))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm0))
+; AVX512-FCP-NEXT: vmovdqa64 (%rax), %zmm0
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [12,13,10,10,14,14,14,14]
+; AVX512-FCP-NEXT: vpermd %zmm0, %zmm6, %zmm6
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [14,15,11,11,15,15,14,15]
+; AVX512-FCP-NEXT: vpermd %zmm0, %zmm8, %zmm8
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm12))
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [6,0,0,0,7,0,0,7]
+; AVX512-FCP-NEXT: vpermd %ymm5, %ymm8, %ymm5
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm26 ^ (zmm8 & (zmm3 ^ zmm26))
+; AVX512-FCP-NEXT: vpbroadcastd 36(%rax), %ymm12
+; AVX512-FCP-NEXT: vpbroadcastd 40(%rax), %ymm14
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm12, %zmm12
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm29))
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm16[2,2,2,3]
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,3,2]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,2,3,3,4,5,6,7]
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1]
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,3]
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm3))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm27 = zmm27 ^ (mem & (zmm27 ^ zmm7))
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm15[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm16[2,2,2,2,6,6,6,6]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm3[1],ymm14[2,3],ymm3[4],ymm14[5,6,7,8],ymm3[9],ymm14[10,11],ymm3[12],ymm14[13,14,15]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm16[1,1,1,1,5,5,5,5]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7,8,9],ymm15[10],ymm14[11,12],ymm15[13],ymm14[14,15]
-; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm20, %zmm14
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (mem & (zmm14 ^ zmm7))
-; AVX512-FCP-NEXT: vpermd (%rax), %zmm19, %zmm3
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm31))
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm14))
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm24[2,2,2,2,6,6,6,6]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm3[1],ymm7[2,3],ymm3[4],ymm7[5,6,7,8],ymm3[9],ymm7[10,11],ymm3[12],ymm7[13,14,15]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm24[1,1,1,1,5,5,5,5]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm15[2],ymm7[3,4],ymm15[5],ymm7[6,7,8,9],ymm15[10],ymm7[11,12],ymm15[13],ymm7[14,15]
+; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm7
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm9))
+; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,4,0,0,6,5,0,0,5,4,0,0,6,5,0]
+; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vpermd %zmm0, %zmm2, %zmm2
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm31))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm7))
+; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm3 # 32-byte Folded Reload
; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm7 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm10 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm7 ^ (zmm6 & (zmm10 ^ zmm7))
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = zmm23 ^ (zmm1 & (zmm21 ^ zmm23))
-; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm1 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = (zmm1 & mem) | zmm30
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm10))
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm5, %zmm5
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm6, %zmm6
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm5))
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm9 & mem)
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm6))
-; AVX512-FCP-NEXT: vpbroadcastd 32(%rax), %ymm5
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm22))
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm21))
-; AVX512-FCP-NEXT: vpbroadcastd (%rax), %ymm5
-; AVX512-FCP-NEXT: vpbroadcastd 4(%rax), %ymm6
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm29))
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm27))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm3 ^ (zmm8 & (zmm7 ^ zmm3))
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm3, %zmm3
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm8, %zmm8
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm3))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm19 = zmm18 ^ (zmm4 & (zmm19 ^ zmm18))
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [11,0,0,11,0,0,0,12]
+; AVX512-FCP-NEXT: vpermd %zmm0, %zmm3, %zmm3
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [8,9,12,13,12,13,13,15]
+; AVX512-FCP-NEXT: vpermd %zmm0, %zmm4, %zmm0
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm30))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm8))
+; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm25 = zmm25 | (zmm1 & mem)
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm25 = zmm25 ^ (mem & (zmm25 ^ zmm7))
+; AVX512-FCP-NEXT: vpbroadcastd 32(%rax), %ymm1
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm20))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm19))
+; AVX512-FCP-NEXT: vpbroadcastd (%rax), %ymm3
+; AVX512-FCP-NEXT: vpbroadcastd 4(%rax), %ymm4
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm28))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm27))
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 128(%rax)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rax)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 320(%rax)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 256(%rax)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 192(%rax)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 384(%rax)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax)
-; AVX512-FCP-NEXT: addq $264, %rsp # imm = 0x108
+; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 128(%rax)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm3, (%rax)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 320(%rax)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 256(%rax)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 192(%rax)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 384(%rax)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm25, 64(%rax)
+; AVX512-FCP-NEXT: addq $184, %rsp
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: store_i16_stride7_vf32:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: subq $680, %rsp # imm = 0x2A8
-; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm1
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
-; AVX512DQ-NEXT: vpshufb %ymm13, %ymm1, %ymm0
-; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm27
-; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm8
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpshufb %ymm14, %ymm8, %ymm1
-; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0
-; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm9
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX512DQ-NEXT: vpshufb %ymm12, %ymm9, %ymm0
-; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm11
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm15 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19]
-; AVX512DQ-NEXT: vpshufb %ymm15, %ymm11, %ymm1
-; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0
-; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vmovdqa (%r9), %ymm1
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
-; AVX512DQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2
-; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm16
-; AVX512DQ-NEXT: vmovdqa (%r8), %ymm4
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
-; AVX512DQ-NEXT: vpshufb %ymm1, %ymm4, %ymm3
-; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm17
-; AVX512DQ-NEXT: vpor %ymm2, %ymm3, %ymm2
-; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vmovdqa 32(%r9), %xmm2
-; AVX512DQ-NEXT: vmovdqa 32(%r8), %xmm10
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3]
-; AVX512DQ-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,6]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,18,19,0,19,19,0,0,0,1,0,1,2,0,0,3]
-; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm4
-; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm2
-; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm10
-; AVX512DQ-NEXT: vpshufb %ymm12, %ymm10, %ymm3
-; AVX512DQ-NEXT: vpshufb %ymm15, %ymm2, %ymm4
-; AVX512DQ-NEXT: vpor %ymm3, %ymm4, %ymm3
-; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vmovdqa 32(%rcx), %ymm12
-; AVX512DQ-NEXT: vmovdqa 32(%rdx), %ymm15
-; AVX512DQ-NEXT: vpshufb %ymm13, %ymm12, %ymm3
-; AVX512DQ-NEXT: vpshufb %ymm14, %ymm15, %ymm4
-; AVX512DQ-NEXT: vpor %ymm3, %ymm4, %ymm3
-; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vmovdqa 32(%r9), %ymm13
-; AVX512DQ-NEXT: vmovdqa 32(%r8), %ymm14
-; AVX512DQ-NEXT: vpshufb %ymm0, %ymm13, %ymm0
-; AVX512DQ-NEXT: vpshufb %ymm1, %ymm14, %ymm1
-; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0
+; AVX512DQ-NEXT: subq $632, %rsp # imm = 0x278
+; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm6
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512DQ-NEXT: vpshufb %ymm8, %ymm6, %ymm0
+; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm7
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19]
+; AVX512DQ-NEXT: vpshufb %ymm1, %ymm7, %ymm2
+; AVX512DQ-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm0
-; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm3
-; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm0, %xmm4
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,2,2]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3],xmm4[4],xmm5[5,6],xmm4[7]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,2,4,5,6,7]
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,16,0,0,17,17,0,0,0,0,0,1,2,0,0,3]
-; AVX512DQ-NEXT: vpermi2d %zmm5, %zmm4, %zmm26
-; AVX512DQ-NEXT: vmovdqa (%r9), %xmm4
-; AVX512DQ-NEXT: vmovdqa (%r8), %xmm5
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm6[0,1,2,3,4,5,7,6]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,2,4,5,6,7]
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,0,0,1,0,1,1,0,0,18,19,0,19,19,0,0]
-; AVX512DQ-NEXT: vpermi2d %zmm7, %zmm6, %zmm25
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-NEXT: vpbroadcastd 8(%rax), %ymm0
-; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX512DQ-NEXT: vmovdqa (%rax), %ymm3
-; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
-; AVX512DQ-NEXT: vpshufb %ymm7, %ymm3, %ymm6
-; AVX512DQ-NEXT: vmovdqa64 %ymm7, %ymm21
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0
+; AVX512DQ-NEXT: vmovdqa 32(%rcx), %ymm4
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
+; AVX512DQ-NEXT: vpshufb %ymm3, %ymm4, %ymm0
+; AVX512DQ-NEXT: vmovdqa 32(%rdx), %ymm9
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpshufb %ymm5, %ymm9, %ymm2
+; AVX512DQ-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15]
-; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm28
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm12[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[3,3,3,3,7,7,7,7]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15]
-; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm29
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm10[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15]
-; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm30
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm10[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[3,3,3,3,7,7,7,7]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8,9,10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15]
-; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm31
-; AVX512DQ-NEXT: vprold $16, %ymm13, %ymm4
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm14[1,2,2,3,5,6,6,7]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15]
+; AVX512DQ-NEXT: vmovdqa 32(%r9), %ymm14
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
+; AVX512DQ-NEXT: vpshufb %ymm2, %ymm14, %ymm12
+; AVX512DQ-NEXT: vmovdqa 32(%r8), %ymm10
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
+; AVX512DQ-NEXT: vpshufb %ymm0, %ymm10, %ymm13
+; AVX512DQ-NEXT: vpor %ymm12, %ymm13, %ymm12
+; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm15
+; AVX512DQ-NEXT: vpshufb %ymm3, %ymm15, %ymm3
+; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm12
+; AVX512DQ-NEXT: vpshufb %ymm5, %ymm12, %ymm5
+; AVX512DQ-NEXT: vmovdqa64 %ymm12, %ymm29
+; AVX512DQ-NEXT: vpor %ymm3, %ymm5, %ymm3
; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm13[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,3,6,6,6,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm14[3,3,3,3,7,7,7,7]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15]
+; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm3
; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vmovdqa64 %ymm27, %ymm3
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm3[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512DQ-NEXT: vpshufb %ymm8, %ymm3, %ymm3
+; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm13
+; AVX512DQ-NEXT: vpshufb %ymm1, %ymm13, %ymm1
+; AVX512DQ-NEXT: vpor %ymm3, %ymm1, %ymm1
+; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vmovdqa (%r9), %ymm3
+; AVX512DQ-NEXT: vpshufb %ymm2, %ymm3, %ymm1
+; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm27
+; AVX512DQ-NEXT: vmovdqa (%r8), %ymm2
+; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm0
+; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm17
+; AVX512DQ-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vmovdqa 32(%r9), %xmm2
+; AVX512DQ-NEXT: vmovdqa 32(%r8), %xmm0
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,6]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,18,19,0,19,19,0,0,0,1,0,1,2,0,0,3]
+; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm1
+; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm0
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,2]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6],xmm2[7]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,2,4,5,6,7]
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,16,0,0,17,17,0,0,0,0,0,1,2,0,0,3]
+; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm5
+; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vmovdqa (%r9), %xmm2
+; AVX512DQ-NEXT: vmovdqa (%r8), %xmm3
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm8 = xmm5[0,1,2,3,4,5,7,6]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,2,4,5,6,7]
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,0,1,0,1,1,0,0,18,19,0,19,19,0,0]
+; AVX512DQ-NEXT: vpermi2d %zmm8, %zmm5, %zmm28
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm5 = ymm6[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm8 = ymm7[1,1,1,1,5,5,5,5]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm8[2],ymm5[3,4],ymm8[5],ymm5[6,7,8,9],ymm8[10],ymm5[11,12],ymm8[13],ymm5[14,15]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
+; AVX512DQ-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm5 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm8 = ymm9[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0,1],ymm5[2],ymm8[3,4],ymm5[5],ymm8[6,7,8,9],ymm5[10],ymm8[11,12],ymm5[13],ymm8[14,15]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,2]
+; AVX512DQ-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm5 = ymm14[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,0,0,0,4,4,4,4]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm8 = ymm10[0,0,2,1,4,4,6,5]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm8[3],ymm5[4,5],ymm8[6],ymm5[7,8,9,10],ymm8[11],ymm5[12,13],ymm8[14],ymm5[15]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3]
+; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm8 = ymm9[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm8[2],ymm5[3,4],ymm8[5],ymm5[6,7,8,9],ymm8[10],ymm5[11,12],ymm8[13],ymm5[14,15]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3]
+; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm9[3,3,3,3,7,7,7,7]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm8[3,3,3,3,7,7,7,7]
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15]
-; AVX512DQ-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm11[3,3,3,3,7,7,7,7]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8,9,10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vmovdqa64 %ymm16, %ymm4
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,3,6,6,6,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm17[3,3,3,3,7,7,7,7]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15]
-; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm19
-; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm4
-; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm5
-; AVX512DQ-NEXT: vprold $16, %xmm5, %xmm6
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[1,1,2,3]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2],xmm7[3,4],xmm6[5],xmm7[6,7]
-; AVX512DQ-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; AVX512DQ-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT: vmovdqa 32(%rcx), %xmm4
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm4, %xmm1
-; AVX512DQ-NEXT: vmovdqa 32(%rdx), %xmm5
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,2,2]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm7[0],xmm1[1],xmm7[2,3],xmm1[4],xmm7[5,6],xmm1[7]
-; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
-; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm18
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[1,1,1,1,5,5,5,5]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm10[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15]
-; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm15[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm12[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm6[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
+; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm7[3,3,3,3,7,7,7,7]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm5 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8,9,10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15]
+; AVX512DQ-NEXT: vprold $16, %ymm14, %ymm5
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm10[1,2,2,3,5,6,6,7]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7,8,9],ymm5[10],ymm6[11,12],ymm5[13],ymm6[14,15]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm10[3,3,3,3,7,7,7,7]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm7 = ymm14[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,2,2,3,6,6,6,7]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm10 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7,8,9],ymm6[10],ymm7[11,12],ymm6[13],ymm7[14,15]
+; AVX512DQ-NEXT: vmovdqa64 %ymm10, %ymm16
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; AVX512DQ-NEXT: vpbroadcastd 8(%rax), %ymm1
+; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512DQ-NEXT: vmovdqa (%rax), %ymm3
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm3[12,13,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[14,15,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[16,17,u,u]
+; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm30
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm12
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm18 = ymm4[2,1,3,3]
+; AVX512DQ-NEXT: vmovdqa %ymm15, %ymm6
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm15[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm29[3,3,3,3,7,7,7,7]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm26 = ymm5[2,1,3,2]
+; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm13[3,3,3,3,7,7,7,7]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
+; AVX512DQ-NEXT: vmovdqa64 %ymm27, %ymm9
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,3,6,6,6,7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm17[3,3,3,3,7,7,7,7]
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
-; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[0,0,2,1,4,4,6,5]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm13[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,0,0,4,4,4,4]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
-; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,5,0,0,0,6,0,0,6,0,0,0,7,0,0,7]
-; AVX512DQ-NEXT: vmovdqa 32(%rax), %ymm2
-; AVX512DQ-NEXT: vpermd %zmm2, %zmm1, %zmm27
-; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm1
-; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm1
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm20
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512DQ-NEXT: vpshufb %xmm4, %xmm0, %xmm5
-; AVX512DQ-NEXT: vpshufb %xmm4, %xmm6, %xmm14
-; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1
-; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm2
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX512DQ-NEXT: vpshufb %xmm4, %xmm0, %xmm0
-; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm21
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; AVX512DQ-NEXT: vprold $16, %xmm2, %xmm2
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7]
-; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm24
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm1 = ymm3[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15]
; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm23
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
-; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm22
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm1 = ymm9[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[1,1,1,1,5,5,5,5]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm13 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm9[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6,7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13,14,15]
-; AVX512DQ-NEXT: vmovdqa64 %ymm16, %ymm4
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm1 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm17[0,0,2,1,4,4,6,5]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm1[0,1,2],ymm2[3],ymm1[4,5],ymm2[6],ymm1[7,8,9,10],ymm2[11],ymm1[12,13],ymm2[14],ymm1[15]
-; AVX512DQ-NEXT: vprold $16, %ymm16, %ymm1
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm17[1,2,2,3,5,6,6,7]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm10[0,1],ymm1[2],ymm10[3,4],ymm1[5],ymm10[6,7,8,9],ymm1[10],ymm10[11,12],ymm1[13],ymm10[14,15]
-; AVX512DQ-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
-; AVX512DQ-NEXT: # xmm10 = mem[0,2,3,3,4,5,6,7]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1]
+; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm1
+; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm2
+; AVX512DQ-NEXT: vprold $16, %xmm2, %xmm3
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,2,3]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
+; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm27
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm24
+; AVX512DQ-NEXT: vmovdqa 32(%rcx), %xmm1
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
+; AVX512DQ-NEXT: vmovdqa 32(%rdx), %xmm4
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[1,1,2,2]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm2[1],xmm5[2,3],xmm2[4],xmm5[5,6],xmm2[7]
+; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm31
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512DQ-NEXT: vpshufb %xmm5, %xmm0, %xmm1
+; AVX512DQ-NEXT: vpshufb %xmm5, %xmm3, %xmm10
+; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm3
+; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm4
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; AVX512DQ-NEXT: vpshufb %xmm5, %xmm0, %xmm0
+; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm25
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; AVX512DQ-NEXT: vprold $16, %xmm4, %xmm4
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7]
+; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm22
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm6[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm29[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15]
+; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm21
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm29[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
+; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm29
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm7[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm13[1,1,1,1,5,5,5,5]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
+; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm20
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm13[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm7[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6,7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13,14,15]
+; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm19
+; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm5
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm9[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm17[0,0,2,1,4,4,6,5]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm9 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8,9,10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15]
+; AVX512DQ-NEXT: vprold $16, %ymm5, %ymm3
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm13 = ymm17[1,2,2,3,5,6,6,7]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm13[0,1],ymm3[2],ymm13[3,4],ymm3[5],ymm13[6,7,8,9],ymm3[10],ymm13[11,12],ymm3[13],ymm13[14,15]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm13 = ymm16[2,2,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,2,3,3,4,5,6,7]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,4]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,1,3]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm5[0,0,1,1]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm28 = ymm28[2,2,2,3]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm29 = ymm29[0,2,2,3]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm30 = ymm30[0,2,2,3]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm31 = ymm31[2,1,3,3]
-; AVX512DQ-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
-; AVX512DQ-NEXT: # ymm6 = mem[2,1,3,2]
-; AVX512DQ-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
-; AVX512DQ-NEXT: # ymm5 = mem[2,2,2,3]
-; AVX512DQ-NEXT: vpermq $232, (%rsp), %ymm1 # 32-byte Folded Reload
-; AVX512DQ-NEXT: # ymm1 = mem[0,2,2,3]
-; AVX512DQ-NEXT: vmovdqa64 %xmm18, %xmm0
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm12 = xmm0[0,1,3,2,4,5,6,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,0,1,1]
-; AVX512DQ-NEXT: vpermq $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
-; AVX512DQ-NEXT: # ymm0 = mem[2,1,3,3]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm14[0,0,1,1]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm19[2,2,2,3]
-; AVX512DQ-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
-; AVX512DQ-NEXT: # xmm15 = mem[0,1,3,2,4,5,6,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,1,3]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm29, %zmm28, %zmm28
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm31, %zmm30, %zmm29
-; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm30 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm29 = zmm29 ^ (zmm30 & (zmm29 ^ zmm28))
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,1,3]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm1, %zmm1
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm9
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = zmm1 ^ (zmm30 & (zmm9 ^ zmm1))
-; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm0 # 32-byte Folded Reload
-; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm1 # 32-byte Folded Reload
-; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm2 & (zmm1 ^ zmm0))
-; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm0 # 32-byte Folded Reload
-; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = (zmm0 & mem) | zmm3
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
-; AVX512DQ-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX512DQ-NEXT: # ymm1 = mem[2,3,3,3,6,7,7,7]
-; AVX512DQ-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
-; AVX512DQ-NEXT: # ymm3 = mem[0,0,2,1]
-; AVX512DQ-NEXT: vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
-; AVX512DQ-NEXT: # xmm10 = mem[2,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,0,1,1]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm11[0,2,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,1,3,2,4,5,6,7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,0,1,1]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm8[2,1,3,3]
+; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
+; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
+; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm8 # 32-byte Folded Reload
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm1))
+; AVX512DQ-NEXT: vmovdqa64 (%rax), %zmm2
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm1 = [11,0,0,11,0,0,0,12]
+; AVX512DQ-NEXT: vpermd %zmm2, %zmm1, %zmm1
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm17 = [0,0,12,13,0,0,13,15]
+; AVX512DQ-NEXT: vpermd %zmm2, %zmm17, %zmm17
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm17, %zmm1, %zmm17
+; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-NEXT: vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload
+; AVX512DQ-NEXT: # zmm1 = zmm1[0,1,2,3],mem[0,1,2,3]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm17 ^ (mem & (zmm1 ^ zmm17))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm8))
+; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 32-byte Folded Reload
+; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm18, %zmm11, %zmm17
+; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 ^ (zmm18 & (zmm17 ^ zmm8))
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm13, %zmm26, %zmm13
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (mem & (zmm13 ^ zmm17))
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,13,0,0,0,14,0,0,14,0,0,0,15,0,0,15]
+; AVX512DQ-NEXT: vpermd %zmm2, %zmm8, %zmm8
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm13))
+; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm13 # 32-byte Folded Reload
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm14 = ymm10[0,0,1,1]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm16 = ymm23[2,2,2,3]
+; AVX512DQ-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
+; AVX512DQ-NEXT: # xmm11 = mem[0,1,3,2,4,5,6,7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,1,1,3]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm17 = ymm30[2,3,3,3,6,7,7,7]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm23 = ymm27[0,0,2,1]
+; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm10
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[2,1,2,3,4,5,6,7]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,5,4]
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,3]
-; AVX512DQ-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
-; AVX512DQ-NEXT: # ymm12 = mem[0,0,1,1]
-; AVX512DQ-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
-; AVX512DQ-NEXT: # xmm14 = mem[0,2,3,3,4,5,6,7]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,2,1]
-; AVX512DQ-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload
-; AVX512DQ-NEXT: # ymm17 = mem[2,2,2,3]
-; AVX512DQ-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload
-; AVX512DQ-NEXT: # ymm18 = mem[2,1,3,2]
-; AVX512DQ-NEXT: vpermq $250, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload
-; AVX512DQ-NEXT: # ymm19 = mem[2,2,3,3]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm21 = ymm21[0,0,1,1]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm28 = ymm24[0,0,2,1]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm30 = ymm23[2,1,3,2]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm31 = ymm22[2,2,2,3]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,3]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm24 = ymm31[0,0,1,1]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,2,3,3,4,5,6,7]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,2,1]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm25 = ymm25[0,0,1,1]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm26 = ymm22[0,0,2,1]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm27 = ymm21[2,1,3,2]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm29 = ymm29[2,2,2,3]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm30 = ymm20[2,2,2,3]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm31 = ymm19[0,2,2,3]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3]
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,2]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm29))
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm27 = zmm27 ^ (mem & (zmm27 ^ zmm5))
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm15[0,0,0,1]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2]
+; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
+; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm21 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm13 ^ (zmm21 & (zmm0 ^ zmm13))
+; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload
+; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 | (zmm3 & mem)
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm0))
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm4[0,1,1,3]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm0
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm14, %zmm5, %zmm3
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm0 ^ (zmm18 & (zmm3 ^ zmm0))
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm11[0,0,0,1]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm16, %zmm0
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm17[2,1,3,2]
; AVX512DQ-NEXT: vpbroadcastd 32(%rax), %ymm5
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm4))
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm9))
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm3, %zmm3
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm14, %zmm12, %zmm4
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm3 ^ (zmm2 & (zmm4 ^ zmm3))
-; AVX512DQ-NEXT: vpbroadcastd 36(%rax), %ymm2
-; AVX512DQ-NEXT: vpbroadcastd 40(%rax), %ymm3
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
-; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm3))
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm4))
-; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm17, %zmm3, %zmm3
-; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm18, %zmm4, %zmm4
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm3))
-; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm19[0,1,2,3]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 | (zmm3 & mem)
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (mem & (zmm20 ^ zmm4))
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm28, %zmm21, %zmm3
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm26 ^ (mem & (zmm3 ^ zmm26))
-; AVX512DQ-NEXT: vpbroadcastd (%rax), %ymm4
-; AVX512DQ-NEXT: vpbroadcastd 4(%rax), %ymm5
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm25))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm0))
; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm3))
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm31, %zmm30, %zmm3
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm5
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm23, %zmm0
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm15, %zmm24, %zmm3
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm0 ^ (zmm21 & (zmm3 ^ zmm0))
+; AVX512DQ-NEXT: vpbroadcastd 36(%rax), %ymm0
+; AVX512DQ-NEXT: vpbroadcastd 40(%rax), %ymm5
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0
+; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm5))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm3))
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm26, %zmm25, %zmm3
+; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm5 ^ (mem & (zmm3 ^ zmm5))
+; AVX512DQ-NEXT: vpbroadcastd (%rax), %ymm5
+; AVX512DQ-NEXT: vpbroadcastd 4(%rax), %ymm10
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm5, %zmm5
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm28))
; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm3))
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm3
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0]
-; AVX512DQ-NEXT: vpermd (%rax), %zmm6, %zmm6
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm3))
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm5))
-; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-NEXT: vmovdqa64 %zmm6, 128(%rax)
-; AVX512DQ-NEXT: vmovdqa64 %zmm4, (%rax)
-; AVX512DQ-NEXT: vmovdqa64 %zmm20, 320(%rax)
-; AVX512DQ-NEXT: vmovdqa64 %zmm2, 256(%rax)
-; AVX512DQ-NEXT: vmovdqa64 %zmm1, 192(%rax)
-; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rax)
-; AVX512DQ-NEXT: vmovdqa64 %zmm27, 384(%rax)
-; AVX512DQ-NEXT: addq $680, %rsp # imm = 0x2A8
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm29, %zmm27, %zmm3
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm31, %zmm30, %zmm10
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (mem & (zmm10 ^ zmm3))
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm9, %zmm3
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0]
+; AVX512DQ-NEXT: vpermd %zmm2, %zmm7, %zmm2
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm3))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm10))
+; AVX512DQ-NEXT: vmovdqa64 %zmm2, 128(%rcx)
+; AVX512DQ-NEXT: vmovdqa64 %zmm5, (%rcx)
+; AVX512DQ-NEXT: vmovdqa64 %zmm1, 320(%rcx)
+; AVX512DQ-NEXT: vmovdqa64 %zmm0, 256(%rcx)
+; AVX512DQ-NEXT: vmovdqa64 %zmm4, 192(%rcx)
+; AVX512DQ-NEXT: vmovdqa64 %zmm12, 64(%rcx)
+; AVX512DQ-NEXT: vmovdqa64 %zmm8, 384(%rcx)
+; AVX512DQ-NEXT: addq $632, %rsp # imm = 0x278
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: store_i16_stride7_vf32:
; AVX512DQ-FCP: # %bb.0:
-; AVX512DQ-FCP-NEXT: subq $264, %rsp # imm = 0x108
-; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm12
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm12, %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm4
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm3
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm16
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: subq $184, %rsp
+; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm8
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm11
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm11, %ymm2
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm12
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm12, %ymm3
; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm11
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm10
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm11, %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm11, (%rsp) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm15
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm15, %ymm5
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm5, %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm5
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm8
-; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm9
-; AVX512DQ-FCP-NEXT: vpor %ymm8, %ymm9, %ymm8
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %ymm14
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm14, %ymm2
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm6
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm6, %ymm3
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm23
+; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %ymm3
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm6
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm30
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm9
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm9, %ymm7
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm9, %ymm22
+; AVX512DQ-FCP-NEXT: vporq %ymm6, %ymm7, %ymm25
+; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm7
+; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm7, %ymm4
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm6
+; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm6, %ymm5
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm21
+; AVX512DQ-FCP-NEXT: vpor %ymm4, %ymm5, %ymm4
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm8, %ymm0
; AVX512DQ-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm10, %ymm3
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm9
-; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm4
-; AVX512DQ-FCP-NEXT: vpor %ymm3, %ymm4, %ymm3
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %ymm13
-; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm13, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm14
-; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm14, %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm4
+; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm24
; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %ymm1
-; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm19
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm3
-; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm3, %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm25
-; AVX512DQ-FCP-NEXT: vporq %ymm0, %ymm1, %ymm20
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm15
+; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm15, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm4
+; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm4, %ymm1
+; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm10, %ymm1
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[3,3,3,3,7,7,7,7]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5],ymm1[6],ymm3[7,8,9,10],ymm1[11],ymm3[12,13],ymm1[14],ymm3[15]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[2,2,2,2,6,6,6,6]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14,15]
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,2,2,3,10,0,11,0]
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm3, %zmm24
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm16[3,3,3,3,7,7,7,7]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5,6,7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13,14,15]
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %xmm4
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm3
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm23 = [0,2,2,3,8,0,9,0]
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm1, %zmm23
-; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm11, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm15[3,3,3,3,7,7,7,7]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm11, %ymm1
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm12[3,3,3,3,7,7,7,7]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm11[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm12[2,2,2,2,6,6,6,6]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15]
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm17 = [0,2,2,3,10,0,11,0]
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm17
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm7[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm6[3,3,3,3,7,7,7,7]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15]
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %xmm7
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm2
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm27
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,2,2,3,8,0,9,0]
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm18
+; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm8, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm24[3,3,3,3,7,7,7,7]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm1
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm8
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm21 = [2,1,3,3,8,8,9,9]
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm0, %zmm21
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm2[3,3,3,3,7,7,7,7]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7,8,9],ymm3[10],ymm0[11,12],ymm3[13],ymm0[14,15]
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [2,2,2,3,0,8,8,9]
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %xmm3
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %xmm7
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm15
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm15, %zmm11, %zmm22
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7]
-; AVX512DQ-FCP-NEXT: vprold $16, %xmm8, %xmm8
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm0
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm3
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [2,1,3,3,8,8,9,9]
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm19
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[3,3,3,3,7,7,7,7]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15]
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [2,2,2,3,0,8,8,9]
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %xmm2
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %xmm5
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm13
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm13, %zmm10, %zmm20
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX512DQ-FCP-NEXT: vprold $16, %xmm0, %xmm0
; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm8[2],xmm1[3,4],xmm8[5],xmm1[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm11, %xmm8
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm15, %xmm11
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm10, %xmm1
; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm26 = [0,0,0,1,8,8,9,0]
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm8, %zmm1, %zmm26
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm1
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,1,1,0,8,8,9,9]
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm28
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm0, %zmm26
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm1
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,1,1,0,8,8,9,9]
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm1, %zmm29
; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm1
; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm0
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[1,1,2,2]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3],xmm7[4],xmm8[5,6],xmm7[7]
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm8, %xmm6
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,2]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6],xmm2[7]
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm8
+; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm3
; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm27 = [0,0,1,0,8,8,9,9]
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm7, %zmm6, %zmm27
-; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm6
-; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm15
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm7, %xmm3
-; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm7, %xmm4
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm29 = [0,0,0,1,8,9,9,0]
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm4, %zmm29
-; AVX512DQ-FCP-NEXT: vprold $16, %ymm5, %ymm3
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm2[1,2,2,3,5,6,6,7]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15]
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm7 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm5, %ymm5
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5],ymm2[6],ymm5[7,8,9,10],ymm2[11],ymm5[12,13],ymm2[14],ymm5[15]
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm3, %zmm27
+; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm3
+; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm2
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm10, %xmm5
+; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm10, %xmm6
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm28 = [0,0,0,1,8,9,9,0]
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm6, %zmm28
+; AVX512DQ-FCP-NEXT: vprold $16, %ymm15, %ymm5
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm4[1,2,2,3,5,6,6,7]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7,8,9],ymm5[10],ymm6[11,12],ymm5[13],ymm6[14,15]
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm6 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm15, %ymm10
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm10[0,1,2],ymm4[3],ymm10[4,5],ymm4[6],ymm10[7,8,9,10],ymm4[11],ymm10[12,13],ymm4[14],ymm10[15]
; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm31 = [2,2,3,3,10,0,11,10]
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm2, %zmm31
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm18
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm5
-; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm3
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm11, %xmm0, %xmm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm17
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm31
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm12[1,1,1,1,5,5,5,5]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm16
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm14[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm23[0,1,1,3,4,5,5,7]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm12
+; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm12, %ymm4
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm22[0,0,2,1,4,4,6,5]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8,9,10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15]
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm30 = [0,1,0,3,10,10,11,11]
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm25, %zmm30
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm4
+; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm1
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm10
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT: vpbroadcastd 8(%rax), %ymm0
; AVX512DQ-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %ymm8
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm8, %ymm1
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm30
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm15[4],xmm6[4],xmm15[5],xmm6[5],xmm15[6],xmm6[6],xmm15[7],xmm6[7]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm9[1,1,1,1,5,5,5,5]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm16
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[0,1,1,3,4,5,5,7]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm12
-; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm12, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm25[0,0,2,1,4,4,6,5]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8,9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm9 = [0,1,0,3,10,10,11,11]
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm20, %zmm9
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rax), %ymm2
-; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm0
-; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [0,5,4,0,0,6,5,0,0,5,4,0,0,6,5,0]
-; AVX512DQ-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm19, %ymm4
-; AVX512DQ-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm13[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm14[3,3,3,3,7,7,7,7]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm0[1],ymm7[2,3],ymm0[4],ymm7[5,6,7,8],ymm0[9],ymm7[10,11],ymm0[12],ymm7[13,14,15]
-; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
-; AVX512DQ-FCP-NEXT: # ymm10 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm13, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm14[2,2,2,2,6,6,6,6]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4],ymm15[5],ymm0[6,7,8,9],ymm15[10],ymm0[11,12],ymm15[13],ymm0[14,15]
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm20 = [2,2,2,3,8,10,10,11]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm20, %zmm0
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm1
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm25[3,3,3,3,7,7,7,7]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm12[2],ymm7[3,4],ymm12[5],ymm7[6,7,8,9],ymm12[10],ymm7[11,12],ymm12[13],ymm7[14,15]
-; AVX512DQ-FCP-NEXT: vprold $16, %ymm1, %ymm12
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm25[1,2,2,3,5,6,6,7]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm15[0,1],ymm12[2],ymm15[3,4],ymm12[5],ymm15[6,7,8,9],ymm12[10],ymm15[11,12],ymm12[13],ymm15[14,15]
-; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm1[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
-; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[1,1,2,2]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0],xmm12[1],xmm14[2,3],xmm12[4],xmm14[5,6],xmm12[7]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [0,0,1,1,8,8,10,9]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm12
-; AVX512DQ-FCP-NEXT: vprold $16, %xmm3, %xmm1
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm5[1,1,2,3]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm13[0,1],xmm1[2],xmm13[3,4],xmm1[5],xmm13[6,7]
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm6, %xmm5
-; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm14, %zmm3
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [2,1,3,2,10,10,10,11]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm15
-; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm13, %ymm6
-; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm10[2,2,2,2,6,6,6,6]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7,8,9],ymm7[10],ymm6[11,12],ymm7[13],ymm6[14,15]
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,1,1,3,4,5,5,7]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm10[0,1],ymm7[2],ymm10[3,4],ymm7[5],ymm10[6,7,8,9],ymm7[10],ymm10[11,12],ymm7[13],ymm10[14,15]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm7
-; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm24 ^ (zmm1 & (zmm0 ^ zmm24))
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (mem & (zmm15 ^ zmm0))
-; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [6,5,0,0,7,6,0,7,6,5,0,0,7,6,0,7]
-; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm0, %zmm2
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm15))
-; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm26 ^ (zmm6 & (zmm12 ^ zmm26))
-; AVX512DQ-FCP-NEXT: vpbroadcastd 36(%rax), %ymm8
-; AVX512DQ-FCP-NEXT: vpbroadcastd 40(%rax), %ymm10
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm28))
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm12))
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm18, %xmm10
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,2,3,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1]
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm17[0,0,1,3]
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1]
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm16[2,2,2,3]
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm27 = zmm27 ^ (mem & (zmm27 ^ zmm3))
-; AVX512DQ-FCP-NEXT: vmovdqu (%rsp), %ymm15 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %ymm5
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm5[12,13,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm5[14,15,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm5[16,17,u,u]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm25
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm14[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm23[3,3,3,3,7,7,7,7]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6,7,8],ymm0[9],ymm3[10,11],ymm0[12],ymm3[13,14,15]
+; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
+; AVX512DQ-FCP-NEXT: # ymm9 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm14, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm23[2,2,2,2,6,6,6,6]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7,8,9],ymm3[10],ymm0[11,12],ymm3[13],ymm0[14,15]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
+; AVX512DQ-FCP-NEXT: vmovdqa %ymm12, %ymm14
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm22[3,3,3,3,7,7,7,7]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm12[2],ymm3[3,4],ymm12[5],ymm3[6,7,8,9],ymm12[10],ymm3[11,12],ymm12[13],ymm3[14,15]
+; AVX512DQ-FCP-NEXT: vprold $16, %ymm14, %ymm3
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm22[1,2,2,3,5,6,6,7]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm3[2],ymm12[3,4],ymm3[5],ymm12[6,7,8,9],ymm3[10],ymm12[11,12],ymm3[13],ymm12[14,15]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
+; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm14[4],xmm7[4],xmm14[5],xmm7[5],xmm14[6],xmm7[6],xmm14[7],xmm7[7]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[1,1,2,2]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0],xmm3[1],xmm15[2,3],xmm3[4],xmm15[5,6],xmm3[7]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,2,3,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,0,1,1,8,8,10,9]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm15, %zmm3
+; AVX512DQ-FCP-NEXT: vprold $16, %xmm1, %xmm7
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm14 = xmm4[1,1,2,3]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1],xmm7[2],xmm14[3,4],xmm7[5],xmm14[6,7]
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm1
+; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm4, %xmm7
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm15, %zmm7
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [2,1,3,2,10,10,10,11]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm12
+; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm14, %ymm2
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm21[2,2,2,2,6,6,6,6]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm6[2],ymm2[3,4],ymm6[5],ymm2[6,7,8,9],ymm6[10],ymm2[11,12],ymm6[13],ymm2[14,15]
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm14[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm21[0,1,1,3,4,5,5,7]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm6[2],ymm9[3,4],ymm6[5],ymm9[6,7,8,9],ymm6[10],ymm9[11,12],ymm6[13],ymm9[14,15]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm9
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [2,2,2,3,8,10,10,11]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm2, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm17 ^ (zmm4 & (zmm0 ^ zmm17))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm0))
+; AVX512DQ-FCP-NEXT: vmovdqa64 (%rax), %zmm0
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [12,13,10,10,14,14,14,14]
+; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm6, %zmm6
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [14,15,11,11,15,15,14,15]
+; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm8, %zmm8
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm6
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm12))
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [6,0,0,0,7,0,0,7]
+; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm8, %ymm5
+; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm26 ^ (zmm8 & (zmm3 ^ zmm26))
+; AVX512DQ-FCP-NEXT: vpbroadcastd 36(%rax), %ymm12
+; AVX512DQ-FCP-NEXT: vpbroadcastd 40(%rax), %ymm14
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm12, %zmm12
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm29))
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm16[2,2,2,3]
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,3,2]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,2,3,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1]
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,3]
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm3))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm27 = zmm27 ^ (mem & (zmm27 ^ zmm7))
+; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm15[10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm16[2,2,2,2,6,6,6,6]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm3[1],ymm14[2,3],ymm3[4],ymm14[5,6,7,8],ymm3[9],ymm14[10,11],ymm3[12],ymm14[13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm16[1,1,1,1,5,5,5,5]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7,8,9],ymm15[10],ymm14[11,12],ymm15[13],ymm14[14,15]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm20, %zmm14
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (mem & (zmm14 ^ zmm7))
-; AVX512DQ-FCP-NEXT: vpermd (%rax), %zmm19, %zmm3
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm31))
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm14))
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm24[2,2,2,2,6,6,6,6]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm3[1],ymm7[2,3],ymm3[4],ymm7[5,6,7,8],ymm3[9],ymm7[10,11],ymm3[12],ymm7[13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm24[1,1,1,1,5,5,5,5]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm15[2],ymm7[3,4],ymm15[5],ymm7[6,7,8,9],ymm15[10],ymm7[11,12],ymm15[13],ymm7[14,15]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm7
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm9))
+; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,4,0,0,6,5,0,0,5,4,0,0,6,5,0]
+; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm2, %zmm2
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm31))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm7))
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm3 # 32-byte Folded Reload
; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm7 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm10 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm7 ^ (zmm6 & (zmm10 ^ zmm7))
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = zmm23 ^ (zmm1 & (zmm21 ^ zmm23))
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm1 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = (zmm1 & mem) | zmm30
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm10))
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm5, %zmm5
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm6, %zmm6
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm5))
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm9 & mem)
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm6))
-; AVX512DQ-FCP-NEXT: vpbroadcastd 32(%rax), %ymm5
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm22))
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm21))
-; AVX512DQ-FCP-NEXT: vpbroadcastd (%rax), %ymm5
-; AVX512DQ-FCP-NEXT: vpbroadcastd 4(%rax), %ymm6
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm29))
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm27))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm3 ^ (zmm8 & (zmm7 ^ zmm3))
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm3, %zmm3
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm8, %zmm8
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm3))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm19 = zmm18 ^ (zmm4 & (zmm19 ^ zmm18))
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [11,0,0,11,0,0,0,12]
+; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm3, %zmm3
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [8,9,12,13,12,13,13,15]
+; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm4, %zmm0
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm30))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm8))
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm25 = zmm25 | (zmm1 & mem)
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm25 = zmm25 ^ (mem & (zmm25 ^ zmm7))
+; AVX512DQ-FCP-NEXT: vpbroadcastd 32(%rax), %ymm1
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm20))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm19))
+; AVX512DQ-FCP-NEXT: vpbroadcastd (%rax), %ymm3
+; AVX512DQ-FCP-NEXT: vpbroadcastd 4(%rax), %ymm4
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm28))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm27))
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 128(%rax)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%rax)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 320(%rax)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 256(%rax)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 192(%rax)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 384(%rax)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax)
-; AVX512DQ-FCP-NEXT: addq $264, %rsp # imm = 0x108
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 128(%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 320(%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 256(%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 192(%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 384(%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, 64(%rax)
+; AVX512DQ-FCP-NEXT: addq $184, %rsp
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
@@ -9214,7 +9167,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
;
; AVX-LABEL: store_i16_stride7_vf64:
; AVX: # %bb.0:
-; AVX-NEXT: subq $1496, %rsp # imm = 0x5D8
+; AVX-NEXT: subq $1448, %rsp # imm = 0x5A8
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT: vmovdqa 112(%rsi), %xmm1
; AVX-NEXT: vmovdqa 112(%rdi), %xmm4
@@ -9260,9 +9213,9 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[2,2,2,2,4,5,6,7]
; AVX-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,5,4]
; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm12
-; AVX-NEXT: vmovaps {{.*#+}} ymm15 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0]
-; AVX-NEXT: vandnps %ymm11, %ymm15, %ymm11
-; AVX-NEXT: vandps %ymm15, %ymm12, %ymm12
+; AVX-NEXT: vmovaps {{.*#+}} ymm13 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0]
+; AVX-NEXT: vandnps %ymm11, %ymm13, %ymm11
+; AVX-NEXT: vandps %ymm13, %ymm12, %ymm12
; AVX-NEXT: vorps %ymm11, %ymm12, %ymm11
; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[0,0,1,1]
; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm11[0,1,2],xmm12[3],xmm11[4,5,6,7]
@@ -9349,13 +9302,13 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[0,0,0,0]
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6],xmm0[7]
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovdqa (%rdx), %xmm10
-; AVX-NEXT: vmovdqa (%rcx), %xmm9
-; AVX-NEXT: vpsrld $16, %xmm9, %xmm0
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm10[0],xmm0[0],xmm10[1],xmm0[1]
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
+; AVX-NEXT: vmovdqa (%rdx), %xmm9
+; AVX-NEXT: vmovdqa (%rcx), %xmm8
+; AVX-NEXT: vpsrld $16, %xmm8, %xmm0
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm0[0],xmm9[1],xmm0[1]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
+; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm3
; AVX-NEXT: vmovdqa (%rsi), %xmm1
@@ -9381,8 +9334,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,1,0,1]
; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4],xmm7[5],xmm6[6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[0,1,0,1]
-; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm4[0,0,0,0]
-; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5],xmm8[6,7]
+; AVX-NEXT: vinsertps {{.*#+}} xmm7 = xmm7[0,1,2],xmm4[0]
; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6
; AVX-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
; AVX-NEXT: vandps %ymm7, %ymm5, %ymm5
@@ -9395,106 +9347,105 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1
-; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[3,3,3,3,4,5,6,7]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm8[3,3,3,3,4,5,6,7]
; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4]
-; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm10[2,2,2,2]
+; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[2,2,2,2]
; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm5[6],xmm2[7]
; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,6]
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT: vmovaps {{.*#+}} ymm12 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535]
-; AVX-NEXT: vandnps %ymm1, %ymm12, %ymm1
-; AVX-NEXT: vandps %ymm0, %ymm12, %ymm0
+; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535]
+; AVX-NEXT: vandnps %ymm1, %ymm2, %ymm1
+; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vpsrldq {{.*#+}} xmm1 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3],xmm1[4,5,6,7]
; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm3[0,2],xmm4[1,3]
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
-; AVX-NEXT: vandps %ymm4, %ymm0, %ymm0
-; AVX-NEXT: vandnps %ymm1, %ymm4, %ymm1
+; AVX-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
+; AVX-NEXT: vandps %ymm0, %ymm15, %ymm0
+; AVX-NEXT: vandnps %ymm1, %ymm15, %ymm1
; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovdqa 16(%rdx), %xmm5
-; AVX-NEXT: vmovdqa 16(%rcx), %xmm6
-; AVX-NEXT: vpsrld $16, %xmm6, %xmm0
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
-; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,6,6]
+; AVX-NEXT: vmovdqa 16(%rdx), %xmm3
+; AVX-NEXT: vmovdqa 16(%rcx), %xmm5
+; AVX-NEXT: vpsrld $16, %xmm5, %xmm0
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
+; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,4,5,6,6]
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX-NEXT: vmovdqa 16(%rsi), %xmm10
-; AVX-NEXT: vmovdqa 16(%rdi), %xmm9
-; AVX-NEXT: vpsrld $16, %xmm10, %xmm1
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm1[0],xmm9[1],xmm1[1]
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3]
+; AVX-NEXT: vmovdqa 16(%rsi), %xmm9
+; AVX-NEXT: vmovdqa 16(%rdi), %xmm8
+; AVX-NEXT: vpsrld $16, %xmm9, %xmm1
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
+; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7]
; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4]
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX-NEXT: vandnps %ymm0, %ymm15, %ymm0
-; AVX-NEXT: vandps %ymm1, %ymm15, %ymm1
+; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0]
+; AVX-NEXT: vandnps %ymm0, %ymm2, %ymm0
+; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0
-; AVX-NEXT: vmovdqa 16(%r9), %xmm8
-; AVX-NEXT: vmovdqa 16(%r8), %xmm7
-; AVX-NEXT: vmovdqa 16(%rax), %xmm3
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
+; AVX-NEXT: vmovdqa 16(%r9), %xmm7
+; AVX-NEXT: vmovdqa 16(%r8), %xmm6
+; AVX-NEXT: vmovdqa 16(%rax), %xmm12
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
+; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm13[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,1,0,1]
+; AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm10[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[0,1,0,1]
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5],xmm1[6,7]
-; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm13[0,2],xmm3[1,3]
-; AVX-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm10[0,2],xmm12[1,3]
+; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX-NEXT: vmovaps {{.*#+}} ymm14 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535]
-; AVX-NEXT: vandps %ymm0, %ymm14, %ymm0
-; AVX-NEXT: vandnps %ymm1, %ymm14, %ymm1
+; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535]
+; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-NEXT: vandnps %ymm1, %ymm2, %ymm1
; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7]
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7]
; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
; AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[3,3,3,3,4,5,6,7]
-; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[2,2,2,2]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[3,3,3,3,4,5,6,7]
+; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
+; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,2,2,2]
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6],xmm1[7]
-; AVX-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
; AVX-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9]
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535]
-; AVX-NEXT: vandnps %ymm0, %ymm2, %ymm0
-; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX-NEXT: vandnps %ymm0, %ymm14, %ymm0
+; AVX-NEXT: vandps %ymm1, %ymm14, %ymm1
; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0
-; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
-; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,2,3,3]
+; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[2,2,3,3]
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7]
-; AVX-NEXT: vpsrldq {{.*#+}} xmm2 = xmm13[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5,6,7]
+; AVX-NEXT: vpsrldq {{.*#+}} xmm2 = xmm10[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm12[3],xmm2[4,5,6,7]
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0]
; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX-NEXT: vandnps %ymm1, %ymm2, %ymm1
; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovdqa 32(%rdx), %xmm15
-; AVX-NEXT: vmovdqa 32(%rcx), %xmm13
-; AVX-NEXT: vpsrld $16, %xmm13, %xmm0
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm15[0],xmm0[0],xmm15[1],xmm0[1]
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3]
+; AVX-NEXT: vmovdqa 32(%rdx), %xmm13
+; AVX-NEXT: vmovdqa 32(%rcx), %xmm3
+; AVX-NEXT: vpsrld $16, %xmm3, %xmm0
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm13[0],xmm0[0],xmm13[1],xmm0[1]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3]
+; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX-NEXT: vmovdqa 32(%rsi), %xmm2
@@ -9513,18 +9464,17 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovdqa 32(%r8), %xmm1
; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovdqa 32(%rax), %xmm5
+; AVX-NEXT: vmovdqa 32(%rax), %xmm4
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3]
; AVX-NEXT: vpslldq {{.*#+}} xmm9 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
-; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm5[0,1,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[0,1,0,1]
; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm10[5],xmm9[6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[0,1,0,1]
-; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm5[0,0,0,0]
-; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5],xmm11[6,7]
+; AVX-NEXT: vinsertps {{.*#+}} xmm10 = xmm10[0,1,2],xmm4[0]
; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9
-; AVX-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
-; AVX-NEXT: vandps %ymm3, %ymm8, %ymm8
-; AVX-NEXT: vandnps %ymm9, %ymm3, %ymm9
+; AVX-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
+; AVX-NEXT: vandps %ymm5, %ymm8, %ymm8
+; AVX-NEXT: vandnps %ymm9, %ymm5, %ymm9
; AVX-NEXT: vorps %ymm9, %ymm8, %ymm6
; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vpsrld $16, %xmm2, %xmm8
@@ -9533,67 +9483,67 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm8, %ymm2
-; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm13[3,3,3,3,4,5,6,7]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[3,3,3,3,4,5,6,7]
; AVX-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4]
-; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm15[2,2,2,2]
+; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm13[2,2,2,2]
; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5],xmm8[6],xmm7[7]
; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,6]
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0
-; AVX-NEXT: vandnps %ymm2, %ymm12, %ymm2
-; AVX-NEXT: vandps %ymm0, %ymm12, %ymm0
+; AVX-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535]
+; AVX-NEXT: vandnps %ymm2, %ymm3, %ymm2
+; AVX-NEXT: vandps %ymm3, %ymm0, %ymm0
; AVX-NEXT: vorps %ymm2, %ymm0, %ymm0
; AVX-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm5[3],xmm2[4,5,6,7]
-; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[1,3]
+; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3],xmm2[4,5,6,7]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[1,3]
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX-NEXT: vandps %ymm4, %ymm0, %ymm0
-; AVX-NEXT: vandnps %ymm1, %ymm4, %ymm1
+; AVX-NEXT: vandps %ymm0, %ymm15, %ymm0
+; AVX-NEXT: vandnps %ymm1, %ymm15, %ymm1
; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovdqa 48(%rdx), %xmm6
; AVX-NEXT: vmovdqa 48(%rcx), %xmm8
; AVX-NEXT: vpsrld $16, %xmm8, %xmm0
; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3]
-; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,6,6]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3]
+; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,6,6]
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3]
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX-NEXT: vmovdqa 48(%rsi), %xmm3
-; AVX-NEXT: vmovdqa 48(%rdi), %xmm13
-; AVX-NEXT: vpsrld $16, %xmm3, %xmm1
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1]
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3]
-; AVX-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovdqa 48(%rsi), %xmm4
+; AVX-NEXT: vmovdqa 48(%rdi), %xmm9
+; AVX-NEXT: vpsrld $16, %xmm4, %xmm1
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm1[0],xmm9[1],xmm1[1]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3]
+; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7]
; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,4]
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX-NEXT: vmovaps {{.*#+}} ymm4 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0]
-; AVX-NEXT: vandnps %ymm0, %ymm4, %ymm0
-; AVX-NEXT: vandps %ymm4, %ymm1, %ymm1
+; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0]
+; AVX-NEXT: vandnps %ymm0, %ymm2, %ymm0
+; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0
-; AVX-NEXT: vmovdqa 48(%r9), %xmm10
-; AVX-NEXT: vmovdqa 48(%r8), %xmm9
+; AVX-NEXT: vmovdqa 48(%r9), %xmm2
+; AVX-NEXT: vmovdqa 48(%r8), %xmm3
; AVX-NEXT: vmovdqa 48(%rax), %xmm11
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovdqa %xmm7, %xmm2
-; AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm14[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[0,1,0,1]
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm7[5],xmm1[6,7]
-; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm2[0,2],xmm11[1,3]
-; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm14[0,2],xmm11[1,3]
+; AVX-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1
-; AVX-NEXT: vandps %ymm0, %ymm14, %ymm0
-; AVX-NEXT: vandnps %ymm1, %ymm14, %ymm1
-; AVX-NEXT: vmovaps %ymm14, %ymm5
+; AVX-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535]
+; AVX-NEXT: vandps %ymm7, %ymm0, %ymm0
+; AVX-NEXT: vandnps %ymm1, %ymm7, %ymm1
; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm13[4],xmm3[4],xmm13[5],xmm3[5],xmm13[6],xmm3[6],xmm13[7],xmm3[7]
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7]
; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
; AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
@@ -9604,20 +9554,20 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[2,2,2,2]
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm7[6],xmm1[7]
-; AVX-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7]
-; AVX-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vpslldq {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9]
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7]
+; AVX-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vpslldq {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,xmm4[0,1,2,3,4,5,6,7,8,9]
; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1
-; AVX-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535]
-; AVX-NEXT: vandnps %ymm0, %ymm3, %ymm0
-; AVX-NEXT: vandps %ymm3, %ymm1, %ymm1
+; AVX-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535]
+; AVX-NEXT: vandnps %ymm0, %ymm4, %ymm0
+; AVX-NEXT: vandps %ymm4, %ymm1, %ymm1
; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0
-; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7]
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,2,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,1]
; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[2,2,3,3]
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3],xmm1[4,5,6,7]
-; AVX-NEXT: vpsrldq {{.*#+}} xmm7 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT: vpsrldq {{.*#+}} xmm7 = xmm14[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm11[3],xmm7[4,5,6,7]
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm1
; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0]
@@ -9626,14 +9576,14 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovdqa 64(%rdx), %xmm2
-; AVX-NEXT: vmovdqa 64(%rcx), %xmm7
-; AVX-NEXT: vpsrld $16, %xmm7, %xmm0
+; AVX-NEXT: vmovdqa 64(%rcx), %xmm6
+; AVX-NEXT: vpsrld $16, %xmm6, %xmm0
; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3]
-; AVX-NEXT: vmovdqa %xmm7, %xmm0
-; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
+; AVX-NEXT: vmovdqa %xmm6, %xmm0
+; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[0,0,1,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,0,1,1]
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm7, %ymm1
; AVX-NEXT: vmovdqa 64(%rsi), %xmm8
; AVX-NEXT: vmovdqa 64(%rdi), %xmm9
@@ -9644,26 +9594,24 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[2,2,2,2,4,5,6,7]
; AVX-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,5,4]
; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7
-; AVX-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
-; AVX-NEXT: vandnps %ymm1, %ymm6, %ymm1
-; AVX-NEXT: vandps %ymm6, %ymm7, %ymm7
+; AVX-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
+; AVX-NEXT: vandnps %ymm1, %ymm3, %ymm1
+; AVX-NEXT: vandps %ymm3, %ymm7, %ymm7
; AVX-NEXT: vorps %ymm1, %ymm7, %ymm10
; AVX-NEXT: vmovdqa 64(%r9), %xmm1
; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovdqa 64(%r8), %xmm6
-; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3]
+; AVX-NEXT: vmovdqa 64(%r8), %xmm3
+; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; AVX-NEXT: vpslldq {{.*#+}} xmm13 = zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
; AVX-NEXT: vmovdqa 64(%rax), %xmm6
; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm6[0,1,0,1]
; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4],xmm14[5],xmm13[6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm7[0,1,0,1]
-; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[0,0,0,0]
-; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5],xmm15[6,7]
+; AVX-NEXT: vinsertps {{.*#+}} xmm14 = xmm14[0,1,2],xmm6[0]
; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13
-; AVX-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
-; AVX-NEXT: vandps %ymm1, %ymm10, %ymm10
-; AVX-NEXT: vandnps %ymm13, %ymm1, %ymm13
+; AVX-NEXT: vandps %ymm5, %ymm10, %ymm10
+; AVX-NEXT: vandnps %ymm13, %ymm5, %ymm13
; AVX-NEXT: vorps %ymm13, %ymm10, %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vpsrld $16, %xmm8, %xmm10
@@ -9676,7 +9624,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,4,4]
; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm2[2,2,2,2]
; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5],xmm10[6],xmm9[7]
-; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,5,6,6]
+; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5,6,6]
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,3]
; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm0
; AVX-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535]
@@ -9684,7 +9632,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vorps %ymm0, %ymm8, %ymm0
; AVX-NEXT: vpsrldq {{.*#+}} xmm8 = xmm7[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm6[3],xmm8[4,5,6,7]
; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,2],xmm6[1,3]
; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7
@@ -9693,57 +9641,55 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vandnps %ymm7, %ymm1, %ymm7
; AVX-NEXT: vorps %ymm7, %ymm0, %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovdqa 80(%rdx), %xmm1
-; AVX-NEXT: vmovdqa 80(%rcx), %xmm9
-; AVX-NEXT: vpsrld $16, %xmm9, %xmm0
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3]
-; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpshufhw {{.*#+}} xmm7 = xmm2[0,1,2,3,4,5,6,6]
+; AVX-NEXT: vmovdqa 80(%rdx), %xmm9
+; AVX-NEXT: vmovdqa 80(%rcx), %xmm8
+; AVX-NEXT: vpsrld $16, %xmm8, %xmm0
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3]
+; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpshufhw {{.*#+}} xmm7 = xmm1[0,1,2,3,4,5,6,6]
; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3]
; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0
-; AVX-NEXT: vmovdqa 80(%rdi), %xmm8
-; AVX-NEXT: vmovdqa 80(%rsi), %xmm12
-; AVX-NEXT: vpsrld $16, %xmm12, %xmm7
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3]
-; AVX-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovdqa 80(%rdi), %xmm4
+; AVX-NEXT: vmovdqa 80(%rsi), %xmm5
+; AVX-NEXT: vpsrld $16, %xmm5, %xmm7
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm4[0],xmm7[0],xmm4[1],xmm7[1]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill
; AVX-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[2,2,2,2,4,5,6,7]
; AVX-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,5,4]
; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm10, %ymm7
-; AVX-NEXT: vandnps %ymm0, %ymm4, %ymm0
-; AVX-NEXT: vandps %ymm4, %ymm7, %ymm7
+; AVX-NEXT: vmovaps {{.*#+}} ymm1 = [65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0]
+; AVX-NEXT: vandnps %ymm0, %ymm1, %ymm0
+; AVX-NEXT: vandps %ymm1, %ymm7, %ymm7
; AVX-NEXT: vorps %ymm0, %ymm7, %ymm7
-; AVX-NEXT: vmovdqa 80(%r9), %xmm4
-; AVX-NEXT: vmovdqa 80(%r8), %xmm3
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; AVX-NEXT: vmovdqa 80(%r9), %xmm3
+; AVX-NEXT: vmovdqa 80(%r8), %xmm2
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovdqa 80(%rax), %xmm2
-; AVX-NEXT: vmovdqa %xmm6, %xmm0
-; AVX-NEXT: vpslldq {{.*#+}} xmm15 = zero,zero,xmm6[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
-; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,1,0,1]
+; AVX-NEXT: vmovdqa 80(%rax), %xmm1
+; AVX-NEXT: vpslldq {{.*#+}} xmm15 = zero,zero,xmm10[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[0,1,0,1]
; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm15[0,1,2,3,4],xmm6[5],xmm15[6,7]
-; AVX-NEXT: vshufps {{.*#+}} xmm15 = xmm0[0,2],xmm2[1,3]
-; AVX-NEXT: vmovdqa %xmm0, %xmm10
-; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} xmm15 = xmm10[0,2],xmm1[1,3]
; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm6, %ymm6
-; AVX-NEXT: vandps %ymm5, %ymm7, %ymm7
-; AVX-NEXT: vandnps %ymm6, %ymm5, %ymm6
+; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535]
+; AVX-NEXT: vandps %ymm0, %ymm7, %ymm7
+; AVX-NEXT: vandnps %ymm6, %ymm0, %ymm6
; AVX-NEXT: vorps %ymm6, %ymm7, %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7]
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,1,0,1]
; AVX-NEXT: vpslldq {{.*#+}} xmm7 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6
-; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm9[3,3,3,3,4,5,6,7]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm8[3,3,3,3,4,5,6,7]
; AVX-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4]
-; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm1[2,2,2,2]
+; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm9[2,2,2,2]
; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5],xmm15[6],xmm7[7]
-; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
; AVX-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vpslldq {{.*#+}} xmm15 = zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9]
; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm7, %ymm7
@@ -9751,76 +9697,75 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vandnps %ymm6, %ymm0, %ymm6
; AVX-NEXT: vandps %ymm0, %ymm7, %ymm7
; AVX-NEXT: vorps %ymm6, %ymm7, %ymm6
-; AVX-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,2,2,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,1]
-; AVX-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill
-; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm2[2,2,3,3]
+; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm1[2,2,3,3]
; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm15[2,3],xmm7[4,5,6,7]
; AVX-NEXT: vpsrldq {{.*#+}} xmm15 = xmm10[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1,2],xmm2[3],xmm15[4,5,6,7]
+; AVX-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1,2],xmm1[3],xmm15[4,5,6,7]
; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm15, %ymm7
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0]
; AVX-NEXT: vandps %ymm0, %ymm6, %ymm6
; AVX-NEXT: vandnps %ymm7, %ymm0, %ymm7
; AVX-NEXT: vorps %ymm7, %ymm6, %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX-NEXT: vpsrld $16, %xmm8, %xmm6
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm2[0],xmm6[0],xmm2[1],xmm6[1]
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3]
+; AVX-NEXT: vpsrld $16, %xmm2, %xmm6
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm3[0],xmm6[0],xmm3[1],xmm6[1]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[0,0,1,1]
; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm15, %ymm7
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,1,2,2,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,2,1]
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[2,2,2,2,4,5,6,7]
; AVX-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,5,4]
; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm15, %ymm14
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
; AVX-NEXT: vandnps %ymm7, %ymm0, %ymm7
; AVX-NEXT: vandps %ymm0, %ymm14, %ymm14
-; AVX-NEXT: vorps %ymm7, %ymm14, %ymm14
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload
-; AVX-NEXT: # xmm7 = xmm7[0],mem[0],xmm7[1],mem[1],xmm7[2],mem[2],xmm7[3],mem[3]
-; AVX-NEXT: vpslldq {{.*#+}} xmm15 = zero,zero,xmm7[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; AVX-NEXT: vorps %ymm7, %ymm14, %ymm7
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm14 # 16-byte Folded Reload
+; AVX-NEXT: # xmm14 = xmm13[0],mem[0],xmm13[1],mem[1],xmm13[2],mem[2],xmm13[3],mem[3]
+; AVX-NEXT: vpslldq {{.*#+}} xmm15 = zero,zero,xmm14[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[0,1,0,1]
; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1,2,3,4],xmm13[5],xmm15[6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm7[0,1,0,1]
-; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[0,0,0,0]
-; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0,1,2,3,4,5],xmm12[6,7]
-; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm12, %ymm12
-; AVX-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
-; AVX-NEXT: vandps %ymm3, %ymm14, %ymm13
-; AVX-NEXT: vandnps %ymm12, %ymm3, %ymm12
-; AVX-NEXT: vorps %ymm12, %ymm13, %ymm3
-; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vpsrld $16, %xmm1, %xmm12
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm4[0],xmm12[0],xmm4[1],xmm12[1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[0,1,0,1]
+; AVX-NEXT: vinsertps {{.*#+}} xmm15 = xmm15[0,1,2],xmm0[0]
+; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm15, %ymm13
+; AVX-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
+; AVX-NEXT: vandps %ymm4, %ymm7, %ymm7
+; AVX-NEXT: vandnps %ymm13, %ymm4, %ymm13
+; AVX-NEXT: vorps %ymm7, %ymm13, %ymm4
+; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vpsrld $16, %xmm1, %xmm7
+; AVX-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm5[0],xmm7[0],xmm5[1],xmm7[1]
; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
; AVX-NEXT: # xmm13 = mem[0,1,0,1]
-; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm12, %ymm12
-; AVX-NEXT: vpshuflw {{.*#+}} xmm13 = xmm8[3,3,3,3,4,5,6,7]
+; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm7, %ymm7
+; AVX-NEXT: vpshuflw {{.*#+}} xmm13 = xmm2[3,3,3,3,4,5,6,7]
; AVX-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,4,4]
-; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm2[2,2,2,2]
-; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5],xmm14[6],xmm13[7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm3[2,2,2,2]
+; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5],xmm15[6],xmm13[7]
; AVX-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,6,6]
; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3]
; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm6, %ymm6
-; AVX-NEXT: vmovaps {{.*#+}} ymm10 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535]
-; AVX-NEXT: vandnps %ymm12, %ymm10, %ymm12
-; AVX-NEXT: vandps %ymm6, %ymm10, %ymm6
-; AVX-NEXT: vorps %ymm6, %ymm12, %ymm6
-; AVX-NEXT: vpsrldq {{.*#+}} xmm12 = xmm7[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm0[3],xmm12[4,5,6,7]
-; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,2],xmm0[1,3]
-; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm7, %ymm7
+; AVX-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535]
+; AVX-NEXT: vandnps %ymm7, %ymm1, %ymm7
+; AVX-NEXT: vandps %ymm1, %ymm6, %ymm6
+; AVX-NEXT: vorps %ymm7, %ymm6, %ymm6
+; AVX-NEXT: vpsrldq {{.*#+}} xmm7 = xmm14[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm0[3],xmm7[4,5,6,7]
+; AVX-NEXT: vshufps {{.*#+}} xmm13 = xmm14[0,2],xmm0[1,3]
+; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm13, %ymm7
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
; AVX-NEXT: vandps %ymm0, %ymm6, %ymm6
; AVX-NEXT: vandnps %ymm7, %ymm0, %ymm7
@@ -9833,152 +9778,150 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX-NEXT: vpslldq {{.*#+}} xmm7 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
-; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm2[2,2,3,3]
-; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm7, %ymm7
-; AVX-NEXT: vandnps %ymm6, %ymm10, %ymm6
-; AVX-NEXT: vandps %ymm7, %ymm10, %ymm7
+; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm2[2,2,3,3]
+; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm7, %ymm7
+; AVX-NEXT: vandnps %ymm6, %ymm1, %ymm6
+; AVX-NEXT: vandps %ymm1, %ymm7, %ymm7
; AVX-NEXT: vorps %ymm6, %ymm7, %ymm6
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[2,2,2,2,4,5,6,7]
; AVX-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,5,4]
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm3[2,3,2,3]
-; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm12[1],xmm7[2,3,4,5,6,7]
-; AVX-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,2,2,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,1]
-; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm3[2,2,3,3]
-; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3],xmm12[4,5,6,7]
-; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm12, %ymm12
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm4[2,3,2,3]
+; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm13[1],xmm7[2,3,4,5,6,7]
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,2,2,4,5,6,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,1]
+; AVX-NEXT: vinsertps {{.*#+}} xmm13 = xmm13[0],xmm4[2],xmm13[2,3]
+; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm13, %ymm13
; AVX-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535]
; AVX-NEXT: vandnps %ymm6, %ymm5, %ymm6
-; AVX-NEXT: vandps %ymm5, %ymm12, %ymm12
-; AVX-NEXT: vorps %ymm6, %ymm12, %ymm4
-; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vandps %ymm5, %ymm13, %ymm13
+; AVX-NEXT: vorps %ymm6, %ymm13, %ymm1
+; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vpsrlq $48, %xmm15, %xmm6
; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm14[1],xmm6[1]
-; AVX-NEXT: vpermilps $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
-; AVX-NEXT: # xmm12 = mem[0,0,1,1]
-; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm6, %ymm6
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm12 # 16-byte Folded Reload
-; AVX-NEXT: # xmm12 = xmm7[0],mem[0],xmm7[1],mem[1],xmm7[2],mem[2],xmm7[3],mem[3]
-; AVX-NEXT: vpsrldq {{.*#+}} xmm13 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,2,2,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,1]
-; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm12
-; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
-; AVX-NEXT: vandnps %ymm6, %ymm2, %ymm6
-; AVX-NEXT: vandps %ymm2, %ymm12, %ymm12
-; AVX-NEXT: vorps %ymm6, %ymm12, %ymm12
-; AVX-NEXT: vpshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
+; AVX-NEXT: vpermilps $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
+; AVX-NEXT: # xmm13 = mem[0,0,1,1]
+; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm6, %ymm6
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm13 # 16-byte Folded Reload
+; AVX-NEXT: # xmm13 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
+; AVX-NEXT: vpsrldq {{.*#+}} xmm14 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,2,2,4,5,6,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,1]
+; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13
+; AVX-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
+; AVX-NEXT: vandnps %ymm6, %ymm1, %ymm6
+; AVX-NEXT: vandps %ymm1, %ymm13, %ymm13
+; AVX-NEXT: vorps %ymm6, %ymm13, %ymm13
+; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
; AVX-NEXT: # xmm6 = mem[0,1,0,1]
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm2[0,0,0,0]
-; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm6[0,1,2,3,4,5],xmm13[6,7]
+; AVX-NEXT: vshufps {{.*#+}} xmm14 = xmm12[0,0,0,0]
+; AVX-NEXT: vblendps {{.*#+}} xmm14 = xmm6[0,1,2],xmm14[3]
; AVX-NEXT: vpsrld $16, %xmm0, %xmm6
-; AVX-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm1[2],xmm6[2],xmm1[3],xmm6[3]
-; AVX-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,8,9,8,9,8,9,12,13,6,7,10,11,12,13]
-; AVX-NEXT: vpshufb %xmm6, %xmm4, %xmm4
-; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm4, %ymm13
-; AVX-NEXT: vmovaps {{.*#+}} ymm8 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
-; AVX-NEXT: vandps %ymm8, %ymm12, %ymm12
-; AVX-NEXT: vandnps %ymm13, %ymm8, %ymm13
-; AVX-NEXT: vorps %ymm13, %ymm12, %ymm0
+; AVX-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm3[2],xmm6[2],xmm3[3],xmm6[3]
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
+; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,8,9,8,9,8,9,12,13,6,7,10,11,12,13]
+; AVX-NEXT: vpshufb %xmm2, %xmm4, %xmm4
+; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm4, %ymm14
+; AVX-NEXT: vmovaps {{.*#+}} ymm6 = [0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
+; AVX-NEXT: vandps %ymm6, %ymm13, %ymm13
+; AVX-NEXT: vandnps %ymm14, %ymm6, %ymm14
+; AVX-NEXT: vorps %ymm14, %ymm13, %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT: vpsrlq $48, %xmm0, %xmm12
+; AVX-NEXT: vpsrlq $48, %xmm0, %xmm13
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm0[1],xmm12[1]
+; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm0[1],xmm13[1]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12
+; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[2,2,3,3]
-; AVX-NEXT: vpsrldq {{.*#+}} xmm14 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm13, %ymm13
+; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[2,2,3,3]
+; AVX-NEXT: vpsrldq {{.*#+}} xmm15 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535]
-; AVX-NEXT: vandnps %ymm12, %ymm0, %ymm12
-; AVX-NEXT: vandps %ymm0, %ymm13, %ymm13
-; AVX-NEXT: vorps %ymm12, %ymm13, %ymm12
+; AVX-NEXT: vandnps %ymm13, %ymm0, %ymm13
+; AVX-NEXT: vandps %ymm0, %ymm14, %ymm14
+; AVX-NEXT: vorps %ymm13, %ymm14, %ymm13
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm13
+; AVX-NEXT: vpsrld $16, %xmm0, %xmm14
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT: vpunpckhdq {{.*#+}} xmm13 = xmm1[2],xmm13[2],xmm1[3],xmm13[3]
-; AVX-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[2,2,2,2,4,5,6,7]
-; AVX-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,5,4]
-; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
-; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm14[0],xmm3[1],xmm14[2,3,4,5,6,7]
-; AVX-NEXT: vpshufb %xmm6, %xmm13, %xmm13
-; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm3, %ymm13
+; AVX-NEXT: vpunpckhdq {{.*#+}} xmm14 = xmm1[2],xmm14[2],xmm1[3],xmm14[3]
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[2,2,2,2,4,5,6,7]
+; AVX-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,4]
+; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[2,3,2,3]
+; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0],xmm3[1],xmm15[2,3,4,5,6,7]
+; AVX-NEXT: vpshufb %xmm2, %xmm14, %xmm14
+; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm3, %ymm14
; AVX-NEXT: vmovaps {{.*#+}} ymm7 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0]
-; AVX-NEXT: vandps %ymm7, %ymm12, %ymm12
-; AVX-NEXT: vandnps %ymm13, %ymm7, %ymm13
-; AVX-NEXT: vorps %ymm13, %ymm12, %ymm0
+; AVX-NEXT: vandps %ymm7, %ymm13, %ymm13
+; AVX-NEXT: vandnps %ymm14, %ymm7, %ymm14
+; AVX-NEXT: vorps %ymm14, %ymm13, %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; AVX-NEXT: vpslldq {{.*#+}} xmm13 = zero,zero,zero,zero,zero,zero,xmm12[0,1,2,3,4,5,6,7,8,9]
-; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm12
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX-NEXT: vpslldq {{.*#+}} xmm13 = zero,zero,xmm4[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
-; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm4[2,2,3,3]
-; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm13, %ymm13
-; AVX-NEXT: vandnps %ymm12, %ymm10, %ymm12
-; AVX-NEXT: vandps %ymm10, %ymm13, %ymm13
-; AVX-NEXT: vorps %ymm12, %ymm13, %ymm12
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7]
+; AVX-NEXT: vpslldq {{.*#+}} xmm14 = zero,zero,zero,zero,zero,zero,xmm13[0,1,2,3,4,5,6,7,8,9]
+; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX-NEXT: vpslldq {{.*#+}} xmm14 = zero,zero,xmm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm3[2,2,3,3]
+; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14
+; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535]
+; AVX-NEXT: vandnps %ymm13, %ymm0, %ymm13
+; AVX-NEXT: vandps %ymm0, %ymm14, %ymm14
+; AVX-NEXT: vorps %ymm13, %ymm14, %ymm13
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[2,2,2,2,4,5,6,7]
-; AVX-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,5,4]
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm6[2,3,2,3]
-; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2,3,4,5,6,7]
-; AVX-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,2,2,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,1]
-; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[2,2,3,3]
-; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1],xmm15[2,3],xmm14[4,5,6,7]
-; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13
-; AVX-NEXT: vandnps %ymm12, %ymm5, %ymm12
-; AVX-NEXT: vandps %ymm5, %ymm13, %ymm13
-; AVX-NEXT: vorps %ymm12, %ymm13, %ymm5
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[2,2,2,2,4,5,6,7]
+; AVX-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,5,4]
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm4[2,3,2,3]
+; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1],xmm14[2,3,4,5,6,7]
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,1,2,2,4,5,6,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,2,1]
+; AVX-NEXT: vinsertps {{.*#+}} xmm15 = xmm15[0],xmm4[2],xmm15[2,3]
+; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm15, %ymm14
+; AVX-NEXT: vandnps %ymm13, %ymm5, %ymm13
+; AVX-NEXT: vandps %ymm5, %ymm14, %ymm14
+; AVX-NEXT: vorps %ymm13, %ymm14, %ymm5
; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vpsrlq $48, %xmm2, %xmm12
-; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm12 = xmm3[1],xmm12[1]
-; AVX-NEXT: vpermilps $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
-; AVX-NEXT: # xmm13 = mem[0,0,1,1]
-; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm12, %ymm12
+; AVX-NEXT: vpsrlq $48, %xmm12, %xmm13
+; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm13 = xmm2[1],xmm13[1]
+; AVX-NEXT: vpermilps $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
+; AVX-NEXT: # xmm14 = mem[0,0,1,1]
+; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm13, %ymm13
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm13 # 16-byte Folded Reload
-; AVX-NEXT: # xmm13 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
-; AVX-NEXT: vpsrldq {{.*#+}} xmm14 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,2,2,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,1]
-; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13
+; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm14 # 16-byte Folded Reload
+; AVX-NEXT: # xmm14 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3]
+; AVX-NEXT: vpsrldq {{.*#+}} xmm15 = xmm3[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,2,2,4,5,6,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,1]
+; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm15, %ymm14
; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
-; AVX-NEXT: vandnps %ymm12, %ymm2, %ymm12
-; AVX-NEXT: vandps %ymm2, %ymm13, %ymm13
-; AVX-NEXT: vorps %ymm12, %ymm13, %ymm12
-; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
-; AVX-NEXT: # xmm13 = mem[0,1,0,1]
-; AVX-NEXT: vshufps {{.*#+}} xmm14 = xmm11[0,0,0,0]
-; AVX-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3]
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm14
-; AVX-NEXT: vpunpckhdq {{.*#+}} xmm14 = xmm1[2],xmm14[2],xmm1[3],xmm14[3]
-; AVX-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm14[4],xmm6[5],xmm14[5],xmm6[6],xmm14[6],xmm6[7],xmm14[7]
+; AVX-NEXT: vandnps %ymm13, %ymm2, %ymm13
+; AVX-NEXT: vandps %ymm2, %ymm14, %ymm14
+; AVX-NEXT: vorps %ymm13, %ymm14, %ymm13
+; AVX-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
+; AVX-NEXT: # xmm14 = mem[0,1,0,1]
+; AVX-NEXT: vshufps {{.*#+}} xmm15 = xmm11[0,0,0,0]
+; AVX-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3]
+; AVX-NEXT: vpsrld $16, %xmm0, %xmm15
+; AVX-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm1[2],xmm15[2],xmm1[3],xmm15[3]
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm4[4],xmm15[4],xmm4[5],xmm15[5],xmm4[6],xmm15[6],xmm4[7],xmm15[7]
; AVX-NEXT: vmovdqa {{.*#+}} xmm15 = [8,9,8,9,8,9,8,9,12,13,6,7,10,11,12,13]
; AVX-NEXT: vpshufb %xmm15, %xmm5, %xmm5
-; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm5, %ymm5
-; AVX-NEXT: vandps %ymm8, %ymm12, %ymm12
-; AVX-NEXT: vandnps %ymm5, %ymm8, %ymm5
-; AVX-NEXT: vorps %ymm5, %ymm12, %ymm0
+; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm5, %ymm5
+; AVX-NEXT: vandps %ymm6, %ymm13, %ymm13
+; AVX-NEXT: vandnps %ymm5, %ymm6, %ymm5
+; AVX-NEXT: vorps %ymm5, %ymm13, %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vpsrlq $48, %xmm0, %xmm5
@@ -9987,68 +9930,68 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[2,2,3,3]
-; AVX-NEXT: vpsrldq {{.*#+}} xmm13 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm12, %ymm12
+; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[2,2,3,3]
+; AVX-NEXT: vpsrldq {{.*#+}} xmm14 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm13, %ymm13
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535]
; AVX-NEXT: vandnps %ymm5, %ymm0, %ymm5
-; AVX-NEXT: vandps %ymm0, %ymm12, %ymm12
-; AVX-NEXT: vorps %ymm5, %ymm12, %ymm5
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT: vpsrld $16, %xmm1, %xmm12
+; AVX-NEXT: vandps %ymm0, %ymm13, %ymm13
+; AVX-NEXT: vorps %ymm5, %ymm13, %ymm5
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT: vpunpckhdq {{.*#+}} xmm12 = xmm0[2],xmm12[2],xmm0[3],xmm12[3]
-; AVX-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[2,2,2,2,4,5,6,7]
-; AVX-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,5,4]
+; AVX-NEXT: vpsrld $16, %xmm0, %xmm13
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX-NEXT: vpunpckhdq {{.*#+}} xmm13 = xmm1[2],xmm13[2],xmm1[3],xmm13[3]
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm11[4],xmm13[4],xmm11[5],xmm13[5],xmm11[6],xmm13[6],xmm11[7],xmm13[7]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[2,2,2,2,4,5,6,7]
+; AVX-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,5,4]
; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[2,3,2,3]
-; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm13[0],xmm2[1],xmm13[2,3,4,5,6,7]
-; AVX-NEXT: vpshufb %xmm15, %xmm12, %xmm12
-; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm2, %ymm2
+; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm14[0],xmm2[1],xmm14[2,3,4,5,6,7]
+; AVX-NEXT: vpshufb %xmm15, %xmm13, %xmm13
+; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm2, %ymm2
; AVX-NEXT: vandps %ymm7, %ymm5, %ymm5
; AVX-NEXT: vandnps %ymm2, %ymm7, %ymm2
; AVX-NEXT: vorps %ymm2, %ymm5, %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7]
+; AVX-NEXT: vpslldq {{.*#+}} xmm13 = zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9]
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm13, %ymm2
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; AVX-NEXT: vpslldq {{.*#+}} xmm12 = zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9]
-; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm12, %ymm2
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX-NEXT: vpslldq {{.*#+}} xmm12 = zero,zero,xmm5[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
-; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm5[2,2,3,3]
-; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm12, %ymm12
-; AVX-NEXT: vandnps %ymm2, %ymm10, %ymm2
-; AVX-NEXT: vandps %ymm10, %ymm12, %ymm12
-; AVX-NEXT: vorps %ymm2, %ymm12, %ymm2
+; AVX-NEXT: vpslldq {{.*#+}} xmm13 = zero,zero,xmm4[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm4[2,2,3,3]
+; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm13, %ymm13
+; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535]
+; AVX-NEXT: vandnps %ymm2, %ymm0, %ymm2
+; AVX-NEXT: vandps %ymm0, %ymm13, %ymm13
+; AVX-NEXT: vorps %ymm2, %ymm13, %ymm2
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[2,2,2,2,4,5,6,7]
-; AVX-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,4]
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[2,2,2,2,4,5,6,7]
+; AVX-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,5,4]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[2,3,2,3]
-; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1],xmm12[2,3,4,5,6,7]
-; AVX-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,2,2,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[0,1,2,1]
-; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[2,2,3,3]
-; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm14[2,3],xmm13[4,5,6,7]
-; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm12
+; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[2,3,2,3]
+; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1],xmm13[2,3,4,5,6,7]
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,2,2,4,5,6,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[0,1,2,1]
+; AVX-NEXT: vinsertps {{.*#+}} xmm14 = xmm14[0],xmm0[2],xmm14[2,3]
+; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13
; AVX-NEXT: vmovaps {{.*#+}} ymm14 = [65535,65535,65535,0,0,0,0,65535,65535,65535,0,0,0,0,65535,65535]
; AVX-NEXT: vandnps %ymm2, %ymm14, %ymm2
-; AVX-NEXT: vandps %ymm14, %ymm12, %ymm12
-; AVX-NEXT: vorps %ymm2, %ymm12, %ymm2
-; AVX-NEXT: vpsrlq $48, %xmm4, %xmm11
+; AVX-NEXT: vandps %ymm14, %ymm13, %ymm13
+; AVX-NEXT: vorps %ymm2, %ymm13, %ymm2
+; AVX-NEXT: vpsrlq $48, %xmm11, %xmm11
; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm3[1],xmm11[1]
; AVX-NEXT: vpermilps $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
; AVX-NEXT: # xmm12 = mem[0,0,1,1]
; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm11, %ymm11
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX-NEXT: vmovdqa (%rsp), %xmm3 # 16-byte Reload
; AVX-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm12 # 16-byte Folded Reload
; AVX-NEXT: # xmm12 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3]
-; AVX-NEXT: vpsrldq {{.*#+}} xmm13 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT: vpsrldq {{.*#+}} xmm13 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,2,2,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,1]
; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm12
@@ -10056,22 +9999,20 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vandnps %ymm11, %ymm3, %ymm11
; AVX-NEXT: vandps %ymm3, %ymm12, %ymm12
; AVX-NEXT: vorps %ymm11, %ymm12, %ymm11
-; AVX-NEXT: vpshufd $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
-; AVX-NEXT: # xmm10 = mem[0,1,0,1]
-; AVX-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload
-; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[0,0,0,0]
+; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,0,1]
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm13[0,0,0,0]
; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5],xmm12[6,7]
; AVX-NEXT: vpsrld $16, %xmm1, %xmm12
-; AVX-NEXT: vpunpckhdq {{.*#+}} xmm12 = xmm6[2],xmm12[2],xmm6[3],xmm12[3]
+; AVX-NEXT: vpunpckhdq {{.*#+}} xmm12 = xmm5[2],xmm12[2],xmm5[3],xmm12[3]
; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7]
; AVX-NEXT: vpshufb %xmm15, %xmm1, %xmm1
; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm1, %ymm1
-; AVX-NEXT: vandps %ymm8, %ymm11, %ymm10
-; AVX-NEXT: vandnps %ymm1, %ymm8, %ymm1
+; AVX-NEXT: vandps %ymm6, %ymm11, %ymm10
+; AVX-NEXT: vandnps %ymm1, %ymm6, %ymm1
; AVX-NEXT: vorps %ymm1, %ymm10, %ymm1
-; AVX-NEXT: vpsrlq $48, %xmm9, %xmm4
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm0[1],xmm4[1]
+; AVX-NEXT: vpsrlq $48, %xmm8, %xmm4
+; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm9[1],xmm4[1]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
@@ -10086,12 +10027,12 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vpsrld $16, %xmm3, %xmm8
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm0[2],xmm8[2],xmm0[3],xmm8[3]
-; AVX-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
+; AVX-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm13[4],xmm8[4],xmm13[5],xmm8[5],xmm13[6],xmm8[6],xmm13[7],xmm8[7]
; AVX-NEXT: vpshufb %xmm15, %xmm8, %xmm6
; AVX-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[2,2,2,2,4,5,6,7]
; AVX-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,5,4]
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[2,3,2,3]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm13[2,3,2,3]
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0],xmm0[1],xmm8[2,3,4,5,6,7]
; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm0
; AVX-NEXT: vandps %ymm7, %ymm4, %ymm4
@@ -10121,8 +10062,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,2,2,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,1]
-; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[2,2,3,3]
-; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3],xmm6[4,5,6,7]
+; AVX-NEXT: vinsertps {{.*#+}} xmm6 = xmm6[0],xmm5[2],xmm6[2,3]
; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4
; AVX-NEXT: vandnps %ymm3, %ymm14, %ymm3
; AVX-NEXT: vandps %ymm4, %ymm14, %ymm4
@@ -10188,7 +10128,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vmovaps %xmm0, 848(%rax)
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovaps %xmm0, 832(%rax)
-; AVX-NEXT: addq $1496, %rsp # imm = 0x5D8
+; AVX-NEXT: addq $1448, %rsp # imm = 0x5A8
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
@@ -12522,2661 +12462,2631 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
;
; AVX512-LABEL: store_i16_stride7_vf64:
; AVX512: # %bb.0:
-; AVX512-NEXT: subq $2840, %rsp # imm = 0xB18
-; AVX512-NEXT: vmovdqa 96(%rcx), %ymm6
-; AVX512-NEXT: vmovdqa 96(%rdx), %ymm13
-; AVX512-NEXT: vmovdqa 96(%rdi), %ymm7
-; AVX512-NEXT: vmovdqa 96(%rsi), %ymm8
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
-; AVX512-NEXT: vpshufb %ymm0, %ymm6, %ymm2
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpshufb %ymm1, %ymm13, %ymm3
-; AVX512-NEXT: vporq %ymm2, %ymm3, %ymm17
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX512-NEXT: vpshufb %ymm10, %ymm8, %ymm2
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19]
-; AVX512-NEXT: vpshufb %ymm11, %ymm7, %ymm3
-; AVX512-NEXT: vporq %ymm2, %ymm3, %ymm18
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
-; AVX512-NEXT: vmovdqa 64(%r9), %ymm2
-; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vpshufb %ymm12, %ymm2, %ymm2
-; AVX512-NEXT: vmovdqa 64(%r8), %ymm3
-; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
-; AVX512-NEXT: vpshufb %ymm9, %ymm3, %ymm3
-; AVX512-NEXT: vpor %ymm2, %ymm3, %ymm2
-; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vmovdqa 64(%rcx), %ymm3
-; AVX512-NEXT: vpshufb %ymm0, %ymm3, %ymm2
-; AVX512-NEXT: vmovdqa64 %ymm3, %ymm26
-; AVX512-NEXT: vmovdqa 64(%rdx), %ymm4
-; AVX512-NEXT: vpshufb %ymm1, %ymm4, %ymm3
-; AVX512-NEXT: vmovdqa64 %ymm4, %ymm27
-; AVX512-NEXT: vpor %ymm2, %ymm3, %ymm2
-; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vmovdqa 64(%rsi), %ymm3
-; AVX512-NEXT: vpshufb %ymm10, %ymm3, %ymm2
-; AVX512-NEXT: vmovdqa64 %ymm3, %ymm23
-; AVX512-NEXT: vmovdqa 64(%rdi), %ymm4
-; AVX512-NEXT: vpshufb %ymm11, %ymm4, %ymm3
-; AVX512-NEXT: vmovdqa64 %ymm4, %ymm22
-; AVX512-NEXT: vpor %ymm2, %ymm3, %ymm2
-; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vmovdqa (%r9), %ymm2
-; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vpshufb %ymm12, %ymm2, %ymm2
-; AVX512-NEXT: vmovdqa (%r8), %ymm3
-; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vpshufb %ymm9, %ymm3, %ymm3
-; AVX512-NEXT: vpor %ymm2, %ymm3, %ymm2
-; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vmovdqa (%rcx), %ymm2
-; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vpshufb %ymm0, %ymm2, %ymm2
-; AVX512-NEXT: vmovdqa (%rdx), %ymm3
-; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vpshufb %ymm1, %ymm3, %ymm3
-; AVX512-NEXT: vpor %ymm2, %ymm3, %ymm2
-; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vmovdqa (%rsi), %ymm2
-; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vpshufb %ymm10, %ymm2, %ymm2
-; AVX512-NEXT: vmovdqa (%rdi), %ymm3
-; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vpshufb %ymm11, %ymm3, %ymm3
-; AVX512-NEXT: vpor %ymm2, %ymm3, %ymm2
-; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vmovdqa 32(%rcx), %ymm4
-; AVX512-NEXT: vpshufb %ymm0, %ymm4, %ymm0
-; AVX512-NEXT: vmovdqa 32(%rdx), %ymm2
-; AVX512-NEXT: vpshufb %ymm1, %ymm2, %ymm1
-; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0
-; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa 32(%rsi), %ymm5
-; AVX512-NEXT: vpshufb %ymm10, %ymm5, %ymm0
-; AVX512-NEXT: vmovdqa 32(%rdi), %ymm3
-; AVX512-NEXT: vpshufb %ymm11, %ymm3, %ymm1
-; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0
-; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa 32(%r8), %ymm1
-; AVX512-NEXT: vpshufb %ymm9, %ymm1, %ymm9
-; AVX512-NEXT: vmovdqa 32(%r9), %ymm0
-; AVX512-NEXT: vpshufb %ymm12, %ymm0, %ymm10
-; AVX512-NEXT: vpor %ymm10, %ymm9, %ymm9
-; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
-; AVX512-NEXT: # ymm10 = mem[0,1,0,1]
-; AVX512-NEXT: vpshufb %ymm10, %ymm6, %ymm9
-; AVX512-NEXT: vmovdqa64 %ymm10, %ymm31
-; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm13[2,2,2,2,6,6,6,6]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7,8,9],ymm10[10],ymm9[11,12],ymm10[13],ymm9[14,15]
-; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3]
-; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm9 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm13[3,3,3,3,7,7,7,7]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6,7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13,14,15]
-; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3]
-; AVX512-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm9 = ymm8[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm7[2,2,2,2,6,6,6,6]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6,7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13,14,15]
-; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3]
-; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm9 = ymm8[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm7[3,3,3,3,7,7,7,7]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7,8,9,10],ymm9[11],ymm10[12,13],ymm9[14],ymm10[15]
-; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,3,3]
-; AVX512-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vmovdqa 96(%r9), %ymm9
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm10 = ymm9[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14]
-; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm10[3,3,3,3]
-; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm9[2,1,2,3,6,5,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15]
-; AVX512-NEXT: vpermq {{.*#+}} ymm30 = ymm10[2,2,2,2]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm13[0,1,1,3,4,5,5,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,0,0,0,4,4,4,4]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm10[0,1],ymm6[2],ymm10[3,4],ymm6[5],ymm10[6,7,8,9],ymm6[10],ymm10[11,12],ymm6[13],ymm10[14,15]
-; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,2]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,0,2,1,4,4,6,5]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7,8,9],ymm7[10],ymm8[11,12],ymm7[13],ymm8[14,15]
-; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3]
-; AVX512-NEXT: vprold $16, %ymm9, %ymm8
-; AVX512-NEXT: vpshufb %ymm12, %ymm9, %ymm9
-; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm17, %zmm6
-; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm18, %zmm7
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = zmm6 ^ (mem & (zmm7 ^ zmm6))
-; AVX512-NEXT: vmovdqa 96(%r8), %ymm6
-; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm6[1,2,2,3,5,6,6,7]
-; AVX512-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm6[0,0,2,1,4,4,6,5]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm6[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm6[16,17,u,u,u,u],zero,zero
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 | (ymm7 & ymm11)
-; AVX512-NEXT: vmovdqa64 %zmm11, %zmm12
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm6 & ymm11)
-; AVX512-NEXT: vextracti64x4 $1, %zmm7, %ymm6
-; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm10[2,1,3,3]
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (ymm10 & (ymm7 ^ ymm6))
-; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm8[2,2,2,2]
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (ymm8 & (ymm6 ^ ymm7))
-; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm9[0,1,2,3],zmm6[0,1,2,3]
-; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,5,0,0,0,6,0,0,6,0,0,0,7,0,0,7]
-; AVX512-NEXT: vmovdqa 96(%rax), %ymm6
-; AVX512-NEXT: vpermd %zmm6, %zmm18, %zmm7
-; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm6[0,1,1,3,4,5,5,7]
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
-; AVX512-NEXT: vpshufb %ymm11, %ymm6, %ymm6
-; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3]
-; AVX512-NEXT: vpandn %ymm7, %ymm12, %ymm7
-; AVX512-NEXT: vmovdqa64 %zmm12, %zmm19
-; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6
-; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm28 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
-; AVX512-NEXT: vpbroadcastd 72(%rax), %ymm6
-; AVX512-NEXT: vpandnq %ymm6, %ymm28, %ymm6
-; AVX512-NEXT: vmovdqa 64(%rax), %ymm7
-; AVX512-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vpshufb %ymm11, %ymm7, %ymm7
-; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6
-; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa 64(%r9), %xmm7
-; AVX512-NEXT: vmovdqa 64(%r8), %xmm8
-; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
-; AVX512-NEXT: vmovdqa64 %xmm8, %xmm17
-; AVX512-NEXT: vmovdqa64 %xmm7, %xmm20
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512-NEXT: vpshufb %xmm15, %xmm6, %xmm6
-; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1]
-; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa 64(%rcx), %xmm9
-; AVX512-NEXT: vmovdqa 64(%rdx), %xmm7
-; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,3,3,4,5,6,7]
-; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1]
-; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa 64(%rdi), %xmm14
-; AVX512-NEXT: vmovdqa 64(%rsi), %xmm10
-; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,5,4]
-; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,3]
-; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vpbroadcastd 8(%rax), %ymm8
-; AVX512-NEXT: vpandnq %ymm8, %ymm28, %ymm8
-; AVX512-NEXT: vmovdqa (%rax), %ymm12
-; AVX512-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vpshufb %ymm11, %ymm12, %ymm13
-; AVX512-NEXT: vinserti64x4 $1, %ymm13, %zmm8, %zmm8
-; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa (%r9), %xmm6
-; AVX512-NEXT: vmovdqa (%r8), %xmm12
-; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7]
-; AVX512-NEXT: vmovdqa64 %xmm12, %xmm29
-; AVX512-NEXT: vmovdqa64 %xmm6, %xmm24
-; AVX512-NEXT: vpshufb %xmm15, %xmm8, %xmm8
-; AVX512-NEXT: vmovdqa64 %xmm15, %xmm25
-; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1]
-; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa (%rcx), %xmm8
-; AVX512-NEXT: vmovdqa (%rdx), %xmm13
-; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm13[4],xmm8[4],xmm13[5],xmm8[5],xmm13[6],xmm8[6],xmm13[7],xmm8[7]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,2,3,3,4,5,6,7]
-; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm15[0,0,2,1]
-; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa (%rdi), %xmm6
-; AVX512-NEXT: vmovdqa (%rsi), %xmm12
-; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7]
-; AVX512-NEXT: vmovdqa64 %xmm12, %xmm21
-; AVX512-NEXT: vmovdqa64 %xmm6, %xmm16
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[2,1,2,3,4,5,6,7]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,4]
-; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm15[0,0,1,3]
-; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm15 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[0,0,0,0,4,4,4,4]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm2[0,1,1,3,4,5,5,7]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm15[2],ymm12[3,4],ymm15[5],ymm12[6,7,8,9],ymm15[10],ymm12[11,12],ymm15[13],ymm12[14,15]
-; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,2]
-; AVX512-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm12 = ymm5[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,0,2,1,4,4,6,5]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm15 = ymm3[1,1,1,1,5,5,5,5]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm15[2],ymm12[3,4],ymm15[5],ymm12[6,7,8,9],ymm15[10],ymm12[11,12],ymm15[13],ymm12[14,15]
-; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3]
-; AVX512-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vmovdqa 32(%rax), %ymm15
-; AVX512-NEXT: vpshufb %ymm11, %ymm15, %ymm11
-; AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm15[0,1,1,3,4,5,5,7]
-; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3]
-; AVX512-NEXT: vpandnq %ymm12, %ymm19, %ymm12
-; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11
-; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm11 = ymm0[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[0,0,0,0,4,4,4,4]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[0,0,2,1,4,4,6,5]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7,8,9,10],ymm12[11],ymm11[12,13],ymm12[14],ymm11[15]
-; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,3]
-; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa64 %ymm31, %ymm6
-; AVX512-NEXT: vpshufb %ymm6, %ymm4, %ymm11
-; AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm2[2,2,2,2,6,6,6,6]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7,8,9],ymm12[10],ymm11[11,12],ymm12[13],ymm11[14,15]
-; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3]
-; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512-NEXT: subq $2648, %rsp # imm = 0xA58
+; AVX512-NEXT: vmovdqa 96(%rcx), %ymm3
+; AVX512-NEXT: vmovdqa 96(%rdx), %ymm2
+; AVX512-NEXT: vmovdqa 96(%rdi), %ymm1
+; AVX512-NEXT: vmovdqa 96(%rsi), %ymm0
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
+; AVX512-NEXT: vpshufb %ymm8, %ymm3, %ymm4
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpshufb %ymm9, %ymm2, %ymm5
+; AVX512-NEXT: vporq %ymm4, %ymm5, %ymm22
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-NEXT: vpshufb %ymm12, %ymm0, %ymm4
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm14 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19]
+; AVX512-NEXT: vpshufb %ymm14, %ymm1, %ymm5
+; AVX512-NEXT: vporq %ymm4, %ymm5, %ymm23
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
+; AVX512-NEXT: vmovdqa 64(%r9), %ymm4
+; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vpshufb %ymm7, %ymm4, %ymm4
+; AVX512-NEXT: vmovdqa 64(%r8), %ymm5
+; AVX512-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
+; AVX512-NEXT: vpshufb %ymm11, %ymm5, %ymm5
+; AVX512-NEXT: vpor %ymm4, %ymm5, %ymm4
+; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vmovdqa 64(%rcx), %ymm5
+; AVX512-NEXT: vpshufb %ymm8, %ymm5, %ymm4
+; AVX512-NEXT: vmovdqa64 %ymm5, %ymm29
+; AVX512-NEXT: vmovdqa 64(%rdx), %ymm6
+; AVX512-NEXT: vpshufb %ymm9, %ymm6, %ymm5
+; AVX512-NEXT: vmovdqa64 %ymm6, %ymm31
+; AVX512-NEXT: vpor %ymm4, %ymm5, %ymm4
+; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vmovdqa 64(%rsi), %ymm5
+; AVX512-NEXT: vpshufb %ymm12, %ymm5, %ymm4
+; AVX512-NEXT: vmovdqa64 %ymm5, %ymm20
+; AVX512-NEXT: vmovdqa 64(%rdi), %ymm6
+; AVX512-NEXT: vpshufb %ymm14, %ymm6, %ymm5
+; AVX512-NEXT: vmovdqa64 %ymm6, %ymm21
+; AVX512-NEXT: vpor %ymm4, %ymm5, %ymm4
+; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vmovdqa (%r9), %ymm4
+; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vpshufb %ymm7, %ymm4, %ymm4
+; AVX512-NEXT: vmovdqa (%r8), %ymm5
+; AVX512-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vpshufb %ymm11, %ymm5, %ymm5
+; AVX512-NEXT: vmovdqa64 %ymm11, %ymm27
+; AVX512-NEXT: vpor %ymm4, %ymm5, %ymm4
+; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vmovdqa (%rcx), %ymm4
+; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vpshufb %ymm8, %ymm4, %ymm4
+; AVX512-NEXT: vmovdqa (%rdx), %ymm5
+; AVX512-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vpshufb %ymm9, %ymm5, %ymm5
+; AVX512-NEXT: vpor %ymm4, %ymm5, %ymm4
+; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vmovdqa (%rsi), %ymm4
+; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vpshufb %ymm12, %ymm4, %ymm4
+; AVX512-NEXT: vmovdqa (%rdi), %ymm5
+; AVX512-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vpshufb %ymm14, %ymm5, %ymm5
+; AVX512-NEXT: vpor %ymm4, %ymm5, %ymm4
+; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vmovdqa 32(%rcx), %ymm11
+; AVX512-NEXT: vpshufb %ymm8, %ymm11, %ymm4
+; AVX512-NEXT: vmovdqa 32(%rdx), %ymm8
+; AVX512-NEXT: vpshufb %ymm9, %ymm8, %ymm5
+; AVX512-NEXT: vpor %ymm4, %ymm5, %ymm4
+; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa 32(%rsi), %ymm10
+; AVX512-NEXT: vpshufb %ymm12, %ymm10, %ymm4
+; AVX512-NEXT: vmovdqa 32(%rdi), %ymm12
+; AVX512-NEXT: vpshufb %ymm14, %ymm12, %ymm5
+; AVX512-NEXT: vpor %ymm4, %ymm5, %ymm4
+; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa 32(%r8), %ymm15
+; AVX512-NEXT: vmovdqa64 %ymm27, %ymm4
+; AVX512-NEXT: vpshufb %ymm4, %ymm15, %ymm4
+; AVX512-NEXT: vmovdqa 32(%r9), %ymm13
+; AVX512-NEXT: vpshufb %ymm7, %ymm13, %ymm5
+; AVX512-NEXT: vpor %ymm5, %ymm4, %ymm4
+; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
+; AVX512-NEXT: # ymm14 = mem[0,1,0,1]
+; AVX512-NEXT: vpshufb %ymm14, %ymm3, %ymm4
+; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[2,2,2,2,6,6,6,6]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15]
+; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
+; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm3[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6,7,8],ymm4[9],ymm2[10,11],ymm4[12],ymm2[13,14,15]
-; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[2,2,2,2,6,6,6,6]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5,6,7,8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13,14,15]
-; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[3,3,3,3,7,7,7,7]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm5[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8,9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15]
-; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3]
-; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vprold $16, %ymm0, %ymm2
-; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[1,2,2,3,5,6,6,7]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[3,3,3,3,7,7,7,7]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15]
+; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
+; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm0[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm1[2,2,2,2,6,6,6,6]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15]
+; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
+; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm0[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm1[3,3,3,3,7,7,7,7]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8,9,10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15]
+; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3]
+; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vmovdqa 96(%r9), %ymm4
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm5 = ymm4[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14]
+; AVX512-NEXT: vpermq {{.*#+}} ymm27 = ymm5[3,3,3,3]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm4[2,1,2,3,6,5,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15]
+; AVX512-NEXT: vpermq {{.*#+}} ymm28 = ymm5[2,2,2,2]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,5,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15]
; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,2]
-; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,3,3,3,7,7,7,7]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,3,6,6,6,7]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
-; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
-; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vpermd %zmm15, %zmm18, %zmm0
-; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa64 %ymm23, %ymm12
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm12[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,1,1,5,5,5,5]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm22[1,1,1,1,5,5,5,5]
; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
+; AVX512-NEXT: vprold $16, %ymm4, %ymm1
+; AVX512-NEXT: vpshufb %ymm7, %ymm4, %ymm3
+; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm22, %zmm2
+; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm23, %zmm0
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm2 ^ (mem & (zmm0 ^ zmm2))
+; AVX512-NEXT: vmovdqa 96(%r8), %ymm2
+; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm2[1,2,2,3,5,6,6,7]
+; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm2[0,0,2,1,4,4,6,5]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[16,17,u,u,u,u],zero,zero
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm0 & ymm5)
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm30 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm2 & ymm30)
+; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm4[2,1,3,3]
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (ymm4 & (ymm2 ^ ymm0))
+; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm1[2,2,2,2]
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm26 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm26 & (ymm0 ^ ymm2))
+; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm3[0,1,2,3],zmm0[0,1,2,3]
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm12[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm22[2,2,2,2,6,6,6,6]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15]
-; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vmovdqa64 %ymm26, %ymm2
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm27[0,1,1,3,4,5,5,7]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
-; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2]
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
+; AVX512-NEXT: vpbroadcastd 72(%rax), %ymm0
+; AVX512-NEXT: vpandn %ymm0, %ymm1, %ymm0
+; AVX512-NEXT: vmovdqa64 %zmm1, %zmm6
+; AVX512-NEXT: vmovdqa 64(%rax), %ymm2
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
+; AVX512-NEXT: vpshufb %ymm3, %ymm2, %ymm1
+; AVX512-NEXT: vmovdqa64 %ymm2, %ymm16
+; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vpshufb %ymm6, %ymm2, %ymm0
-; AVX512-NEXT: vmovdqa64 %ymm26, %ymm3
-; AVX512-NEXT: vmovdqa64 %ymm31, %ymm11
-; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm27[2,2,2,2,6,6,6,6]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
-; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
-; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm5[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
-; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[0,0,2,1,4,4,6,5]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8,9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
-; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3]
+; AVX512-NEXT: vmovdqa 64(%r9), %xmm1
+; AVX512-NEXT: vmovdqa 64(%r8), %xmm2
+; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX512-NEXT: vmovdqa64 %xmm2, %xmm22
+; AVX512-NEXT: vmovdqa64 %xmm1, %xmm19
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512-NEXT: vpshufb %xmm9, %xmm0, %xmm0
+; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vprold $16, %ymm5, %ymm0
-; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[1,2,2,3,5,6,6,7]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
-; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2]
+; AVX512-NEXT: vmovdqa 64(%rcx), %xmm5
+; AVX512-NEXT: vmovdqa 64(%rdx), %xmm7
+; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7]
+; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm2[0,0,2,1]
+; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa 64(%rdi), %xmm0
+; AVX512-NEXT: vmovdqa 64(%rsi), %xmm2
+; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; AVX512-NEXT: vmovdqa64 %xmm0, %xmm17
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7]
+; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,4]
+; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm4[0,0,1,3]
+; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa (%rax), %ymm0
; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm27[3,3,3,3,7,7,7,7]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm3[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14,15]
-; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX512-NEXT: vpshufb %ymm3, %ymm0, %ymm3
+; AVX512-NEXT: vpbroadcastd 8(%rax), %ymm4
+; AVX512-NEXT: vpandn %ymm4, %ymm6, %ymm4
+; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa 96(%rcx), %xmm0
-; AVX512-NEXT: vmovdqa 96(%rdx), %xmm1
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,2,4,5,6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
-; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm22[3,3,3,3,7,7,7,7]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm12[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8,9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15]
-; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3]
-; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa 96(%rsi), %xmm2
-; AVX512-NEXT: vmovdqa 96(%rdi), %xmm3
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512-NEXT: vmovdqa64 %xmm25, %xmm12
-; AVX512-NEXT: vpshufb %xmm12, %xmm4, %xmm4
-; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1]
-; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[3,3,3,3,7,7,7,7]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,2,2,3,6,6,6,7]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15]
-; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vprold $16, %xmm2, %xmm4
-; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,2,3]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7]
-; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
-; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm6 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
-; AVX512-NEXT: vpshufb %xmm6, %xmm0, %xmm2
-; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,2,2]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6],xmm2[7]
-; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512-NEXT: vmovdqa (%r9), %xmm0
+; AVX512-NEXT: vmovdqa (%r8), %xmm1
+; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512-NEXT: vmovdqa64 %xmm1, %xmm23
+; AVX512-NEXT: vmovdqa64 %xmm0, %xmm24
+; AVX512-NEXT: vpshufb %xmm9, %xmm3, %xmm3
+; AVX512-NEXT: vmovdqa64 %xmm9, %xmm25
+; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm3[0,0,1,1]
+; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa (%rcx), %xmm3
+; AVX512-NEXT: vmovdqa (%rdx), %xmm4
+; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,3,3,4,5,6,7]
+; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm6[0,0,2,1]
+; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512-NEXT: vmovdqa (%rsi), %xmm6
+; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
+; AVX512-NEXT: vmovdqa64 %xmm0, %xmm18
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[2,1,2,3,4,5,6,7]
+; AVX512-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,5,4]
+; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm9[0,0,1,3]
+; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vmovdqa %ymm14, %ymm1
+; AVX512-NEXT: vpshufb %ymm14, %ymm11, %ymm9
+; AVX512-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm8[2,2,2,2,6,6,6,6]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm14[2],ymm9[3,4],ymm14[5],ymm9[6,7,8,9],ymm14[10],ymm9[11,12],ymm14[13],ymm9[14,15]
+; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm9[2,2,2,3]
+; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm9 = ymm11[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm8[3,3,3,3,7,7,7,7]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm9 = ymm14[0],ymm9[1],ymm14[2,3],ymm9[4],ymm14[5,6,7,8],ymm9[9],ymm14[10,11],ymm9[12],ymm14[13,14,15]
+; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm9[0,2,2,3]
+; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm9 = ymm10[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6]
+; AVX512-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm12[2,2,2,2,6,6,6,6]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm9 = ymm14[0],ymm9[1],ymm14[2,3],ymm9[4],ymm14[5,6,7,8],ymm9[9],ymm14[10,11],ymm9[12],ymm14[13,14,15]
+; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm9[0,2,2,3]
+; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm9 = ymm10[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm12[3,3,3,3,7,7,7,7]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm9 = ymm14[0,1,2],ymm9[3],ymm14[4,5],ymm9[6],ymm14[7,8,9,10],ymm9[11],ymm14[12,13],ymm9[14],ymm14[15]
+; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm9[2,1,3,3]
+; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vprold $16, %ymm13, %ymm9
+; AVX512-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm15[1,2,2,3,5,6,6,7]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm9 = ymm14[0,1],ymm9[2],ymm14[3,4],ymm9[5],ymm14[6,7,8,9],ymm9[10],ymm14[11,12],ymm9[13],ymm14[14,15]
+; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm9[2,1,3,2]
+; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm9 = ymm13[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,3,6,6,6,7]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm15[3,3,3,3,7,7,7,7]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm14[2],ymm9[3,4],ymm14[5],ymm9[6,7,8,9],ymm14[10],ymm9[11,12],ymm14[13],ymm9[14,15]
+; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm9[2,2,2,3]
+; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vmovdqa64 %ymm20, %ymm12
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm9 = ymm12[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,0,2,1,4,4,6,5]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm21[1,1,1,1,5,5,5,5]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm14[2],ymm9[3,4],ymm14[5],ymm9[6,7,8,9],ymm14[10],ymm9[11,12],ymm14[13],ymm9[14,15]
+; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm9[2,2,2,3]
+; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm9 = ymm12[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
+; AVX512-NEXT: vmovdqa64 %ymm20, %ymm0
+; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm21[2,2,2,2,6,6,6,6]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm9 = ymm14[0],ymm9[1],ymm14[2,3],ymm9[4],ymm14[5,6,7,8],ymm9[9],ymm14[10,11],ymm9[12],ymm14[13,14,15]
+; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm9[0,2,2,3]
+; AVX512-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vmovdqa64 %ymm29, %ymm8
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm9 = ymm8[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,0,0,0,4,4,4,4]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm31[0,1,1,3,4,5,5,7]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm9 = ymm14[0,1],ymm9[2],ymm14[3,4],ymm9[5],ymm14[6,7,8,9],ymm9[10],ymm14[11,12],ymm9[13],ymm14[14,15]
+; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,3,2]
+; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vpshufb %ymm1, %ymm8, %ymm9
+; AVX512-NEXT: vmovdqa64 %ymm1, %ymm29
+; AVX512-NEXT: vmovdqa %ymm8, %ymm10
+; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm31[2,2,2,2,6,6,6,6]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm14[2],ymm9[3,4],ymm14[5],ymm9[6,7,8,9],ymm14[10],ymm9[11,12],ymm14[13],ymm9[14,15]
+; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm9[2,2,2,3]
+; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm9 = ymm8[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,0,0,0,4,4,4,4]
+; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Reload
+; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm20[0,0,2,1,4,4,6,5]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1,2],ymm14[3],ymm9[4,5],ymm14[6],ymm9[7,8,9,10],ymm14[11],ymm9[12,13],ymm14[14],ymm9[15]
+; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm9[2,2,3,3]
+; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vprold $16, %ymm8, %ymm9
+; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm20[1,2,2,3,5,6,6,7]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm9 = ymm14[0,1],ymm9[2],ymm14[3,4],ymm9[5],ymm14[6,7,8,9],ymm9[10],ymm14[11,12],ymm9[13],ymm14[14,15]
+; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm9[2,1,3,2]
+; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm31[3,3,3,3,7,7,7,7]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm14 = ymm10[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,2,2,2,6,6,6,6]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm14[1],ymm9[2,3],ymm14[4],ymm9[5,6,7,8],ymm14[9],ymm9[10,11],ymm14[12],ymm9[13,14,15]
+; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm9[0,2,2,3]
+; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa 96(%rcx), %xmm9
+; AVX512-NEXT: vmovdqa 96(%rdx), %xmm14
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm14[0],xmm9[0],xmm14[1],xmm9[1],xmm14[2],xmm9[2],xmm14[3],xmm9[3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,1,3,2,4,5,6,7]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[0,0,1,1]
+; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vpshufd {{.*#+}} ymm15 = ymm21[3,3,3,3,7,7,7,7]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm12 = ymm0[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,2,2,2,6,6,6,6]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm12 = ymm15[0,1,2],ymm12[3],ymm15[4,5],ymm12[6],ymm15[7,8,9,10],ymm12[11],ymm15[12,13],ymm12[14],ymm15[15]
+; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm12[2,1,3,3]
+; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa 96(%rsi), %xmm12
+; AVX512-NEXT: vmovdqa 96(%rdi), %xmm15
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3]
+; AVX512-NEXT: vmovdqa64 %xmm25, %xmm10
+; AVX512-NEXT: vpshufb %xmm10, %xmm11, %xmm11
+; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm11[0,0,1,1]
+; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vpshufd {{.*#+}} ymm11 = ymm20[3,3,3,3,7,7,7,7]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm13 = ymm8[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,2,2,3,6,6,6,7]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0,1],ymm11[2],ymm13[3,4],ymm11[5],ymm13[6,7,8,9],ymm11[10],ymm13[11,12],ymm11[13],ymm13[14,15]
+; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm11[2,2,2,3]
+; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vprold $16, %xmm12, %xmm11
+; AVX512-NEXT: vpshufd {{.*#+}} xmm13 = xmm15[1,1,2,3]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1],xmm11[2],xmm13[3,4],xmm11[5],xmm13[6,7]
+; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7]
; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vmovdqa 96(%r9), %xmm0
-; AVX512-NEXT: vmovdqa 96(%r8), %xmm1
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX512-NEXT: vpshufb %xmm12, %xmm0, %xmm0
-; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm18 = [16,18,19,19,19,19,0,0,0,1,0,1,2,3,2,3]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,7,6]
-; AVX512-NEXT: vpermt2d %zmm1, %zmm18, %zmm0
-; AVX512-NEXT: vpbroadcastd 100(%rax), %ymm1
-; AVX512-NEXT: vpbroadcastd 104(%rax), %ymm2
-; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm31
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm31 = zmm31 ^ (mem & (zmm31 ^ zmm0))
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3]
-; AVX512-NEXT: vpshufb %xmm6, %xmm9, %xmm1
-; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[1,1,2,2]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7]
-; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm2 = [16,16,17,17,17,17,0,0,0,1,0,1,2,3,2,3]
-; AVX512-NEXT: vpermt2d %zmm0, %zmm2, %zmm1
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3]
-; AVX512-NEXT: vpshufb %xmm12, %xmm0, %xmm0
-; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
-; AVX512-NEXT: vprold $16, %xmm10, %xmm3
-; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[1,1,2,3]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
-; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
-; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm14
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0]
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm14 = zmm1 ^ (zmm0 & (zmm14 ^ zmm1))
-; AVX512-NEXT: vmovdqa64 %xmm17, %xmm1
-; AVX512-NEXT: vmovdqa64 %xmm20, %xmm3
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,5,7,6]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7]
-; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,0,1,0,1,1,3,16,18,19,19,19,19,0,0]
-; AVX512-NEXT: vpermt2d %zmm3, %zmm4, %zmm1
-; AVX512-NEXT: vpbroadcastd 64(%rax), %ymm3
-; AVX512-NEXT: vpbroadcastd 68(%rax), %ymm5
-; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm25
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm25 = zmm25 ^ (zmm19 & (zmm25 ^ zmm1))
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3]
-; AVX512-NEXT: vpshufb %xmm6, %xmm8, %xmm3
-; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm13[1,1,2,2]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3],xmm3[4],xmm5[5,6],xmm3[7]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7]
-; AVX512-NEXT: vpermt2d %zmm1, %zmm2, %zmm3
-; AVX512-NEXT: vmovdqa64 %xmm16, %xmm5
-; AVX512-NEXT: vmovdqa64 %xmm21, %xmm2
+; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm1 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
+; AVX512-NEXT: vpshufb %xmm1, %xmm9, %xmm11
+; AVX512-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[1,1,2,2]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0],xmm11[1],xmm12[2,3],xmm11[4],xmm12[5,6],xmm11[7]
+; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vmovdqa 96(%r9), %xmm11
+; AVX512-NEXT: vmovdqa 96(%r8), %xmm12
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm15 = xmm13[0,1,3,2,4,5,6,7]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[0,1,1,3]
+; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm16[2,3,3,3,6,7,7,7]
+; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm14[4],xmm9[4],xmm14[5],xmm9[5],xmm14[6],xmm9[6],xmm14[7],xmm9[7]
+; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7]
+; AVX512-NEXT: vpshufhw {{.*#+}} xmm9 = xmm13[0,1,2,3,4,5,7,6]
+; AVX512-NEXT: vmovdqa64 %xmm25, %xmm14
+; AVX512-NEXT: vpshufb %xmm14, %xmm8, %xmm11
+; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [16,18,19,19,19,19,0,0,0,1,0,1,2,3,2,3]
+; AVX512-NEXT: vpermt2d %zmm9, %zmm8, %zmm11
+; AVX512-NEXT: vpbroadcastd 100(%rax), %ymm9
+; AVX512-NEXT: vpbroadcastd 104(%rax), %ymm12
+; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm9, %zmm21
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm26 & (zmm21 ^ zmm11))
+; AVX512-NEXT: vmovdqa64 %zmm26, %zmm25
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3]
+; AVX512-NEXT: vmovdqa %xmm1, %xmm12
+; AVX512-NEXT: vpshufb %xmm1, %xmm5, %xmm0
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[1,1,2,2]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4],xmm1[5,6],xmm0[7]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,1,3,2,4,5,6,7]
+; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm9 = [16,16,17,17,17,17,0,0,0,1,0,1,2,3,2,3]
+; AVX512-NEXT: vpermt2d %zmm1, %zmm9, %zmm0
+; AVX512-NEXT: vmovdqa64 %xmm17, %xmm5
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
-; AVX512-NEXT: vpshufb %xmm12, %xmm1, %xmm1
+; AVX512-NEXT: vpshufb %xmm14, %xmm1, %xmm1
; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1]
-; AVX512-NEXT: vprold $16, %xmm21, %xmm2
-; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm16[1,1,2,3]
+; AVX512-NEXT: vprold $16, %xmm2, %xmm2
+; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm17[1,1,2,3]
; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3,4],xmm2[5],xmm5[6,7]
; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
-; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm3 ^ (zmm0 & (zmm5 ^ zmm3))
-; AVX512-NEXT: vmovdqa64 %xmm29, %xmm0
+; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm2
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0]
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm0 ^ (zmm1 & (zmm2 ^ zmm0))
+; AVX512-NEXT: vmovdqa64 %xmm22, %xmm0
+; AVX512-NEXT: vmovdqa64 %xmm19, %xmm5
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
+; AVX512-NEXT: vpshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,5,7,6]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7]
+; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,1,0,1,0,1,1,3,16,18,19,19,19,19,0,0]
+; AVX512-NEXT: vpermt2d %zmm5, %zmm11, %zmm0
+; AVX512-NEXT: vpbroadcastd 64(%rax), %ymm5
+; AVX512-NEXT: vpbroadcastd 68(%rax), %ymm10
+; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm5, %zmm20
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm30 & (zmm20 ^ zmm0))
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512-NEXT: vpshufb %xmm12, %xmm3, %xmm3
+; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7]
+; AVX512-NEXT: vpermt2d %zmm0, %zmm9, %zmm3
+; AVX512-NEXT: vmovdqa64 %xmm18, %xmm5
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX512-NEXT: vpshufb %xmm14, %xmm0, %xmm0
+; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
+; AVX512-NEXT: vprold $16, %xmm6, %xmm4
+; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm18[1,1,2,3]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7]
+; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1]
+; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm10
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = zmm3 ^ (zmm1 & (zmm10 ^ zmm3))
+; AVX512-NEXT: vmovdqa64 %xmm23, %xmm0
; AVX512-NEXT: vmovdqa64 %xmm24, %xmm1
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,7,6]
; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7]
-; AVX512-NEXT: vpermt2d %zmm1, %zmm4, %zmm0
+; AVX512-NEXT: vpermt2d %zmm1, %zmm11, %zmm0
; AVX512-NEXT: vpbroadcastd (%rax), %ymm1
-; AVX512-NEXT: vpbroadcastd 4(%rax), %ymm2
-; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm20
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm19 & (zmm20 ^ zmm0))
-; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm6[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
+; AVX512-NEXT: vpbroadcastd 4(%rax), %ymm3
+; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm31
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm31 = zmm31 ^ (zmm30 & (zmm31 ^ zmm0))
+; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm11[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5]
-; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[1,1,1,1,5,5,5,5]
+; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[1,1,1,1,5,5,5,5]
; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm6[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm11[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[2,2,2,2,6,6,6,6]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[2,2,2,2,6,6,6,6]
; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15]
; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm9[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm6[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
-; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[0,1,1,3,4,5,5,7]
+; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[0,1,1,3,4,5,5,7]
; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2]
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vpshufb %ymm11, %ymm9, %ymm0
-; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[2,2,2,2,6,6,6,6]
-; AVX512-NEXT: vmovdqa %ymm3, %ymm11
-; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
-; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm7[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512-NEXT: vmovdqa64 %ymm29, %ymm0
+; AVX512-NEXT: vpshufb %ymm0, %ymm6, %ymm0
+; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[2,2,2,2,6,6,6,6]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
+; AVX512-NEXT: vpermq {{.*#+}} ymm29 = ymm0[2,2,2,3]
+; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
-; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[0,0,2,1,4,4,6,5]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7,8,9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15]
-; AVX512-NEXT: vprold $16, %ymm7, %ymm0
-; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[1,2,2,3,5,6,6,7]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm11[3,3,3,3,7,7,7,7]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6,7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13,14,15]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm10[3,3,3,3,7,7,7,7]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,2,2,2,6,6,6,6]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3],ymm0[4,5],ymm6[6],ymm0[7,8,9,10],ymm6[11],ymm0[12,13],ymm6[14],ymm0[15]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm8[3,3,3,3,7,7,7,7]
-; AVX512-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,2,2,3,6,6,6,7]
-; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7,8,9],ymm6[10],ymm7[11,12],ymm6[13],ymm7[14,15]
-; AVX512-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vpermq {{.*#+}} ymm22 = ymm1[2,2,2,3]
-; AVX512-NEXT: vpermq {{.*#+}} ymm26 = ymm3[2,2,3,3]
-; AVX512-NEXT: vpermq {{.*#+}} ymm24 = ymm4[2,1,3,2]
-; AVX512-NEXT: vmovdqa 32(%rdi), %xmm9
-; AVX512-NEXT: vmovdqa 32(%rsi), %xmm10
-; AVX512-NEXT: vprold $16, %xmm10, %xmm1
-; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[1,1,2,3]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7]
-; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vpermq {{.*#+}} ymm21 = ymm2[0,2,2,3]
-; AVX512-NEXT: vmovdqa 32(%r9), %xmm1
-; AVX512-NEXT: vmovdqa 32(%r8), %xmm2
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX512-NEXT: vpshufb %xmm12, %xmm2, %xmm3
-; AVX512-NEXT: vpshufb %xmm12, %xmm1, %xmm1
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,5,7,6]
-; AVX512-NEXT: vmovdqa64 %xmm6, %xmm27
-; AVX512-NEXT: vpermt2d %zmm2, %zmm18, %zmm1
-; AVX512-NEXT: vpbroadcastd 36(%rax), %ymm2
-; AVX512-NEXT: vpbroadcastd 40(%rax), %ymm4
-; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm13
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (mem & (zmm13 ^ zmm1))
-; AVX512-NEXT: vmovdqa 32(%rcx), %xmm7
+; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[0,0,2,1,4,4,6,5]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8,9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
+; AVX512-NEXT: vpermq {{.*#+}} ymm15 = ymm0[2,2,3,3]
+; AVX512-NEXT: vprold $16, %ymm4, %ymm0
+; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[1,2,2,3,5,6,6,7]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
+; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm0[2,1,3,2]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm7[3,3,3,3,7,7,7,7]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14,15]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[3,3,3,3,7,7,7,7]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm11[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7,8,9,10],ymm3[11],ymm1[12,13],ymm3[14],ymm1[15]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[3,3,3,3,7,7,7,7]
+; AVX512-NEXT: vpshufhw {{.*#+}} ymm5 = ymm4[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,2,2,3,6,6,6,7]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7,8,9],ymm1[10],ymm5[11,12],ymm1[13],ymm5[14,15]
+; AVX512-NEXT: vpermq {{.*#+}} ymm26 = ymm0[0,2,2,3]
+; AVX512-NEXT: vmovdqa 32(%rdi), %xmm13
+; AVX512-NEXT: vmovdqa 32(%rsi), %xmm12
+; AVX512-NEXT: vprold $16, %xmm12, %xmm0
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[1,1,2,3]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7]
+; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vmovdqa 32(%r9), %xmm0
+; AVX512-NEXT: vmovdqa 32(%r8), %xmm1
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
+; AVX512-NEXT: vmovdqa %xmm14, %xmm4
+; AVX512-NEXT: vpshufb %xmm14, %xmm1, %xmm14
+; AVX512-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,4,5,7,6]
+; AVX512-NEXT: vpermt2d %zmm1, %zmm8, %zmm0
+; AVX512-NEXT: vpbroadcastd 36(%rax), %ymm1
+; AVX512-NEXT: vpbroadcastd 40(%rax), %ymm6
+; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm1, %zmm30
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm30 = zmm30 ^ (zmm25 & (zmm30 ^ zmm0))
+; AVX512-NEXT: vmovdqa 32(%rcx), %xmm11
; AVX512-NEXT: vmovdqa 32(%rdx), %xmm6
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,2,4,5,6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[0,0,1,1]
-; AVX512-NEXT: vpermq {{.*#+}} ymm15 = ymm0[2,1,3,3]
-; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm3[0,0,1,1]
+; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,3,2,4,5,6,7]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm18 = xmm8[0,0,1,1]
+; AVX512-NEXT: vpermq {{.*#+}} ymm19 = ymm3[2,1,3,3]
+; AVX512-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,1,1]
+; AVX512-NEXT: vpermq {{.*#+}} ymm24 = ymm5[2,2,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm7[0,1,3,2,4,5,6,7]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm22 = xmm3[0,1,1,3]
+; AVX512-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload
+; AVX512-NEXT: # ymm23 = mem[2,3,3,3,6,7,7,7]
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 32-byte Folded Reload
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 32-byte Folded Reload
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm4 & (zmm1 ^ zmm3))
+; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 32-byte Folded Reload
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm8 & (zmm7 ^ zmm3))
; AVX512-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX512-NEXT: # ymm3 = mem[2,1,3,2]
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535]
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm29 & (ymm3 ^ ymm1))
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm30 = ymm30 ^ (ymm28 & (ymm30 ^ ymm3))
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535]
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm4 & (ymm3 ^ ymm7))
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm28 = ymm28 ^ (ymm16 & (ymm28 ^ ymm3))
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 32-byte Folded Reload
-; AVX512-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm3 # 64-byte Folded Reload
-; AVX512-NEXT: # zmm3 = (zmm3 & zmm28) | mem
+; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 32-byte Folded Reload
+; AVX512-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm5 # 64-byte Folded Reload
+; AVX512-NEXT: # zmm5 = (zmm5 & zmm16) | mem
+; AVX512-NEXT: vmovdqa64 %zmm16, %zmm0
+; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm16 # 32-byte Folded Reload
+; AVX512-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload
+; AVX512-NEXT: # zmm16 = (zmm16 & zmm0) | mem
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
-; AVX512-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm0 # 64-byte Folded Reload
-; AVX512-NEXT: # zmm0 = (zmm0 & zmm28) | mem
-; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm23 # 32-byte Folded Reload
-; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm18 # 32-byte Folded Reload
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm28 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm18 = zmm23 ^ (zmm28 & (zmm18 ^ zmm23))
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm18 = zmm3 ^ (zmm2 & (zmm18 ^ zmm3))
-; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload
-; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm23 # 32-byte Folded Reload
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm23 = zmm3 ^ (zmm28 & (zmm23 ^ zmm3))
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm23 = zmm0 ^ (zmm2 & (zmm23 ^ zmm0))
+; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm17 # 32-byte Folded Reload
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm25 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm17 = zmm0 ^ (zmm25 & (zmm17 ^ zmm0))
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm17 = zmm5 ^ (zmm1 & (zmm17 ^ zmm5))
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
-; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm17 # 32-byte Folded Reload
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm17 = zmm0 ^ (mem & (zmm17 ^ zmm0))
-; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm0
-; AVX512-NEXT: vpternlogd {{.*#+}} ymm0 = mem ^ (ymm19 & (ymm0 ^ mem))
+; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm5 # 32-byte Folded Reload
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm0 ^ (zmm25 & (zmm5 ^ zmm0))
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm16 ^ (zmm1 & (zmm5 ^ zmm16))
+; AVX512-NEXT: vextracti64x4 $1, %zmm7, %ymm0
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
+; AVX512-NEXT: vpternlogd {{.*#+}} ymm0 = mem ^ (ymm1 & (ymm0 ^ mem))
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
-; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (ymm1 & (ymm2 ^ ymm0))
-; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm19 # 64-byte Folded Reload
-; AVX512-NEXT: # zmm19 = zmm19 | (zmm1 & mem)
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm27 = ymm27 ^ (ymm1 & (ymm27 ^ ymm0))
+; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm28[0,1,2,3],zmm27[0,1,2,3]
+; AVX512-NEXT: vmovdqa64 64(%rax), %zmm16
+; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,13,0,0,0,14,0,0,14,0,0,0,15,0,0,15]
+; AVX512-NEXT: vpermd %zmm16, %zmm1, %zmm28
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm28 = zmm28 ^ (zmm3 & (zmm28 ^ zmm0))
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT: vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
-; AVX512-NEXT: # zmm0 = zmm0[0,1,2,3],mem[0,1,2,3]
-; AVX512-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
-; AVX512-NEXT: # zmm0 = (zmm0 & zmm1) | mem
-; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm30[0,1,2,3],zmm2[0,1,2,3]
-; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm30 = zmm30 ^ (zmm2 & (zmm30 ^ zmm1))
-; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
-; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm2 & (zmm11 ^ zmm1))
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm25 = zmm25 ^ (zmm1 & (zmm25 ^ zmm14))
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm1 & (zmm20 ^ zmm5))
-; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
-; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535]
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm1 ^ (zmm5 & (zmm2 ^ zmm1))
+; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
+; AVX512-NEXT: vmovdqa64 (%rax), %zmm27
+; AVX512-NEXT: vpermd %zmm27, %zmm1, %zmm7
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm3 & (zmm7 ^ zmm0))
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm0 & (zmm20 ^ zmm2))
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm31 = zmm31 ^ (zmm0 & (zmm31 ^ zmm10))
+; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535]
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm2 & (zmm1 ^ zmm0))
+; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512-NEXT: vinserti64x4 $1, %ymm22, %zmm3, %zmm22
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm22 = zmm1 ^ (zmm5 & (zmm22 ^ zmm1))
-; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
-; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0]
-; AVX512-NEXT: vpermd 64(%rax), %zmm14, %zmm5
+; AVX512-NEXT: vinserti64x4 $1, %ymm29, %zmm3, %zmm29
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm29 = zmm0 ^ (zmm2 & (zmm29 ^ zmm0))
+; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
+; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0]
+; AVX512-NEXT: vpermd %zmm16, %zmm10, %zmm2
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm3 & (zmm5 ^ zmm1))
-; AVX512-NEXT: vinserti64x4 $1, %ymm24, %zmm26, %zmm1
-; AVX512-NEXT: vpermd (%rax), %zmm14, %zmm14
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm3 & (zmm14 ^ zmm1))
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0]
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm1 & (zmm5 ^ zmm2))
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm1 & (zmm14 ^ zmm22))
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm3 & (zmm2 ^ zmm0))
+; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm15, %zmm0
+; AVX512-NEXT: vpermd %zmm27, %zmm10, %zmm10
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm3 & (zmm10 ^ zmm0))
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0]
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm0 & (zmm2 ^ zmm1))
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm0 & (zmm10 ^ zmm29))
+; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
+; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm9 # 32-byte Folded Reload
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (zmm8 & (zmm9 ^ zmm0))
+; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; AVX512-NEXT: # ymm0 = mem[0,1,1,3]
+; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
-; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm4 & (zmm2 ^ zmm1))
-; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX512-NEXT: # ymm1 = mem[0,1,1,3]
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm8 & (zmm1 ^ zmm0))
+; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm18[0,1,1,3]
+; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm26, %zmm0
+; AVX512-NEXT: vinserti64x4 $1, %ymm14, %zmm19, %zmm18
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm18 = zmm0 ^ (zmm8 & (zmm18 ^ zmm0))
+; AVX512-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; AVX512-NEXT: # ymm0 = mem[0,0,0,1]
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1
-; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm22 # 32-byte Folded Reload
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm22 = zmm1 ^ (zmm4 & (zmm22 ^ zmm1))
-; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm12[0,1,1,3]
-; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm21, %zmm1
-; AVX512-NEXT: vinserti64x4 $1, %ymm8, %zmm15, %zmm8
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = zmm1 ^ (zmm4 & (zmm8 ^ zmm1))
-; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[1,1,2,2]
-; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm9[0],xmm4[1],xmm9[2,3],xmm4[4],xmm9[5,6],xmm4[7]
-; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
-; AVX512-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
-; AVX512-NEXT: # ymm7 = mem[2,2,2,3]
-; AVX512-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm9 = mem[0,1,3,2,4,5,6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,1,3]
-; AVX512-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
-; AVX512-NEXT: # ymm10 = mem[2,3,3,3,6,7,7,7]
-; AVX512-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload
-; AVX512-NEXT: # ymm16 = mem[0,0,2,1]
-; AVX512-NEXT: vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm12 = mem[2,1,2,3,4,5,6,7]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,4]
-; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,3]
-; AVX512-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload
-; AVX512-NEXT: # ymm21 = mem[0,0,1,1]
-; AVX512-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
-; AVX512-NEXT: # xmm15 = mem[0,2,3,3,4,5,6,7]
-; AVX512-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,2,1]
-; AVX512-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload
-; AVX512-NEXT: # ymm24 = mem[2,2,2,3]
-; AVX512-NEXT: vmovdqa64 %xmm27, %xmm3
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,2,4,5,6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,1,3]
-; AVX512-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload
-; AVX512-NEXT: # ymm26 = mem[2,3,3,3,6,7,7,7]
-; AVX512-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload
-; AVX512-NEXT: # ymm27 = mem[0,0,2,1]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
-; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4]
-; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3]
-; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1]
+; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0
+; AVX512-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
+; AVX512-NEXT: # ymm8 = mem[2,1,3,2]
+; AVX512-NEXT: vpbroadcastd 96(%rax), %ymm14
+; AVX512-NEXT: vinserti64x4 $1, %ymm14, %zmm8, %zmm14
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm4 & (zmm14 ^ zmm0))
+; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm22[0,0,0,1]
+; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm24, %zmm0
+; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm23[2,1,3,2]
+; AVX512-NEXT: vpbroadcastd 32(%rax), %ymm19
+; AVX512-NEXT: vinserti64x4 $1, %ymm19, %zmm8, %zmm8
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm4 & (zmm8 ^ zmm0))
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm0 & (zmm14 ^ zmm1))
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm0 & (zmm8 ^ zmm18))
+; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm12 = xmm6[1,1,2,2]
+; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm12[0],xmm1[1],xmm12[2,3],xmm1[4],xmm12[5,6],xmm1[7]
+; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7]
+; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
+; AVX512-NEXT: # ymm11 = mem[0,1,1,3,4,5,5,7]
+; AVX512-NEXT: vpshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
+; AVX512-NEXT: # ymm12 = mem[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,0,0,0,4,4,4,4]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7,8,9],ymm12[10],ymm11[11,12],ymm12[13],ymm11[14,15]
+; AVX512-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
+; AVX512-NEXT: # ymm12 = mem[1,1,1,1,5,5,5,5]
+; AVX512-NEXT: vpshuflw $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
+; AVX512-NEXT: # ymm13 = mem[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[0,0,2,1,4,4,6,5]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7,8,9],ymm12[10],ymm13[11,12],ymm12[13],ymm13[14,15]
+; AVX512-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
+; AVX512-NEXT: # ymm13 = mem[0,0,2,1,4,4,6,5]
+; AVX512-NEXT: vpshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
+; AVX512-NEXT: # ymm15 = mem[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[0,0,0,0,4,4,4,4]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3],ymm15[4,5],ymm13[6],ymm15[7,8,9,10],ymm13[11],ymm15[12,13],ymm13[14],ymm15[15]
+; AVX512-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
+; AVX512-NEXT: # ymm15 = mem[0,0,2,1]
+; AVX512-NEXT: vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm4 = mem[2,1,2,3,4,5,6,7]
+; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,4]
+; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,3]
+; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm15, %zmm4
+; AVX512-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
+; AVX512-NEXT: # ymm15 = mem[0,0,1,1]
+; AVX512-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
+; AVX512-NEXT: # xmm3 = mem[0,2,3,3,4,5,6,7]
+; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
+; AVX512-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload
+; AVX512-NEXT: # ymm18 = mem[0,0,2,1]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
+; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,4]
+; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3]
+; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1]
; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,3,3,4,5,6,7]
; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1]
-; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1]
-; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7
-; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm10[2,1,3,2]
-; AVX512-NEXT: vpbroadcastd 96(%rax), %ymm10
-; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (zmm29 & (zmm9 ^ zmm7))
-; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1]
-; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm24, %zmm3
-; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm26[2,1,3,2]
-; AVX512-NEXT: vpbroadcastd 32(%rax), %ymm10
-; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm7, %zmm7
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm29 & (zmm7 ^ zmm3))
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (zmm3 & (zmm9 ^ zmm22))
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm3 & (zmm7 ^ zmm8))
-; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm16, %zmm3
-; AVX512-NEXT: vinserti64x4 $1, %ymm15, %zmm21, %zmm8
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = zmm3 ^ (zmm28 & (zmm8 ^ zmm3))
-; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm27, %zmm1
-; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm3
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm1 ^ (zmm28 & (zmm3 ^ zmm1))
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0]
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm31 = zmm31 ^ (zmm1 & (zmm31 ^ zmm8))
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (zmm1 & (zmm13 ^ zmm3))
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm17))
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm2))
+; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2]
+; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3]
+; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,3]
+; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm15, %zmm3
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm4 ^ (zmm25 & (zmm3 ^ zmm4))
+; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm18, %zmm0
+; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm1, %zmm1
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm25 & (zmm1 ^ zmm0))
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0]
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm0 & (zmm21 ^ zmm3))
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm30 = zmm30 ^ (zmm0 & (zmm30 ^ zmm1))
+; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm0
+; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm1, %zmm1
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (mem & (zmm1 ^ zmm0))
+; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [11,0,0,11,0,0,0,12,8,9,12,13,12,13,13,15]
+; AVX512-NEXT: vpermd %zmm16, %zmm0, %zmm3
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
+; AVX512-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm3 # 64-byte Folded Reload
+; AVX512-NEXT: # zmm3 = zmm3 ^ (zmm6 & (zmm3 ^ mem))
+; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm13[0,1,2,3]
+; AVX512-NEXT: vpermd %zmm27, %zmm0, %zmm0
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm6 & (zmm0 ^ zmm4))
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm9))
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: vmovdqa64 %zmm0, 320(%rax)
-; AVX512-NEXT: vmovdqa64 %zmm13, 256(%rax)
-; AVX512-NEXT: vmovdqa64 %zmm7, 192(%rax)
-; AVX512-NEXT: vmovdqa64 %zmm14, 128(%rax)
-; AVX512-NEXT: vmovdqa64 %zmm23, 64(%rax)
-; AVX512-NEXT: vmovdqa64 %zmm20, (%rax)
-; AVX512-NEXT: vmovdqa64 %zmm25, 448(%rax)
-; AVX512-NEXT: vmovdqa64 %zmm31, 704(%rax)
-; AVX512-NEXT: vmovdqa64 %zmm9, 640(%rax)
-; AVX512-NEXT: vmovdqa64 %zmm5, 576(%rax)
-; AVX512-NEXT: vmovdqa64 %zmm18, 512(%rax)
-; AVX512-NEXT: vmovdqa64 %zmm11, 384(%rax)
-; AVX512-NEXT: vmovdqa64 %zmm19, 768(%rax)
-; AVX512-NEXT: vmovdqa64 %zmm30, 832(%rax)
-; AVX512-NEXT: addq $2840, %rsp # imm = 0xB18
+; AVX512-NEXT: vmovdqa64 %zmm30, 256(%rax)
+; AVX512-NEXT: vmovdqa64 %zmm8, 192(%rax)
+; AVX512-NEXT: vmovdqa64 %zmm10, 128(%rax)
+; AVX512-NEXT: vmovdqa64 %zmm5, 64(%rax)
+; AVX512-NEXT: vmovdqa64 %zmm31, (%rax)
+; AVX512-NEXT: vmovdqa64 %zmm20, 448(%rax)
+; AVX512-NEXT: vmovdqa64 %zmm21, 704(%rax)
+; AVX512-NEXT: vmovdqa64 %zmm14, 640(%rax)
+; AVX512-NEXT: vmovdqa64 %zmm2, 576(%rax)
+; AVX512-NEXT: vmovdqa64 %zmm17, 512(%rax)
+; AVX512-NEXT: vmovdqa64 %zmm7, 384(%rax)
+; AVX512-NEXT: vmovdqa64 %zmm3, 768(%rax)
+; AVX512-NEXT: vmovdqa64 %zmm28, 832(%rax)
+; AVX512-NEXT: addq $2648, %rsp # imm = 0xA58
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: store_i16_stride7_vf64:
; AVX512-FCP: # %bb.0:
-; AVX512-FCP-NEXT: subq $1576, %rsp # imm = 0x628
-; AVX512-FCP-NEXT: vmovdqa 96(%rcx), %ymm4
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
-; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm0
-; AVX512-FCP-NEXT: vmovdqa 96(%rdx), %ymm5
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm5, %ymm1
-; AVX512-FCP-NEXT: vporq %ymm0, %ymm1, %ymm16
+; AVX512-FCP-NEXT: subq $1288, %rsp # imm = 0x508
+; AVX512-FCP-NEXT: vmovdqa 96(%rcx), %ymm13
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
+; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm13, %ymm0
+; AVX512-FCP-NEXT: vmovdqa 96(%rdx), %ymm1
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm2
+; AVX512-FCP-NEXT: vporq %ymm0, %ymm2, %ymm16
; AVX512-FCP-NEXT: vmovdqa 96(%rsi), %ymm2
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm0
-; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm3
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm3
+; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm0
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19]
-; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm1
-; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm7
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
-; AVX512-FCP-NEXT: vmovdqa 64(%r9), %ymm1
-; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm0
-; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm25
-; AVX512-FCP-NEXT: vmovdqa 64(%r8), %ymm13
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm13, %ymm1
-; AVX512-FCP-NEXT: vmovdqa64 %ymm13, %ymm26
-; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0
-; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vmovdqa 64(%rcx), %ymm0
-; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm14
-; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm28
-; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %ymm0
-; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm15
-; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm27
-; AVX512-FCP-NEXT: vpor %ymm14, %ymm15, %ymm14
-; AVX512-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %ymm0
-; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm14
-; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm23
-; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm0
-; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm15
-; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm29
-; AVX512-FCP-NEXT: vpor %ymm14, %ymm15, %ymm14
-; AVX512-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm0
-; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm14
-; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm0
-; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm15
-; AVX512-FCP-NEXT: vpor %ymm14, %ymm15, %ymm14
-; AVX512-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm0
-; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm14
-; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm0
-; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm15
-; AVX512-FCP-NEXT: vpor %ymm14, %ymm15, %ymm14
-; AVX512-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm0
-; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm14
-; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm15
-; AVX512-FCP-NEXT: vpor %ymm14, %ymm15, %ymm14
-; AVX512-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %ymm15
-; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm15, %ymm8
-; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm13
-; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm13, %ymm9
-; AVX512-FCP-NEXT: vpor %ymm8, %ymm9, %ymm8
-; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm0
-; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm8
-; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm1, %ymm9
-; AVX512-FCP-NEXT: vpor %ymm8, %ymm9, %ymm8
-; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm8
-; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm8, %ymm10
-; AVX512-FCP-NEXT: vmovdqa 32(%r9), %ymm9
-; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm9, %ymm11
-; AVX512-FCP-NEXT: vporq %ymm11, %ymm10, %ymm20
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm4[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm5[0,1,1,3,4,5,5,7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0,1],ymm10[2],ymm12[3,4],ymm10[5],ymm12[6,7,8,9],ymm10[10],ymm12[11,12],ymm10[13],ymm12[14,15]
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,2]
-; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
-; AVX512-FCP-NEXT: # ymm14 = mem[0,1,0,1]
-; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm12
-; AVX512-FCP-NEXT: vmovdqa64 %ymm14, %ymm17
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm3[1,1,1,1,5,5,5,5]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm14[2],ymm12[3,4],ymm14[5],ymm12[6,7,8,9],ymm14[10],ymm12[11,12],ymm14[13],ymm12[14,15]
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm16, %zmm10
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm7, %zmm12
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm10 ^ (mem & (zmm12 ^ zmm10))
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
-; AVX512-FCP-NEXT: vmovdqa 96(%r8), %ymm7
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm7[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[16,17,u,u,u,u],zero,zero
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm10 | (ymm12 & ymm14)
-; AVX512-FCP-NEXT: vmovdqa 96(%r9), %ymm11
-; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm11, %ymm6
-; AVX512-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 | (ymm10 & ymm16)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm21
-; AVX512-FCP-NEXT: vextracti64x4 $1, %zmm12, %ymm10
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,4,0,0,0,5,0,0]
-; AVX512-FCP-NEXT: vpermd %ymm7, %ymm12, %ymm12
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm12 ^ (ymm16 & (ymm12 ^ ymm10))
-; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm18
-; AVX512-FCP-NEXT: vprold $16, %ymm11, %ymm10
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2]
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm10 ^ (ymm16 & (ymm10 ^ ymm12))
-; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm10[0,1,2,3]
-; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,1,4,5,4,5,5,7]
-; AVX512-FCP-NEXT: vmovdqa 96(%rax), %ymm6
-; AVX512-FCP-NEXT: vpermd %ymm6, %ymm10, %ymm10
-; AVX512-FCP-NEXT: vpandn %ymm10, %ymm14, %ymm10
-; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm16
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
-; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm6, %ymm12
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm12, %zmm10
-; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
-; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm2, %ymm10
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm3[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0,1,2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7,8,9,10],ymm10[11],ymm12[12,13],ymm10[14],ymm12[15]
-; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm12 = [151522058,0,421010202,421010202]
-; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm2
-; AVX512-FCP-NEXT: vmovdqa64 %ymm12, %ymm24
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15]
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,2,3,10,9,11,11]
-; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm3, %zmm2
-; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm12 = [218894094,0,488382238,488382238]
-; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm4, %ymm10
-; AVX512-FCP-NEXT: vmovdqa64 %ymm12, %ymm22
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm5[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0],ymm10[1],ymm12[2,3],ymm10[4],ymm12[5,6,7,8],ymm10[9],ymm12[10,11],ymm10[12],ymm12[13,14,15]
-; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
-; AVX512-FCP-NEXT: # ymm12 = mem[0,1,0,1]
-; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm4, %ymm4
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,2,2,2,6,6,6,6]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15]
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm30 = [2,2,2,3,8,10,10,11]
-; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm30, %zmm4
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm2 ^ (zmm19 & (zmm4 ^ zmm2))
-; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm4
+; AVX512-FCP-NEXT: vporq %ymm3, %ymm4, %ymm17
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
+; AVX512-FCP-NEXT: vmovdqa 64(%r9), %ymm3
+; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm3
+; AVX512-FCP-NEXT: vmovdqa 64(%r8), %ymm4
+; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
+; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm4
+; AVX512-FCP-NEXT: vpor %ymm3, %ymm4, %ymm3
+; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vmovdqa 64(%rcx), %ymm3
+; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm3, %ymm4
+; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %ymm3
+; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm3, %ymm6
+; AVX512-FCP-NEXT: vpor %ymm4, %ymm6, %ymm4
+; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %ymm3
+; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm14
+; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm3
+; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm15
+; AVX512-FCP-NEXT: vpor %ymm14, %ymm15, %ymm8
+; AVX512-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm3
+; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm14
+; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm21
+; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm3
+; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm3, %ymm15
+; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm24
+; AVX512-FCP-NEXT: vpor %ymm14, %ymm15, %ymm8
+; AVX512-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm3
+; AVX512-FCP-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill
+; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm3, %ymm14
+; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm3
+; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm3, %ymm15
+; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm30
+; AVX512-FCP-NEXT: vpor %ymm14, %ymm15, %ymm8
+; AVX512-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm3
+; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm14
+; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm25
+; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm3
+; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm15
+; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm18
+; AVX512-FCP-NEXT: vpor %ymm14, %ymm15, %ymm8
+; AVX512-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %ymm3
+; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm3, %ymm14
+; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm23
+; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm12
+; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm12, %ymm7
+; AVX512-FCP-NEXT: vpor %ymm7, %ymm14, %ymm7
+; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm3
+; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm7
+; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm4
+; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm9
+; AVX512-FCP-NEXT: vpor %ymm7, %ymm9, %ymm7
+; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm11
+; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm11, %ymm7
+; AVX512-FCP-NEXT: vmovdqa 32(%r9), %ymm14
+; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm14, %ymm9
+; AVX512-FCP-NEXT: vporq %ymm9, %ymm7, %ymm19
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm1[0,1,1,3,4,5,5,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0,1],ymm7[2],ymm9[3,4],ymm7[5],ymm9[6,7,8,9],ymm7[10],ymm9[11,12],ymm7[13],ymm9[14,15]
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,2]
+; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
+; AVX512-FCP-NEXT: # ymm10 = mem[0,1,0,1]
+; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm9
+; AVX512-FCP-NEXT: vmovdqa64 %ymm10, %ymm29
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm0[1,1,1,1,5,5,5,5]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm15[2],ymm9[3,4],ymm15[5],ymm9[6,7,8,9],ymm15[10],ymm9[11,12],ymm15[13],ymm9[14,15]
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm16, %zmm7
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm17, %zmm15
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm7 ^ (mem & (zmm15 ^ zmm7))
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
+; AVX512-FCP-NEXT: vmovdqa 96(%r8), %ymm9
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm9[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[16,17,u,u,u,u],zero,zero
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm15 & ymm22)
+; AVX512-FCP-NEXT: vmovdqa 96(%r9), %ymm6
+; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm6, %ymm5
+; AVX512-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 | (ymm7 & ymm10)
+; AVX512-FCP-NEXT: vextracti64x4 $1, %zmm15, %ymm7
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,4,0,0,0,5,0,0]
+; AVX512-FCP-NEXT: vpermd %ymm9, %ymm15, %ymm15
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm15 ^ (ymm10 & (ymm15 ^ ymm7))
+; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm17
+; AVX512-FCP-NEXT: vprold $16, %ymm6, %ymm7
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,2]
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (ymm10 & (ymm7 ^ ymm15))
+; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm7[0,1,2,3]
+; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
+; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm5
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2],ymm5[3],ymm7[4,5],ymm5[6],ymm7[7,8,9,10],ymm5[11],ymm7[12,13],ymm5[14],ymm7[15]
+; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm6 = [151522058,0,421010202,421010202]
+; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm2
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6,7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13,14,15]
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,2,2,3,10,9,11,11]
+; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm15, %zmm2
+; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} ymm5 = [218894094,0,488382238,488382238]
+; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm13, %ymm0
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm7[0],ymm0[1],ymm7[2,3],ymm0[4],ymm7[5,6,7,8],ymm0[9],ymm7[10,11],ymm0[12],ymm7[13,14,15]
+; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
+; AVX512-FCP-NEXT: # ymm0 = mem[0,1,0,1]
+; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm13, %ymm7
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm1[2],ymm7[3,4],ymm1[5],ymm7[6,7,8,9],ymm1[10],ymm7[11,12],ymm1[13],ymm7[14,15]
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [2,2,2,3,8,10,10,11]
+; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm7
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm2 ^ (zmm20 & (zmm7 ^ zmm2))
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [5,0,0,0,6,0,0,6]
-; AVX512-FCP-NEXT: vpermd %ymm7, %ymm2, %ymm2
+; AVX512-FCP-NEXT: vpermd %ymm9, %ymm2, %ymm2
; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [6,5,0,0,7,6,0,7,6,5,0,0,7,6,0,7]
-; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vpermd %zmm6, %zmm2, %zmm4
-; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
-; AVX512-FCP-NEXT: vpbroadcastd 72(%rax), %ymm4
-; AVX512-FCP-NEXT: vpandn %ymm4, %ymm6, %ymm4
-; AVX512-FCP-NEXT: vmovdqa 64(%rax), %ymm5
-; AVX512-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm5, %ymm5
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4
+; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
+; AVX512-FCP-NEXT: vpbroadcastd 72(%rax), %ymm2
+; AVX512-FCP-NEXT: vpandnq %ymm2, %ymm16, %ymm9
+; AVX512-FCP-NEXT: vmovdqa 64(%rax), %ymm2
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
+; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm13
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm9, %zmm9
+; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa (%rax), %ymm13
+; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm13, %ymm9
+; AVX512-FCP-NEXT: vmovdqa64 %ymm13, %ymm28
+; AVX512-FCP-NEXT: vpbroadcastd 8(%rax), %ymm10
+; AVX512-FCP-NEXT: vpandnq %ymm10, %ymm16, %ymm10
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm10, %zmm9
+; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm9
+; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm4[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7,8,9,10],ymm9[11],ymm10[12,13],ymm9[14],ymm10[15]
+; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm10
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm4[2,2,2,2,6,6,6,6]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm13[0],ymm10[1],ymm13[2,3],ymm10[4],ymm13[5,6,7,8],ymm10[9],ymm13[10,11],ymm10[12],ymm13[13,14,15]
+; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm15, %zmm10
+; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm8
+; AVX512-FCP-NEXT: vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm8, %ymm9
+; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm31
+; AVX512-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm12[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm13[0],ymm9[1],ymm13[2,3],ymm9[4],ymm13[5,6,7,8],ymm9[9],ymm13[10,11],ymm9[12],ymm13[13,14,15]
+; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm8, %ymm13
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm12[2,2,2,2,6,6,6,6]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm13[0,1],ymm15[2],ymm13[3,4],ymm15[5],ymm13[6,7,8,9],ymm15[10],ymm13[11,12],ymm15[13],ymm13[14,15]
+; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm3
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm10 ^ (zmm20 & (zmm3 ^ zmm10))
+; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm23
+; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm11[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7,8,9],ymm10[10],ymm9[11,12],ymm10[13],ymm9[14,15]
+; AVX512-FCP-NEXT: vprold $16, %ymm14, %ymm10
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm11[1,2,2,3,5,6,6,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm13[0,1],ymm10[2],ymm13[3,4],ymm10[5],ymm13[6,7,8,9],ymm10[10],ymm13[11,12],ymm10[13],ymm13[14,15]
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [2,1,3,2,10,10,10,11]
+; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm13, %zmm10
+; AVX512-FCP-NEXT: vmovdqa64 (%rax), %zmm5
+; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [12,13,10,10,14,14,14,14]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [14,15,11,11,15,15,14,15]
+; AVX512-FCP-NEXT: vpermd %zmm5, %zmm3, %zmm9
+; AVX512-FCP-NEXT: vpermd %zmm5, %zmm4, %zmm15
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm9, %zmm27
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm27 = zmm27 ^ (zmm17 & (zmm27 ^ zmm10))
+; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm3
+; AVX512-FCP-NEXT: vmovdqu64 %ymm21, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vprold $16, %ymm21, %ymm9
+; AVX512-FCP-NEXT: vmovdqu64 %ymm24, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm24[1,2,2,3,5,6,6,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7,8,9],ymm9[10],ymm10[11,12],ymm9[13],ymm10[14,15]
+; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
+; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm15
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm24[0,0,2,1,4,4,6,5]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm15[0,1,2],ymm12[3],ymm15[4,5],ymm12[6],ymm15[7,8,9,10],ymm12[11],ymm15[12,13],ymm12[14],ymm15[15]
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [2,2,3,3,10,9,11,10]
+; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm16, %zmm12
+; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,5,4,0,0,6,5,0,0,5,4,0,0,6,5,0]
+; AVX512-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3]
+; AVX512-FCP-NEXT: vpermd %ymm28, %ymm17, %ymm10
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm10, %zmm3
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm22 & (zmm3 ^ zmm12))
+; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm4
+; AVX512-FCP-NEXT: vmovdqu64 %ymm25, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm10
+; AVX512-FCP-NEXT: vmovdqu64 %ymm18, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm18[2,2,2,2,6,6,6,6]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0],ymm10[1],ymm12[2,3],ymm10[4],ymm12[5,6,7,8],ymm10[9],ymm12[10,11],ymm10[12],ymm12[13,14,15]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm29, %ymm3
+; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm4, %ymm12
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm18[1,1,1,1,5,5,5,5]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm15[2],ymm12[3,4],ymm15[5],ymm12[6,7,8,9],ymm15[10],ymm12[11,12],ymm15[13],ymm12[14,15]
+; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm12
+; AVX512-FCP-NEXT: vmovdqu (%rsp), %ymm4 # 32-byte Reload
+; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm10
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm30[2,2,2,2,6,6,6,6]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm15[2],ymm10[3,4],ymm15[5],ymm10[6,7,8,9],ymm15[10],ymm10[11,12],ymm15[13],ymm10[14,15]
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm4[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm30[0,1,1,3,4,5,5,7]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm29
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm8[0,1],ymm15[2],ymm8[3,4],ymm15[5],ymm8[6,7,8,9],ymm15[10],ymm8[11,12],ymm15[13],ymm8[14,15]
+; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm13, %zmm4
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm12 ^ (zmm8 & (zmm4 ^ zmm12))
; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpbroadcastd 8(%rax), %ymm4
-; AVX512-FCP-NEXT: vpandn %ymm4, %ymm6, %ymm4
-; AVX512-FCP-NEXT: vmovdqa (%rax), %ymm5
-; AVX512-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm5, %ymm5
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4
+; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm14, %ymm10
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[0,0,2,1,4,4,6,5]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7,8,9,10],ymm11[11],ymm10[12,13],ymm11[14],ymm10[15]
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,0,3,10,10,11,11]
+; AVX512-FCP-NEXT: vpermi2q %zmm10, %zmm19, %zmm4
; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
-; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm4
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm8[0,0,2,1,4,4,6,5]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8,9,10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15]
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,0,3,10,10,11,11]
-; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm20, %zmm5
-; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm4
-; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm1[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8,9,10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm11
-; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm5
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm1[2,2,2,2,6,6,6,6]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13,14,15]
-; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm3, %zmm5
-; AVX512-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm7
-; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm15, %ymm3
-; AVX512-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm13[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14,15]
-; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm15, %ymm4
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm13[2,2,2,2,6,6,6,6]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7,8,9],ymm6[10],ymm4[11,12],ymm6[13],ymm4[14,15]
-; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm30, %zmm0
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm19 & (zmm0 ^ zmm5))
-; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm8[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
-; AVX512-FCP-NEXT: vprold $16, %ymm9, %ymm4
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm8[1,2,2,3,5,6,6,7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15]
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [2,1,3,2,10,10,10,11]
-; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm13, %zmm4
-; AVX512-FCP-NEXT: vmovdqa 32(%rax), %ymm0
-; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpermd %zmm0, %zmm2, %zmm0
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm18 & (zmm0 ^ zmm4))
-; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm0
-; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm2
-; AVX512-FCP-NEXT: vmovdqa64 %ymm29, %ymm19
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm29[2,2,2,2,6,6,6,6]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm15
-; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm0, %ymm3
-; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm18
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm29[1,1,1,1,5,5,5,5]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
-; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm30, %zmm3
-; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm0
-; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm2
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm27[2,2,2,2,6,6,6,6]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm27[0,1,1,3,4,5,5,7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15]
-; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm6
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm3 ^ (mem & (zmm6 ^ zmm3))
-; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm12
-; AVX512-FCP-NEXT: vprold $16, %ymm25, %ymm2
-; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm23
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm26[1,2,2,3,5,6,6,7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15]
-; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm12, %ymm3
-; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm28
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm26[0,0,2,1,4,4,6,5]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5],ymm5[6],ymm3[7,8,9,10],ymm5[11],ymm3[12,13],ymm5[14],ymm3[15]
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [2,2,3,3,10,9,11,10]
-; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm3
-; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,5,4,0,0,6,5,0,0,5,4,0,0,6,5,0]
-; AVX512-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vpermd 64(%rax), %zmm20, %zmm1
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm16 & (zmm1 ^ zmm3))
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm6))
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm12, %ymm5
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm14[2,2,2,2,6,6,6,6]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0],ymm5[1],ymm10[2,3],ymm5[4],ymm10[5,6,7,8],ymm5[9],ymm10[10,11],ymm5[12],ymm10[13,14,15]
+; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm12, %ymm10
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm14[1,1,1,1,5,5,5,5]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7,8,9],ymm11[10],ymm10[11,12],ymm11[13],ymm10[14,15]
+; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm10
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm0
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[2,2,2,2,6,6,6,6]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm4[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm6[0,1,1,3,4,5,5,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7,8,9],ymm1[10],ymm5[11,12],ymm1[13],ymm5[14,15]
+; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm13, %zmm1
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm10 ^ (zmm8 & (zmm1 ^ zmm10))
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX512-FCP-NEXT: vprold $16, %ymm10, %ymm0
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm11[1,2,2,3,5,6,6,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1],ymm0[2],ymm5[3,4],ymm0[5],ymm5[6,7,8,9],ymm0[10],ymm5[11,12],ymm0[13],ymm5[14,15]
+; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm10, %ymm5
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm11[0,0,2,1,4,4,6,5]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm8[3],ymm5[4,5],ymm8[6],ymm5[7,8,9,10],ymm8[11],ymm5[12,13],ymm8[14],ymm5[15]
+; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm5
+; AVX512-FCP-NEXT: vmovdqa64 64(%rax), %zmm20
+; AVX512-FCP-NEXT: vpermd %zmm20, %zmm17, %zmm21
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm22 & (zmm21 ^ zmm5))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = zmm21 ^ (mem & (zmm21 ^ zmm1))
+; AVX512-FCP-NEXT: vmovdqa64 %ymm31, %ymm0
+; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm0
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15]
+; AVX512-FCP-NEXT: vmovdqa 96(%rcx), %xmm0
+; AVX512-FCP-NEXT: vmovdqa 96(%rdx), %xmm1
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5]
+; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm5, %xmm5
+; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm17
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,2,2,3,8,9,9,11]
+; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm4, %zmm3
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm14[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8,9,10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512-FCP-NEXT: vmovdqa 96(%rsi), %xmm5
+; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm6
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm4, %xmm4
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [2,1,3,3,8,8,9,9]
+; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm9
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm3 ^ (zmm23 & (zmm9 ^ zmm3))
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm11[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [2,2,2,3,8,8,8,9]
+; AVX512-FCP-NEXT: vmovdqa 96(%r9), %xmm3
+; AVX512-FCP-NEXT: vmovdqa 96(%r8), %xmm4
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
+; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm8, %xmm12
+; AVX512-FCP-NEXT: vmovdqa64 %xmm8, %xmm16
+; AVX512-FCP-NEXT: vmovdqa64 %xmm15, %xmm18
+; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm13, %zmm11
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [6,7,3,3,7,7,6,7]
+; AVX512-FCP-NEXT: vpermd %ymm2, %ymm12, %ymm2
+; AVX512-FCP-NEXT: vpbroadcastd 96(%rax), %ymm12
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm2, %zmm30
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm30 = zmm30 ^ (zmm2 & (zmm30 ^ zmm11))
+; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm11
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm30 = zmm30 ^ (mem & (zmm30 ^ zmm9))
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
+; AVX512-FCP-NEXT: vprold $16, %xmm5, %xmm5
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,3]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2],xmm6[3,4],xmm5[5],xmm6[6,7]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,2,1,8,8,9,11]
+; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm5
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm9 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
+; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm0
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,2]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4],xmm1[5,6],xmm0[7]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[0,2,3,3,4,5,6,7]
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,1,1,8,8,10,9]
+; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm1
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (mem & (zmm1 ^ zmm5))
; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm0, %ymm1
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm27[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14,15]
-; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; AVX512-FCP-NEXT: vmovdqa 64(%rcx), %xmm0
-; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %xmm3
-; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm9 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
-; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm1
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,2,2]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0],xmm1[1],xmm6[2,3],xmm1[4],xmm6[5,6],xmm1[7]
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; AVX512-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5]
-; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm0, %xmm0
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,1,3,8,8,9,9]
-; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm16, %zmm0
-; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm6
-; AVX512-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %xmm7
-; AVX512-FCP-NEXT: vmovdqa %xmm7, (%rsp) # 16-byte Spill
-; AVX512-FCP-NEXT: vprold $16, %xmm7, %xmm1
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[1,1,2,3]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm3
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,0,1,1,8,8,10,9]
-; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm3
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm0 ^ (zmm1 & (zmm3 ^ zmm0))
+; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %xmm1
+; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm2
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,2,2]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6],xmm2[7]
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm17, %xmm6
+; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm0
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,1,1,3,8,8,9,9]
+; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm24, %zmm0
+; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm1
+; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %xmm2
+; AVX512-FCP-NEXT: vprold $16, %xmm2, %xmm3
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,2,3]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX512-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX512-FCP-NEXT: vmovdqa %xmm14, %xmm8
+; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm1, %xmm1
+; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm10, %zmm1
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm2 & (zmm1 ^ zmm0))
; AVX512-FCP-NEXT: vmovdqa 64(%r9), %xmm0
-; AVX512-FCP-NEXT: vmovdqa 64(%r8), %xmm6
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
-; AVX512-FCP-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
-; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm0, %xmm6
+; AVX512-FCP-NEXT: vmovdqa 64(%r8), %xmm3
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm31
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
+; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm3
+; AVX512-FCP-NEXT: vmovdqa64 %xmm18, %xmm4
; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm0
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,1,8,9,9,11]
-; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm7, %zmm0
-; AVX512-FCP-NEXT: vpbroadcastd 64(%rax), %ymm6
-; AVX512-FCP-NEXT: vpbroadcastd 68(%rax), %ymm8
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm29
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm29 = zmm29 ^ (zmm21 & (zmm29 ^ zmm0))
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,1,8,9,9,11]
+; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm18, %zmm0
+; AVX512-FCP-NEXT: vpbroadcastd 64(%rax), %ymm3
+; AVX512-FCP-NEXT: vpbroadcastd 68(%rax), %ymm5
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm17
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 ^ (zmm22 & (zmm17 ^ zmm0))
; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm29 = zmm29 ^ (zmm0 & (zmm29 ^ zmm3))
-; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm8
-; AVX512-FCP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm10
-; AVX512-FCP-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm8, %xmm3
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm10[1,1,2,2]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2,3],xmm3[4],xmm6[5,6],xmm3[7]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3]
-; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm6, %xmm6
-; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm16, %zmm6
-; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm9
-; AVX512-FCP-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm10
-; AVX512-FCP-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vprold $16, %xmm10, %xmm3
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[1,1,2,3]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1],xmm3[2],xmm8[3,4],xmm3[5],xmm8[6,7]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm8, %xmm8
-; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm31
-; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm25, %zmm8
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm6 ^ (zmm1 & (zmm8 ^ zmm6))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 ^ (zmm0 & (zmm17 ^ zmm1))
+; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm5
+; AVX512-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm12
+; AVX512-FCP-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm5, %xmm1
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[1,1,2,2]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3],xmm1[4],xmm3[5,6],xmm1[7]
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3]
+; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3
+; AVX512-FCP-NEXT: vmovdqa %xmm6, %xmm9
+; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm24, %zmm3
+; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm6
+; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm12
+; AVX512-FCP-NEXT: vprold $16, %xmm12, %xmm1
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[1,1,2,3]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1],xmm1[2],xmm5[3,4],xmm1[5],xmm5[6,7]
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm12, %xmm26
+; AVX512-FCP-NEXT: vmovdqa64 %xmm6, %xmm25
+; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm5, %xmm5
+; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm5
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm3 ^ (zmm2 & (zmm5 ^ zmm3))
; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm1
-; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm3
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; AVX512-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm3
+; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm2
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm19
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm1, %xmm2
+; AVX512-FCP-NEXT: vmovdqa %xmm13, %xmm10
; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm1
-; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm27
-; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm1
-; AVX512-FCP-NEXT: vpbroadcastd (%rax), %ymm3
-; AVX512-FCP-NEXT: vpbroadcastd 4(%rax), %ymm6
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm3, %zmm26
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm26 = zmm26 ^ (zmm21 & (zmm26 ^ zmm1))
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm26 = zmm26 ^ (zmm0 & (zmm26 ^ zmm8))
-; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm9, %ymm0
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Reload
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm22[2,2,2,2,6,6,6,6]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15]
-; AVX512-FCP-NEXT: vpshufb %ymm15, %ymm9, %ymm1
-; AVX512-FCP-NEXT: vmovdqa %ymm9, %ymm11
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm22[1,1,1,1,5,5,5,5]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7,8,9],ymm3[10],ymm1[11,12],ymm3[13],ymm1[14,15]
-; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm1
-; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Reload
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm21[2,2,2,2,6,6,6,6]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7,8,9],ymm3[10],ymm0[11,12],ymm3[13],ymm0[14,15]
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm15[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm21[0,1,1,3,4,5,5,7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1],ymm0[2],ymm6[3,4],ymm0[5],ymm6[6,7,8,9],ymm0[10],ymm6[11,12],ymm0[13],ymm6[14,15]
-; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm13, %zmm0
-; AVX512-FCP-NEXT: vmovdqa 96(%rcx), %xmm10
-; AVX512-FCP-NEXT: vmovdqa 96(%rdx), %xmm13
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3]
-; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm3, %xmm3
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,2,2,3,8,9,9,11]
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm24, %zmm5
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm3
-; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm3, %ymm3
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm19[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm7[0,1,2],ymm3[3],ymm7[4,5],ymm3[6],ymm7[7,8,9,10],ymm3[11],ymm7[12,13],ymm3[14],ymm7[15]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm1 ^ (mem & (zmm0 ^ zmm1))
-; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX512-FCP-NEXT: vprold $16, %ymm4, %ymm1
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm16[1,2,2,3,5,6,6,7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7,8,9],ymm1[10],ymm3[11,12],ymm1[13],ymm3[14,15]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm17
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm16[0,0,2,1,4,4,6,5]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8,9,10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15]
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [2,2,3,3,10,9,11,10]
-; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm3
-; AVX512-FCP-NEXT: vmovdqa 96(%rsi), %xmm4
-; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm6
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm9
-; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm1
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [2,1,3,3,8,8,9,9]
-; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm8
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm30 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm5 ^ (zmm30 & (zmm8 ^ zmm5))
-; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm18 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
-; AVX512-FCP-NEXT: # ymm18 = mem[0,1,2,3,0,1,2,3]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm1
-; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm2
-; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm23[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
-; AVX512-FCP-NEXT: vpermd (%rax), %zmm20, %zmm20
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (mem & (zmm20 ^ zmm3))
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (mem & (zmm20 ^ zmm0))
-; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %xmm1
-; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm0
-; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512-FCP-NEXT: vpshufb %xmm14, %xmm0, %xmm3
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm15[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm21[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15]
-; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm24, %zmm0
-; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm11, %ymm1
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm22[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
-; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm12
-; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm11
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
-; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm1
-; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm24
-; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm15
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [2,2,2,3,8,8,8,9]
-; AVX512-FCP-NEXT: vmovdqa 96(%r9), %xmm5
-; AVX512-FCP-NEXT: vmovdqa 96(%r8), %xmm3
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm21
-; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm1
-; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm9, %xmm14
-; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm19, %zmm7
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm0 ^ (zmm30 & (zmm15 ^ zmm0))
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm27 = [6,7,3,3,7,7,6,7]
-; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm27, %ymm0 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: vpbroadcastd 96(%rax), %ymm23
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm23, %zmm0, %zmm23
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm30 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm23 = zmm23 ^ (zmm30 & (zmm23 ^ zmm7))
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm23 = zmm23 ^ (zmm31 & (zmm23 ^ zmm8))
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
-; AVX512-FCP-NEXT: vprold $16, %xmm4, %xmm0
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[1,1,2,3]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3,4],xmm0[5],xmm4[6,7]
-; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm1
-; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm4
-; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm1
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm16[3,3,3,3,7,7,7,7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15]
-; AVX512-FCP-NEXT: vmovdqa 32(%r9), %xmm7
-; AVX512-FCP-NEXT: vmovdqa 32(%r8), %xmm4
-; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm21, %xmm8
-; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm6, %xmm8
-; AVX512-FCP-NEXT: vmovdqa64 %xmm6, %xmm16
-; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm19, %zmm1
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,2,1,8,8,9,11]
-; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm22, %zmm0
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7]
-; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} xmm18 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm18, %xmm8
-; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm10, %xmm10
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[1,1,2,2]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm10[1],xmm13[2,3],xmm10[4],xmm13[5,6],xmm10[7]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7]
-; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm25, %zmm13
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm13 = zmm0 ^ (zmm28 & (zmm13 ^ zmm0))
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = ymm14 ^ (ymm30 & (ymm14 ^ ymm0))
-; AVX512-FCP-NEXT: vextracti64x4 $1, %zmm0, %ymm0
-; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm8
-; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm3
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} xmm21 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm21, %xmm2
-; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm9, %xmm6
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,1,1,3,8,8,9,9]
-; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm24, %zmm6
-; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm2[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14]
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[3,3,3,3]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u]
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2]
-; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm2, %xmm10
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm10[0,0,1,1]
-; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm10, %xmm10
-; AVX512-FCP-NEXT: vmovdqa %xmm8, %xmm5
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm10[0,0,1,1]
-; AVX512-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm27, %ymm10 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: vpbroadcastd 32(%rax), %ymm17
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm17, %zmm10, %zmm10
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm30 & (zmm10 ^ zmm1))
-; AVX512-FCP-NEXT: vpbroadcastd 100(%rax), %ymm1
-; AVX512-FCP-NEXT: vpbroadcastd 104(%rax), %ymm17
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm17, %zmm1, %zmm19
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm17 & (zmm19 ^ zmm6))
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm31 & (zmm10 ^ zmm15))
+; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm23
+; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm18, %zmm1
+; AVX512-FCP-NEXT: vpbroadcastd (%rax), %ymm2
+; AVX512-FCP-NEXT: vpbroadcastd 4(%rax), %ymm3
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm14
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm22 & (zmm14 ^ zmm1))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm0 & (zmm14 ^ zmm5))
+; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %xmm6
+; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm5
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm0
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm18
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 ^ (ymm18 & (ymm13 ^ ymm7))
+; AVX512-FCP-NEXT: vextracti64x4 $1, %zmm7, %ymm11
+; AVX512-FCP-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u]
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14]
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm2[3,3,3,3]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u]
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm2[2,2,2,2]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm29[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15]
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,2,3,8,9,9,11]
+; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
+; AVX512-FCP-NEXT: vmovdqa %xmm8, %xmm4
+; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm15, %xmm0
+; AVX512-FCP-NEXT: vmovdqa64 %xmm16, %xmm1
+; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm7
+; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm7
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
+; AVX512-FCP-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: # ymm1 = mem[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
+; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm3
+; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm10
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3]
+; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm8, %xmm8
+; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm22
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [2,1,3,3,8,8,9,9]
+; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm1
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm2 ^ (mem & (zmm1 ^ zmm2))
+; AVX512-FCP-NEXT: vpbroadcastd 100(%rax), %ymm2
+; AVX512-FCP-NEXT: vpbroadcastd 104(%rax), %ymm8
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm2, %zmm29
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm29 = zmm29 ^ (zmm16 & (zmm29 ^ zmm7))
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
+; AVX512-FCP-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: # ymm7 = mem[3,3,3,3,7,7,7,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7,8,9],ymm7[10],ymm2[11,12],ymm7[13],ymm2[14,15]
+; AVX512-FCP-NEXT: vmovdqa 32(%r9), %xmm8
+; AVX512-FCP-NEXT: vmovdqa 32(%r8), %xmm2
+; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm23, %xmm4
+; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm7, %xmm15
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [2,2,2,3,8,8,8,9]
+; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm4, %zmm0
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [6,7,3,3,7,7,6,7]
+; AVX512-FCP-NEXT: vpermd %ymm28, %ymm4, %ymm15
+; AVX512-FCP-NEXT: vpbroadcastd 32(%rax), %ymm28
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm28, %zmm15, %zmm28
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm28 = zmm28 ^ (zmm18 & (zmm28 ^ zmm0))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm28 = zmm28 ^ (mem & (zmm28 ^ zmm1))
; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm1 & (zmm19 ^ zmm13))
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
-; AVX512-FCP-NEXT: vprold $16, %xmm11, %xmm11
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,1,2,3]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1],xmm11[2],xmm12[3,4],xmm11[5],xmm12[6,7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
-; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm6, %xmm6
-; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm22, %zmm11
-; AVX512-FCP-NEXT: vmovdqa64 %xmm18, %xmm6
-; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
-; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm12, %xmm6
-; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
-; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[1,1,2,2]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm13[0],xmm6[1],xmm13[2,3],xmm6[4],xmm13[5,6],xmm6[7]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,2,3,3,4,5,6,7]
-; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm25, %zmm6
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm11 ^ (zmm28 & (zmm6 ^ zmm11))
-; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7]
-; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4
-; AVX512-FCP-NEXT: vmovdqa64 %xmm21, %xmm5
-; AVX512-FCP-NEXT: vmovdqa64 %xmm16, %xmm7
-; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm7, %xmm5
-; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm24, %zmm5
-; AVX512-FCP-NEXT: vpbroadcastd 36(%rax), %ymm4
-; AVX512-FCP-NEXT: vpbroadcastd 40(%rax), %ymm7
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm17 & (zmm4 ^ zmm5))
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm1 & (zmm4 ^ zmm6))
+; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm29 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: # zmm29 = zmm29 ^ (zmm1 & (zmm29 ^ mem))
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm10[4],xmm3[4],xmm10[5],xmm3[5],xmm10[6],xmm3[6],xmm10[7],xmm3[7]
+; AVX512-FCP-NEXT: vprold $16, %xmm10, %xmm10
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm10[2],xmm3[3,4],xmm10[5],xmm3[6,7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
+; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm0, %xmm0
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,2,1,8,8,9,11]
+; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm3
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm6[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
+; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm0[1],xmm5[2,3],xmm0[4],xmm5[5,6],xmm0[7]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[0,2,3,3,4,5,6,7]
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,1,1,8,8,10,9]
+; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm5
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm3 ^ (zmm15 & (zmm5 ^ zmm3))
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm3
+; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm2
+; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm4
+; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm4, %xmm8
+; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
+; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm3
+; AVX512-FCP-NEXT: vpbroadcastd 36(%rax), %ymm0
+; AVX512-FCP-NEXT: vpbroadcastd 40(%rax), %ymm4
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm16 & (zmm0 ^ zmm3))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm1 & (zmm0 ^ zmm5))
; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512-FCP-NEXT: vpternlogd {{.*#+}} ymm0 = mem ^ (ymm1 & (ymm0 ^ mem))
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm1 & (ymm3 ^ ymm0))
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 ^ (ymm5 & (ymm9 ^ ymm14))
-; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm9[0,1,2,3],zmm3[0,1,2,3]
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm0))
-; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm0 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm0 # 64-byte Folded Reload
-; AVX512-FCP-NEXT: # zmm0 = (zmm0 & zmm5) | mem
-; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm2 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm2 # 64-byte Folded Reload
-; AVX512-FCP-NEXT: # zmm2 = (zmm2 & zmm5) | mem
-; AVX512-FCP-NEXT: vmovdqa (%rsp), %xmm3 # 16-byte Reload
-; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
-; AVX512-FCP-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
-; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
-; AVX512-FCP-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7]
+; AVX512-FCP-NEXT: vpternlogd {{.*#+}} ymm11 = mem ^ (ymm1 & (ymm11 ^ mem))
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm12 ^ (ymm3 & (ymm12 ^ ymm11))
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 ^ (ymm11 & (ymm9 ^ ymm13))
+; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm9[0,1,2,3],zmm12[0,1,2,3]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [12,13,10,10,14,14,14,14]
+; AVX512-FCP-NEXT: vpermd %zmm20, %zmm1, %zmm1
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [14,15,11,11,15,15,14,15]
+; AVX512-FCP-NEXT: vpermd %zmm20, %zmm5, %zmm5
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm4))
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm13 = zmm4 ^ (mem & (zmm13 ^ zmm4))
+; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX512-FCP-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm26, %xmm5
+; AVX512-FCP-NEXT: vmovdqa64 %xmm25, %xmm6
+; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
-; AVX512-FCP-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7]
-; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm3, %xmm3
-; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm6, %xmm6
+; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm6, %xmm6
+; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm5
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm27 = zmm27 ^ (mem & (zmm27 ^ zmm7))
; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm7[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm8 = mem[0,1,1,3,4,5,5,7]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7,8,9],ymm7[10],ymm8[11,12],ymm7[13],ymm8[14,15]
-; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
-; AVX512-FCP-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm9 = mem[1,1,1,1,5,5,5,5]
-; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7,8,9],ymm9[10],ymm8[11,12],ymm9[13],ymm8[14,15]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,1,4,5,4,5,5,7]
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512-FCP-NEXT: vpermd %ymm11, %ymm9, %ymm9
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[16,17],zero,zero
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
-; AVX512-FCP-NEXT: vpandn %ymm9, %ymm13, %ymm9
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm11, %zmm9
-; AVX512-FCP-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
-; AVX512-FCP-NEXT: # xmm11 = mem[0,2,3,3,4,5,6,7]
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1]
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,3]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,3,3,4,5,6,7]
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1]
+; AVX512-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0,1],ymm7[2],ymm9[3,4],ymm7[5],ymm9[6,7,8,9],ymm7[10],ymm9[11,12],ymm7[13],ymm9[14,15]
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
+; AVX512-FCP-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: # ymm10 = mem[1,1,1,1,5,5,5,5]
+; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7,8,9],ymm10[10],ymm9[11,12],ymm10[13],ymm9[14,15]
+; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 128(%rax)
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1]
+; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm2 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: # zmm2 = (zmm2 & zmm11) | mem
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1]
+; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
+; AVX512-FCP-NEXT: # xmm10 = mem[0,2,3,3,4,5,6,7]
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1]
; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,3]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,3,3,4,5,6,7]
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1]
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,3]
; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,2]
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3]
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (mem & (zmm14 ^ zmm13))
-; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm11 ^ (zmm28 & (zmm3 ^ zmm11))
-; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3]
+; AVX512-FCP-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm8 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: # zmm8 = (zmm8 & zmm11) | mem
+; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 32-byte Folded Reload
; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm5 ^ (zmm28 & (zmm6 ^ zmm5))
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm0 ^ (zmm5 & (zmm3 ^ zmm0))
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm2 ^ (zmm5 & (zmm6 ^ zmm2))
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm0
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm10 ^ (zmm15 & (zmm6 ^ zmm10))
+; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm4 ^ (zmm15 & (zmm5 ^ zmm4))
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm2 ^ (zmm4 & (zmm6 ^ zmm2))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm8 ^ (zmm4 & (zmm5 ^ zmm8))
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm2, %zmm2
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm0 ^ (mem & (zmm2 ^ zmm0))
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
-; AVX512-FCP-NEXT: # zmm0 = zmm0 | (zmm1 & mem)
-; AVX512-FCP-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm9 # 64-byte Folded Reload
-; AVX512-FCP-NEXT: # zmm9 = zmm9 | (zmm1 & mem)
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (mem & (zmm9 ^ zmm2))
-; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 320(%rax)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 256(%rax)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 192(%rax)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm20, 128(%rax)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm26, (%rax)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm29, 448(%rax)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 704(%rax)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm23, 640(%rax)
-; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512-FCP-NEXT: vmovaps %zmm1, 576(%rax)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 384(%rax)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 64(%rax)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 512(%rax)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 832(%rax)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 768(%rax)
-; AVX512-FCP-NEXT: addq $1576, %rsp # imm = 0x628
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm2
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [11,0,0,11,0,0,0,12]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm2 ^ (mem & (zmm4 ^ zmm2))
+; AVX512-FCP-NEXT: vpermd %zmm20, %zmm7, %zmm2
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [8,9,12,13,12,13,13,15]
+; AVX512-FCP-NEXT: vpermd %zmm20, %zmm8, %zmm9
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm2, %zmm2
+; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm2 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: # zmm2 = zmm2 ^ (zmm3 & (zmm2 ^ mem))
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512-FCP-NEXT: vpermd %zmm9, %zmm7, %zmm7
+; AVX512-FCP-NEXT: vpermd %zmm9, %zmm8, %zmm8
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7
+; AVX512-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm7 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: # zmm7 = zmm7 ^ (zmm3 & (zmm7 ^ mem))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm4))
+; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 320(%rax)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 256(%rax)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm28, 192(%rax)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm14, (%rax)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 448(%rax)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm29, 704(%rax)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm30, 640(%rax)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm21, 576(%rax)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm27, 384(%rax)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 64(%rax)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 512(%rax)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 832(%rax)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 768(%rax)
+; AVX512-FCP-NEXT: addq $1288, %rsp # imm = 0x508
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: store_i16_stride7_vf64:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: subq $2840, %rsp # imm = 0xB18
-; AVX512DQ-NEXT: vmovdqa 96(%rcx), %ymm6
-; AVX512DQ-NEXT: vmovdqa 96(%rdx), %ymm13
-; AVX512DQ-NEXT: vmovdqa 96(%rdi), %ymm7
-; AVX512DQ-NEXT: vmovdqa 96(%rsi), %ymm8
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
-; AVX512DQ-NEXT: vpshufb %ymm0, %ymm6, %ymm2
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpshufb %ymm1, %ymm13, %ymm3
-; AVX512DQ-NEXT: vporq %ymm2, %ymm3, %ymm17
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX512DQ-NEXT: vpshufb %ymm10, %ymm8, %ymm2
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19]
-; AVX512DQ-NEXT: vpshufb %ymm11, %ymm7, %ymm3
-; AVX512DQ-NEXT: vporq %ymm2, %ymm3, %ymm18
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
-; AVX512DQ-NEXT: vmovdqa 64(%r9), %ymm2
-; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpshufb %ymm12, %ymm2, %ymm2
-; AVX512DQ-NEXT: vmovdqa 64(%r8), %ymm3
-; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
-; AVX512DQ-NEXT: vpshufb %ymm9, %ymm3, %ymm3
-; AVX512DQ-NEXT: vpor %ymm2, %ymm3, %ymm2
-; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vmovdqa 64(%rcx), %ymm3
-; AVX512DQ-NEXT: vpshufb %ymm0, %ymm3, %ymm2
-; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm26
-; AVX512DQ-NEXT: vmovdqa 64(%rdx), %ymm4
-; AVX512DQ-NEXT: vpshufb %ymm1, %ymm4, %ymm3
-; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm27
-; AVX512DQ-NEXT: vpor %ymm2, %ymm3, %ymm2
-; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vmovdqa 64(%rsi), %ymm3
-; AVX512DQ-NEXT: vpshufb %ymm10, %ymm3, %ymm2
-; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm23
-; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm4
-; AVX512DQ-NEXT: vpshufb %ymm11, %ymm4, %ymm3
-; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm22
-; AVX512DQ-NEXT: vpor %ymm2, %ymm3, %ymm2
-; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vmovdqa (%r9), %ymm2
-; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpshufb %ymm12, %ymm2, %ymm2
-; AVX512DQ-NEXT: vmovdqa (%r8), %ymm3
-; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpshufb %ymm9, %ymm3, %ymm3
-; AVX512DQ-NEXT: vpor %ymm2, %ymm3, %ymm2
-; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm2
-; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm2
-; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm3
-; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpshufb %ymm1, %ymm3, %ymm3
-; AVX512DQ-NEXT: vpor %ymm2, %ymm3, %ymm2
-; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm2
-; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpshufb %ymm10, %ymm2, %ymm2
-; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm3
-; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpshufb %ymm11, %ymm3, %ymm3
-; AVX512DQ-NEXT: vpor %ymm2, %ymm3, %ymm2
-; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vmovdqa 32(%rcx), %ymm4
-; AVX512DQ-NEXT: vpshufb %ymm0, %ymm4, %ymm0
-; AVX512DQ-NEXT: vmovdqa 32(%rdx), %ymm2
-; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm1
-; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0
-; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm5
-; AVX512DQ-NEXT: vpshufb %ymm10, %ymm5, %ymm0
-; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm3
-; AVX512DQ-NEXT: vpshufb %ymm11, %ymm3, %ymm1
-; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0
-; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vmovdqa 32(%r8), %ymm1
-; AVX512DQ-NEXT: vpshufb %ymm9, %ymm1, %ymm9
-; AVX512DQ-NEXT: vmovdqa 32(%r9), %ymm0
-; AVX512DQ-NEXT: vpshufb %ymm12, %ymm0, %ymm10
-; AVX512DQ-NEXT: vpor %ymm10, %ymm9, %ymm9
-; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
-; AVX512DQ-NEXT: # ymm10 = mem[0,1,0,1]
-; AVX512DQ-NEXT: vpshufb %ymm10, %ymm6, %ymm9
-; AVX512DQ-NEXT: vmovdqa64 %ymm10, %ymm31
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm13[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7,8,9],ymm10[10],ymm9[11,12],ymm10[13],ymm9[14,15]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3]
-; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm9 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm13[3,3,3,3,7,7,7,7]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6,7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13,14,15]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3]
-; AVX512DQ-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm9 = ymm8[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm7[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6,7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13,14,15]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3]
-; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm9 = ymm8[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm7[3,3,3,3,7,7,7,7]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7,8,9,10],ymm9[11],ymm10[12,13],ymm9[14],ymm10[15]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,3,3]
-; AVX512DQ-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vmovdqa 96(%r9), %ymm9
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm10 = ymm9[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm10[3,3,3,3]
-; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm9[2,1,2,3,6,5,6,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm30 = ymm10[2,2,2,2]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm13[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,0,0,0,4,4,4,4]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm10[0,1],ymm6[2],ymm10[3,4],ymm6[5],ymm10[6,7,8,9],ymm6[10],ymm10[11,12],ymm6[13],ymm10[14,15]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,2]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,0,2,1,4,4,6,5]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7,8,9],ymm7[10],ymm8[11,12],ymm7[13],ymm8[14,15]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3]
-; AVX512DQ-NEXT: vprold $16, %ymm9, %ymm8
-; AVX512DQ-NEXT: vpshufb %ymm12, %ymm9, %ymm9
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm17, %zmm6
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm18, %zmm7
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = zmm6 ^ (mem & (zmm7 ^ zmm6))
-; AVX512DQ-NEXT: vmovdqa 96(%r8), %ymm6
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm6[1,2,2,3,5,6,6,7]
-; AVX512DQ-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm6[0,0,2,1,4,4,6,5]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm6[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm6[16,17,u,u,u,u],zero,zero
-; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 | (ymm7 & ymm11)
-; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm12
-; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 | (ymm6 & ymm11)
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm7, %ymm6
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm10[2,1,3,3]
-; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (ymm10 & (ymm7 ^ ymm6))
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm8[2,2,2,2]
-; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (ymm8 & (ymm6 ^ ymm7))
-; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm9[0,1,2,3],zmm6[0,1,2,3]
-; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,5,0,0,0,6,0,0,6,0,0,0,7,0,0,7]
-; AVX512DQ-NEXT: vmovdqa 96(%rax), %ymm6
-; AVX512DQ-NEXT: vpermd %zmm6, %zmm18, %zmm7
-; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm6[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
-; AVX512DQ-NEXT: vpshufb %ymm11, %ymm6, %ymm6
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3]
-; AVX512DQ-NEXT: vpandn %ymm7, %ymm12, %ymm7
-; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm19
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6
-; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm28 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
-; AVX512DQ-NEXT: vpbroadcastd 72(%rax), %ymm6
-; AVX512DQ-NEXT: vpandnq %ymm6, %ymm28, %ymm6
-; AVX512DQ-NEXT: vmovdqa 64(%rax), %ymm7
-; AVX512DQ-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpshufb %ymm11, %ymm7, %ymm7
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6
-; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vmovdqa 64(%r9), %xmm7
-; AVX512DQ-NEXT: vmovdqa 64(%r8), %xmm8
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
-; AVX512DQ-NEXT: vmovdqa64 %xmm8, %xmm17
-; AVX512DQ-NEXT: vmovdqa64 %xmm7, %xmm20
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512DQ-NEXT: vpshufb %xmm15, %xmm6, %xmm6
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1]
-; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vmovdqa 64(%rcx), %xmm9
-; AVX512DQ-NEXT: vmovdqa 64(%rdx), %xmm7
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,3,3,4,5,6,7]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1]
-; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm14
-; AVX512DQ-NEXT: vmovdqa 64(%rsi), %xmm10
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,5,4]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,3]
-; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vpbroadcastd 8(%rax), %ymm8
-; AVX512DQ-NEXT: vpandnq %ymm8, %ymm28, %ymm8
-; AVX512DQ-NEXT: vmovdqa (%rax), %ymm12
-; AVX512DQ-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpshufb %ymm11, %ymm12, %ymm13
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm13, %zmm8, %zmm8
-; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vmovdqa (%r9), %xmm6
-; AVX512DQ-NEXT: vmovdqa (%r8), %xmm12
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7]
-; AVX512DQ-NEXT: vmovdqa64 %xmm12, %xmm29
-; AVX512DQ-NEXT: vmovdqa64 %xmm6, %xmm24
-; AVX512DQ-NEXT: vpshufb %xmm15, %xmm8, %xmm8
-; AVX512DQ-NEXT: vmovdqa64 %xmm15, %xmm25
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1]
-; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm8
-; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm13
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm13[4],xmm8[4],xmm13[5],xmm8[5],xmm13[6],xmm8[6],xmm13[7],xmm8[7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,2,3,3,4,5,6,7]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm15[0,0,2,1]
-; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm6
-; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm12
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7]
-; AVX512DQ-NEXT: vmovdqa64 %xmm12, %xmm21
-; AVX512DQ-NEXT: vmovdqa64 %xmm6, %xmm16
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[2,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,4]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm15[0,0,1,3]
-; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm15 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[0,0,0,0,4,4,4,4]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm2[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm15[2],ymm12[3,4],ymm15[5],ymm12[6,7,8,9],ymm15[10],ymm12[11,12],ymm15[13],ymm12[14,15]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,2]
-; AVX512DQ-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm12 = ymm5[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,0,2,1,4,4,6,5]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm15 = ymm3[1,1,1,1,5,5,5,5]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm15[2],ymm12[3,4],ymm15[5],ymm12[6,7,8,9],ymm15[10],ymm12[11,12],ymm15[13],ymm12[14,15]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3]
-; AVX512DQ-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vmovdqa 32(%rax), %ymm15
-; AVX512DQ-NEXT: vpshufb %ymm11, %ymm15, %ymm11
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm15[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,2,2,3]
-; AVX512DQ-NEXT: vpandnq %ymm12, %ymm19, %ymm12
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm11, %zmm11
-; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm11 = ymm0[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[0,0,0,0,4,4,4,4]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[0,0,2,1,4,4,6,5]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7,8,9,10],ymm12[11],ymm11[12,13],ymm12[14],ymm11[15]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,3]
-; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vmovdqa64 %ymm31, %ymm6
-; AVX512DQ-NEXT: vpshufb %ymm6, %ymm4, %ymm11
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm2[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7,8,9],ymm12[10],ymm11[11,12],ymm12[13],ymm11[14,15]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,3]
-; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512DQ-NEXT: subq $2648, %rsp # imm = 0xA58
+; AVX512DQ-NEXT: vmovdqa 96(%rcx), %ymm3
+; AVX512DQ-NEXT: vmovdqa 96(%rdx), %ymm2
+; AVX512DQ-NEXT: vmovdqa 96(%rdi), %ymm1
+; AVX512DQ-NEXT: vmovdqa 96(%rsi), %ymm0
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
+; AVX512DQ-NEXT: vpshufb %ymm8, %ymm3, %ymm4
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
+; AVX512DQ-NEXT: vpshufb %ymm9, %ymm2, %ymm5
+; AVX512DQ-NEXT: vporq %ymm4, %ymm5, %ymm22
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512DQ-NEXT: vpshufb %ymm12, %ymm0, %ymm4
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm14 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19]
+; AVX512DQ-NEXT: vpshufb %ymm14, %ymm1, %ymm5
+; AVX512DQ-NEXT: vporq %ymm4, %ymm5, %ymm23
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
+; AVX512DQ-NEXT: vmovdqa 64(%r9), %ymm4
+; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vpshufb %ymm7, %ymm4, %ymm4
+; AVX512DQ-NEXT: vmovdqa 64(%r8), %ymm5
+; AVX512DQ-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
+; AVX512DQ-NEXT: vpshufb %ymm11, %ymm5, %ymm5
+; AVX512DQ-NEXT: vpor %ymm4, %ymm5, %ymm4
+; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vmovdqa 64(%rcx), %ymm5
+; AVX512DQ-NEXT: vpshufb %ymm8, %ymm5, %ymm4
+; AVX512DQ-NEXT: vmovdqa64 %ymm5, %ymm29
+; AVX512DQ-NEXT: vmovdqa 64(%rdx), %ymm6
+; AVX512DQ-NEXT: vpshufb %ymm9, %ymm6, %ymm5
+; AVX512DQ-NEXT: vmovdqa64 %ymm6, %ymm31
+; AVX512DQ-NEXT: vpor %ymm4, %ymm5, %ymm4
+; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vmovdqa 64(%rsi), %ymm5
+; AVX512DQ-NEXT: vpshufb %ymm12, %ymm5, %ymm4
+; AVX512DQ-NEXT: vmovdqa64 %ymm5, %ymm20
+; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm6
+; AVX512DQ-NEXT: vpshufb %ymm14, %ymm6, %ymm5
+; AVX512DQ-NEXT: vmovdqa64 %ymm6, %ymm21
+; AVX512DQ-NEXT: vpor %ymm4, %ymm5, %ymm4
+; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vmovdqa (%r9), %ymm4
+; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vpshufb %ymm7, %ymm4, %ymm4
+; AVX512DQ-NEXT: vmovdqa (%r8), %ymm5
+; AVX512DQ-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vpshufb %ymm11, %ymm5, %ymm5
+; AVX512DQ-NEXT: vmovdqa64 %ymm11, %ymm27
+; AVX512DQ-NEXT: vpor %ymm4, %ymm5, %ymm4
+; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm4
+; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vpshufb %ymm8, %ymm4, %ymm4
+; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm5
+; AVX512DQ-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vpshufb %ymm9, %ymm5, %ymm5
+; AVX512DQ-NEXT: vpor %ymm4, %ymm5, %ymm4
+; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm4
+; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vpshufb %ymm12, %ymm4, %ymm4
+; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm5
+; AVX512DQ-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vpshufb %ymm14, %ymm5, %ymm5
+; AVX512DQ-NEXT: vpor %ymm4, %ymm5, %ymm4
+; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vmovdqa 32(%rcx), %ymm11
+; AVX512DQ-NEXT: vpshufb %ymm8, %ymm11, %ymm4
+; AVX512DQ-NEXT: vmovdqa 32(%rdx), %ymm8
+; AVX512DQ-NEXT: vpshufb %ymm9, %ymm8, %ymm5
+; AVX512DQ-NEXT: vpor %ymm4, %ymm5, %ymm4
+; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm10
+; AVX512DQ-NEXT: vpshufb %ymm12, %ymm10, %ymm4
+; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm12
+; AVX512DQ-NEXT: vpshufb %ymm14, %ymm12, %ymm5
+; AVX512DQ-NEXT: vpor %ymm4, %ymm5, %ymm4
+; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vmovdqa 32(%r8), %ymm15
+; AVX512DQ-NEXT: vmovdqa64 %ymm27, %ymm4
+; AVX512DQ-NEXT: vpshufb %ymm4, %ymm15, %ymm4
+; AVX512DQ-NEXT: vmovdqa 32(%r9), %ymm13
+; AVX512DQ-NEXT: vpshufb %ymm7, %ymm13, %ymm5
+; AVX512DQ-NEXT: vpor %ymm5, %ymm4, %ymm4
+; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
+; AVX512DQ-NEXT: # ymm14 = mem[0,1,0,1]
+; AVX512DQ-NEXT: vpshufb %ymm14, %ymm3, %ymm4
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
+; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm3[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6,7,8],ymm4[9],ymm2[10,11],ymm4[12],ymm2[13,14,15]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5,6,7,8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13,14,15]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[3,3,3,3,7,7,7,7]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm5[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8,9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3]
-; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vprold $16, %ymm0, %ymm2
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[1,2,2,3,5,6,6,7]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[3,3,3,3,7,7,7,7]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
+; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm0[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm1[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
+; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm0[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm1[3,3,3,3,7,7,7,7]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8,9,10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3]
+; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vmovdqa 96(%r9), %ymm4
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm5 = ymm4[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm27 = ymm5[3,3,3,3]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm4[2,1,2,3,6,5,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm28 = ymm5[2,2,2,2]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7,8,9],ymm3[10],ymm2[11,12],ymm3[13],ymm2[14,15]
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,2]
-; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,3,3,3,7,7,7,7]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,3,6,6,6,7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,1,1,5,5,5,5]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5]
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
+; AVX512DQ-NEXT: vprold $16, %ymm4, %ymm1
+; AVX512DQ-NEXT: vpshufb %ymm7, %ymm4, %ymm3
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm22, %zmm2
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm23, %zmm0
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm2 ^ (mem & (zmm0 ^ zmm2))
+; AVX512DQ-NEXT: vmovdqa 96(%r8), %ymm2
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm2[1,2,2,3,5,6,6,7]
+; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm2[0,0,2,1,4,4,6,5]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm2[16,17,u,u,u,u],zero,zero
+; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 | (ymm0 & ymm5)
+; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm30 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 | (ymm2 & ymm30)
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm4[2,1,3,3]
+; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (ymm4 & (ymm2 ^ ymm0))
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm1[2,2,2,2]
+; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm26 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (ymm26 & (ymm0 ^ ymm2))
+; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm3[0,1,2,3],zmm0[0,1,2,3]
+; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
+; AVX512DQ-NEXT: vpbroadcastd 72(%rax), %ymm0
+; AVX512DQ-NEXT: vpandn %ymm0, %ymm1, %ymm0
+; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm6
+; AVX512DQ-NEXT: vmovdqa 64(%rax), %ymm2
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
+; AVX512DQ-NEXT: vpshufb %ymm3, %ymm2, %ymm1
+; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm16
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vmovdqa 64(%r9), %xmm1
+; AVX512DQ-NEXT: vmovdqa 64(%r8), %xmm2
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm22
+; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm19
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512DQ-NEXT: vpshufb %xmm9, %xmm0, %xmm0
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
+; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vmovdqa 64(%rcx), %xmm5
+; AVX512DQ-NEXT: vmovdqa 64(%rdx), %xmm7
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm2[0,0,2,1]
+; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm0
+; AVX512DQ-NEXT: vmovdqa 64(%rsi), %xmm2
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm17
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,4]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm4[0,0,1,3]
+; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vmovdqa (%rax), %ymm0
; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpermd %zmm15, %zmm18, %zmm0
+; AVX512DQ-NEXT: vpshufb %ymm3, %ymm0, %ymm3
+; AVX512DQ-NEXT: vpbroadcastd 8(%rax), %ymm4
+; AVX512DQ-NEXT: vpandn %ymm4, %ymm6, %ymm4
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vmovdqa64 %ymm23, %ymm12
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm12[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm22[1,1,1,1,5,5,5,5]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
+; AVX512DQ-NEXT: vmovdqa (%r9), %xmm0
+; AVX512DQ-NEXT: vmovdqa (%r8), %xmm1
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm23
+; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm24
+; AVX512DQ-NEXT: vpshufb %xmm9, %xmm3, %xmm3
+; AVX512DQ-NEXT: vmovdqa64 %xmm9, %xmm25
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm3[0,0,1,1]
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm12[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm22[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm3
+; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm4
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,3,3,4,5,6,7]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm6[0,0,2,1]
+; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm6
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
+; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm18
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[2,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,5,4]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm9[0,0,1,3]
+; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vmovdqa %ymm14, %ymm1
+; AVX512DQ-NEXT: vpshufb %ymm14, %ymm11, %ymm9
+; AVX512DQ-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm8[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm14[2],ymm9[3,4],ymm14[5],ymm9[6,7,8,9],ymm14[10],ymm9[11,12],ymm14[13],ymm9[14,15]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm9[2,2,2,3]
+; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm9 = ymm11[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm8[3,3,3,3,7,7,7,7]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm9 = ymm14[0],ymm9[1],ymm14[2,3],ymm9[4],ymm14[5,6,7,8],ymm9[9],ymm14[10,11],ymm9[12],ymm14[13,14,15]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm9[0,2,2,3]
; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vmovdqa64 %ymm26, %ymm2
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm27[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2]
+; AVX512DQ-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm9 = ymm10[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm12[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm9 = ymm14[0],ymm9[1],ymm14[2,3],ymm9[4],ymm14[5,6,7,8],ymm9[9],ymm14[10,11],ymm9[12],ymm14[13,14,15]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm9[0,2,2,3]
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vpshufb %ymm6, %ymm2, %ymm0
-; AVX512DQ-NEXT: vmovdqa64 %ymm26, %ymm3
-; AVX512DQ-NEXT: vmovdqa64 %ymm31, %ymm11
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm27[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm9 = ymm10[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm12[3,3,3,3,7,7,7,7]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm9 = ymm14[0,1,2],ymm9[3],ymm14[4,5],ymm9[6],ymm14[7,8,9,10],ymm9[11],ymm14[12,13],ymm9[14],ymm14[15]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm9[2,1,3,3]
; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm5[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
-; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[0,0,2,1,4,4,6,5]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8,9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3]
+; AVX512DQ-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vprold $16, %ymm13, %ymm9
+; AVX512DQ-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm15[1,2,2,3,5,6,6,7]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm9 = ymm14[0,1],ymm9[2],ymm14[3,4],ymm9[5],ymm14[6,7,8,9],ymm9[10],ymm14[11,12],ymm9[13],ymm14[14,15]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm9[2,1,3,2]
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vprold $16, %ymm5, %ymm0
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[1,2,2,3,5,6,6,7]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm9 = ymm13[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,3,6,6,6,7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm15[3,3,3,3,7,7,7,7]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm14[2],ymm9[3,4],ymm14[5],ymm9[6,7,8,9],ymm14[10],ymm9[11,12],ymm14[13],ymm9[14,15]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm9[2,2,2,3]
; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm27[3,3,3,3,7,7,7,7]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm3[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14,15]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm12
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm9 = ymm12[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,0,2,1,4,4,6,5]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm21[1,1,1,1,5,5,5,5]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm14[2],ymm9[3,4],ymm14[5],ymm9[6,7,8,9],ymm14[10],ymm9[11,12],ymm14[13],ymm9[14,15]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm9[2,2,2,3]
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vmovdqa 96(%rcx), %xmm0
-; AVX512DQ-NEXT: vmovdqa 96(%rdx), %xmm1
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,2,4,5,6,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
-; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm22[3,3,3,3,7,7,7,7]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm12[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8,9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3]
-; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vmovdqa 96(%rsi), %xmm2
-; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm3
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm12
-; AVX512DQ-NEXT: vpshufb %xmm12, %xmm4, %xmm4
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1]
-; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[3,3,3,3,7,7,7,7]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,2,2,3,6,6,6,7]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15]
-; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vprold $16, %xmm2, %xmm4
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,2,3]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7]
-; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
-; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} xmm6 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
-; AVX512DQ-NEXT: vpshufb %xmm6, %xmm0, %xmm2
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,2,2]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6],xmm2[7]
-; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm9 = ymm12[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
+; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm0
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm21[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm9 = ymm14[0],ymm9[1],ymm14[2,3],ymm9[4],ymm14[5,6,7,8],ymm9[9],ymm14[10,11],ymm9[12],ymm14[13,14,15]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm9[0,2,2,3]
+; AVX512DQ-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vmovdqa64 %ymm29, %ymm8
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm9 = ymm8[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,0,0,0,4,4,4,4]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm31[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm9 = ymm14[0,1],ymm9[2],ymm14[3,4],ymm9[5],ymm14[6,7,8,9],ymm9[10],ymm14[11,12],ymm9[13],ymm14[14,15]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,3,2]
+; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vpshufb %ymm1, %ymm8, %ymm9
+; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm29
+; AVX512DQ-NEXT: vmovdqa %ymm8, %ymm10
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm31[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm14[2],ymm9[3,4],ymm14[5],ymm9[6,7,8,9],ymm14[10],ymm9[11,12],ymm14[13],ymm9[14,15]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm9[2,2,2,3]
+; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm9 = ymm8[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,0,0,0,4,4,4,4]
+; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Reload
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm20[0,0,2,1,4,4,6,5]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1,2],ymm14[3],ymm9[4,5],ymm14[6],ymm9[7,8,9,10],ymm14[11],ymm9[12,13],ymm14[14],ymm9[15]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm9[2,2,3,3]
+; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vprold $16, %ymm8, %ymm9
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm20[1,2,2,3,5,6,6,7]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm9 = ymm14[0,1],ymm9[2],ymm14[3,4],ymm9[5],ymm14[6,7,8,9],ymm9[10],ymm14[11,12],ymm9[13],ymm14[14,15]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm9[2,1,3,2]
+; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm31[3,3,3,3,7,7,7,7]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm14 = ymm10[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm14[1],ymm9[2,3],ymm14[4],ymm9[5,6,7,8],ymm14[9],ymm9[10,11],ymm14[12],ymm9[13,14,15]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm9[0,2,2,3]
+; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vmovdqa 96(%rcx), %xmm9
+; AVX512DQ-NEXT: vmovdqa 96(%rdx), %xmm14
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm14[0],xmm9[0],xmm14[1],xmm9[1],xmm14[2],xmm9[2],xmm14[3],xmm9[3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,1,3,2,4,5,6,7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[0,0,1,1]
+; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm15 = ymm21[3,3,3,3,7,7,7,7]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm12 = ymm0[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm12 = ymm15[0,1,2],ymm12[3],ymm15[4,5],ymm12[6],ymm15[7,8,9,10],ymm12[11],ymm15[12,13],ymm12[14],ymm15[15]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm12[2,1,3,3]
+; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vmovdqa 96(%rsi), %xmm12
+; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm15
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3]
+; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm10
+; AVX512DQ-NEXT: vpshufb %xmm10, %xmm11, %xmm11
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm11[0,0,1,1]
+; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm11 = ymm20[3,3,3,3,7,7,7,7]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm13 = ymm8[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,2,2,3,6,6,6,7]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0,1],ymm11[2],ymm13[3,4],ymm11[5],ymm13[6,7,8,9],ymm11[10],ymm13[11,12],ymm11[13],ymm13[14,15]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm11[2,2,2,3]
+; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vprold $16, %xmm12, %xmm11
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm13 = xmm15[1,1,2,3]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1],xmm11[2],xmm13[3,4],xmm11[5],xmm13[6,7]
+; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7]
; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT: vmovdqa 96(%r9), %xmm0
-; AVX512DQ-NEXT: vmovdqa 96(%r8), %xmm1
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX512DQ-NEXT: vpshufb %xmm12, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [16,18,19,19,19,19,0,0,0,1,0,1,2,3,2,3]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,7,6]
-; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm18, %zmm0
-; AVX512DQ-NEXT: vpbroadcastd 100(%rax), %ymm1
-; AVX512DQ-NEXT: vpbroadcastd 104(%rax), %ymm2
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm31
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm31 = zmm31 ^ (mem & (zmm31 ^ zmm0))
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3]
-; AVX512DQ-NEXT: vpshufb %xmm6, %xmm9, %xmm1
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[1,1,2,2]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7]
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [16,16,17,17,17,17,0,0,0,1,0,1,2,3,2,3]
-; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm2, %zmm1
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3]
-; AVX512DQ-NEXT: vpshufb %xmm12, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
-; AVX512DQ-NEXT: vprold $16, %xmm10, %xmm3
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[1,1,2,3]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm14
-; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm14 = zmm1 ^ (zmm0 & (zmm14 ^ zmm1))
-; AVX512DQ-NEXT: vmovdqa64 %xmm17, %xmm1
-; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm3
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,5,7,6]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7]
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,0,1,0,1,1,3,16,18,19,19,19,19,0,0]
-; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm4, %zmm1
-; AVX512DQ-NEXT: vpbroadcastd 64(%rax), %ymm3
-; AVX512DQ-NEXT: vpbroadcastd 68(%rax), %ymm5
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm25
-; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm25 = zmm25 ^ (zmm19 & (zmm25 ^ zmm1))
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3]
-; AVX512DQ-NEXT: vpshufb %xmm6, %xmm8, %xmm3
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm13[1,1,2,2]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2,3],xmm3[4],xmm5[5,6],xmm3[7]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7]
-; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm2, %zmm3
-; AVX512DQ-NEXT: vmovdqa64 %xmm16, %xmm5
-; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm2
+; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm9, %xmm11
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[1,1,2,2]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm12[0],xmm11[1],xmm12[2,3],xmm11[4],xmm12[5,6],xmm11[7]
+; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vmovdqa 96(%r9), %xmm11
+; AVX512DQ-NEXT: vmovdqa 96(%r8), %xmm12
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm15 = xmm13[0,1,3,2,4,5,6,7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[0,1,1,3]
+; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm16[2,3,3,3,6,7,7,7]
+; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm14[4],xmm9[4],xmm14[5],xmm9[5],xmm14[6],xmm9[6],xmm14[7],xmm9[7]
+; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm9 = xmm13[0,1,2,3,4,5,7,6]
+; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm14
+; AVX512DQ-NEXT: vpshufb %xmm14, %xmm8, %xmm11
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [16,18,19,19,19,19,0,0,0,1,0,1,2,3,2,3]
+; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm8, %zmm11
+; AVX512DQ-NEXT: vpbroadcastd 100(%rax), %ymm9
+; AVX512DQ-NEXT: vpbroadcastd 104(%rax), %ymm12
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm9, %zmm21
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm26 & (zmm21 ^ zmm11))
+; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm25
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3]
+; AVX512DQ-NEXT: vmovdqa %xmm1, %xmm12
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm5, %xmm0
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[1,1,2,2]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4],xmm1[5,6],xmm0[7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,1,3,2,4,5,6,7]
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [16,16,17,17,17,17,0,0,0,1,0,1,2,3,2,3]
+; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm9, %zmm0
+; AVX512DQ-NEXT: vmovdqa64 %xmm17, %xmm5
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
-; AVX512DQ-NEXT: vpshufb %xmm12, %xmm1, %xmm1
+; AVX512DQ-NEXT: vpshufb %xmm14, %xmm1, %xmm1
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1]
-; AVX512DQ-NEXT: vprold $16, %xmm21, %xmm2
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm16[1,1,2,3]
+; AVX512DQ-NEXT: vprold $16, %xmm2, %xmm2
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm17[1,1,2,3]
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3,4],xmm2[5],xmm5[6,7]
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm5
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm3 ^ (zmm0 & (zmm5 ^ zmm3))
-; AVX512DQ-NEXT: vmovdqa64 %xmm29, %xmm0
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm2
+; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm0 ^ (zmm1 & (zmm2 ^ zmm0))
+; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm0
+; AVX512DQ-NEXT: vmovdqa64 %xmm19, %xmm5
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,5,7,6]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7]
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm11 = [0,1,0,1,0,1,1,3,16,18,19,19,19,19,0,0]
+; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm11, %zmm0
+; AVX512DQ-NEXT: vpbroadcastd 64(%rax), %ymm5
+; AVX512DQ-NEXT: vpbroadcastd 68(%rax), %ymm10
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm5, %zmm20
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm30 & (zmm20 ^ zmm0))
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512DQ-NEXT: vpshufb %xmm12, %xmm3, %xmm3
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7]
+; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm9, %zmm3
+; AVX512DQ-NEXT: vmovdqa64 %xmm18, %xmm5
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX512DQ-NEXT: vpshufb %xmm14, %xmm0, %xmm0
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
+; AVX512DQ-NEXT: vprold $16, %xmm6, %xmm4
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm18[1,1,2,3]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2],xmm5[3,4],xmm4[5],xmm5[6,7]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm10
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = zmm3 ^ (zmm1 & (zmm10 ^ zmm3))
+; AVX512DQ-NEXT: vmovdqa64 %xmm23, %xmm0
; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm1
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,7,6]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7]
-; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm4, %zmm0
+; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm11, %zmm0
; AVX512DQ-NEXT: vpbroadcastd (%rax), %ymm1
-; AVX512DQ-NEXT: vpbroadcastd 4(%rax), %ymm2
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm20
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm19 & (zmm20 ^ zmm0))
-; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm6[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
+; AVX512DQ-NEXT: vpbroadcastd 4(%rax), %ymm3
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm31
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm31 = zmm31 ^ (zmm30 & (zmm31 ^ zmm0))
+; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm11[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5]
-; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[1,1,1,1,5,5,5,5]
+; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[1,1,1,1,5,5,5,5]
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm6[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm11[0,1,2,3,5,4,6,7,8,9,10,11,13,12,14,15]
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[2,2,2,2,6,6,6,6]
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15]
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm9[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm6[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
-; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[0,1,1,3,4,5,5,7]
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,2]
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vpshufb %ymm11, %ymm9, %ymm0
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm3[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT: vmovdqa %ymm3, %ymm11
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
-; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm7[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vmovdqa64 %ymm29, %ymm0
+; AVX512DQ-NEXT: vpshufb %ymm0, %ymm6, %ymm0
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm29 = ymm0[2,2,2,3]
+; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
-; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[0,0,2,1,4,4,6,5]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7,8,9,10],ymm2[11],ymm0[12,13],ymm2[14],ymm0[15]
-; AVX512DQ-NEXT: vprold $16, %ymm7, %ymm0
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[1,2,2,3,5,6,6,7]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7,8,9],ymm0[10],ymm2[11,12],ymm0[13],ymm2[14,15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm11[3,3,3,3,7,7,7,7]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm9[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6,7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13,14,15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm10[3,3,3,3,7,7,7,7]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,2,2,2,6,6,6,6]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3],ymm0[4,5],ymm6[6],ymm0[7,8,9,10],ymm6[11],ymm0[12,13],ymm6[14],ymm0[15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm8[3,3,3,3,7,7,7,7]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,2,2,3,6,6,6,7]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7,8,9],ymm6[10],ymm7[11,12],ymm6[13],ymm7[14,15]
-; AVX512DQ-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm22 = ymm1[2,2,2,3]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm26 = ymm3[2,2,3,3]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm24 = ymm4[2,1,3,2]
-; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm9
-; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm10
-; AVX512DQ-NEXT: vprold $16, %xmm10, %xmm1
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[1,1,2,3]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7]
-; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm21 = ymm2[0,2,2,3]
-; AVX512DQ-NEXT: vmovdqa 32(%r9), %xmm1
-; AVX512DQ-NEXT: vmovdqa 32(%r8), %xmm2
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX512DQ-NEXT: vpshufb %xmm12, %xmm2, %xmm3
-; AVX512DQ-NEXT: vpshufb %xmm12, %xmm1, %xmm1
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,5,7,6]
-; AVX512DQ-NEXT: vmovdqa64 %xmm6, %xmm27
-; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm18, %zmm1
-; AVX512DQ-NEXT: vpbroadcastd 36(%rax), %ymm2
-; AVX512DQ-NEXT: vpbroadcastd 40(%rax), %ymm4
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm13
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (mem & (zmm13 ^ zmm1))
-; AVX512DQ-NEXT: vmovdqa 32(%rcx), %xmm7
+; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[0,0,2,1,4,4,6,5]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7,8,9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm0[2,2,3,3]
+; AVX512DQ-NEXT: vprold $16, %ymm4, %ymm0
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[1,2,2,3,5,6,6,7]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7,8,9],ymm0[10],ymm1[11,12],ymm0[13],ymm1[14,15]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm0[2,1,3,2]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm7[3,3,3,3,7,7,7,7]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm6[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14,15]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[3,3,3,3,7,7,7,7]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm11[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7,8,9,10],ymm3[11],ymm1[12,13],ymm3[14],ymm1[15]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm5[3,3,3,3,7,7,7,7]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm5 = ymm4[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,2,2,3,6,6,6,7]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7,8,9],ymm1[10],ymm5[11,12],ymm1[13],ymm5[14,15]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm26 = ymm0[0,2,2,3]
+; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm13
+; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm12
+; AVX512DQ-NEXT: vprold $16, %xmm12, %xmm0
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[1,1,2,3]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7]
+; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vmovdqa 32(%r9), %xmm0
+; AVX512DQ-NEXT: vmovdqa 32(%r8), %xmm1
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3]
+; AVX512DQ-NEXT: vmovdqa %xmm14, %xmm4
+; AVX512DQ-NEXT: vpshufb %xmm14, %xmm1, %xmm14
+; AVX512DQ-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,4,5,7,6]
+; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm8, %zmm0
+; AVX512DQ-NEXT: vpbroadcastd 36(%rax), %ymm1
+; AVX512DQ-NEXT: vpbroadcastd 40(%rax), %ymm6
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm1, %zmm30
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm30 = zmm30 ^ (zmm25 & (zmm30 ^ zmm0))
+; AVX512DQ-NEXT: vmovdqa 32(%rcx), %xmm11
; AVX512DQ-NEXT: vmovdqa 32(%rdx), %xmm6
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,2,4,5,6,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[0,0,1,1]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm0[2,1,3,3]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm3[0,0,1,1]
+; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,3,2,4,5,6,7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm18 = xmm8[0,0,1,1]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm19 = ymm3[2,1,3,3]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,0,1,1]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm24 = ymm5[2,2,2,3]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm7[0,1,3,2,4,5,6,7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm22 = xmm3[0,1,1,3]
+; AVX512DQ-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload
+; AVX512DQ-NEXT: # ymm23 = mem[2,3,3,3,6,7,7,7]
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 32-byte Folded Reload
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 32-byte Folded Reload
-; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm4 & (zmm1 ^ zmm3))
+; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 32-byte Folded Reload
+; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm8 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm8 & (zmm7 ^ zmm3))
; AVX512DQ-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm3 = mem[2,1,3,2]
-; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm29 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm29 & (ymm3 ^ ymm1))
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm30 = ymm30 ^ (ymm28 & (ymm30 ^ ymm3))
+; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm4 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm4 & (ymm3 ^ ymm7))
+; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm28 = ymm28 ^ (ymm16 & (ymm28 ^ ymm3))
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 32-byte Folded Reload
-; AVX512DQ-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm3 # 64-byte Folded Reload
-; AVX512DQ-NEXT: # zmm3 = (zmm3 & zmm28) | mem
+; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 32-byte Folded Reload
+; AVX512DQ-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm5 # 64-byte Folded Reload
+; AVX512DQ-NEXT: # zmm5 = (zmm5 & zmm16) | mem
+; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm0
+; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm16 # 32-byte Folded Reload
+; AVX512DQ-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload
+; AVX512DQ-NEXT: # zmm16 = (zmm16 & zmm0) | mem
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
-; AVX512DQ-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm28, %zmm0 # 64-byte Folded Reload
-; AVX512DQ-NEXT: # zmm0 = (zmm0 & zmm28) | mem
-; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm23 # 32-byte Folded Reload
-; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm18 # 32-byte Folded Reload
-; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm28 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm18 = zmm23 ^ (zmm28 & (zmm18 ^ zmm23))
-; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm18 = zmm3 ^ (zmm2 & (zmm18 ^ zmm3))
-; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload
-; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
-; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm23 # 32-byte Folded Reload
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm23 = zmm3 ^ (zmm28 & (zmm23 ^ zmm3))
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm23 = zmm0 ^ (zmm2 & (zmm23 ^ zmm0))
+; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm17 # 32-byte Folded Reload
+; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm25 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm17 = zmm0 ^ (zmm25 & (zmm17 ^ zmm0))
+; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm17 = zmm5 ^ (zmm1 & (zmm17 ^ zmm5))
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
-; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm17 # 32-byte Folded Reload
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm17 = zmm0 ^ (mem & (zmm17 ^ zmm0))
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm0
-; AVX512DQ-NEXT: vpternlogd {{.*#+}} ymm0 = mem ^ (ymm19 & (ymm0 ^ mem))
+; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
+; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm5 # 32-byte Folded Reload
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm0 ^ (zmm25 & (zmm5 ^ zmm0))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm16 ^ (zmm1 & (zmm5 ^ zmm16))
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm7, %ymm0
+; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
+; AVX512DQ-NEXT: vpternlogd {{.*#+}} ymm0 = mem ^ (ymm1 & (ymm0 ^ mem))
; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
-; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm2 = ymm2 ^ (ymm1 & (ymm2 ^ ymm0))
-; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512DQ-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm19 # 64-byte Folded Reload
-; AVX512DQ-NEXT: # zmm19 = zmm19 | (zmm1 & mem)
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm27 = ymm27 ^ (ymm1 & (ymm27 ^ ymm0))
+; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm28[0,1,2,3],zmm27[0,1,2,3]
+; AVX512DQ-NEXT: vmovdqa64 64(%rax), %zmm16
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,13,0,0,0,14,0,0,14,0,0,0,15,0,0,15]
+; AVX512DQ-NEXT: vpermd %zmm16, %zmm1, %zmm28
+; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm28 = zmm28 ^ (zmm3 & (zmm28 ^ zmm0))
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-NEXT: vshufi64x2 $68, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
-; AVX512DQ-NEXT: # zmm0 = zmm0[0,1,2,3],mem[0,1,2,3]
-; AVX512DQ-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
-; AVX512DQ-NEXT: # zmm0 = (zmm0 & zmm1) | mem
-; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm30[0,1,2,3],zmm2[0,1,2,3]
-; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
-; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm30 = zmm30 ^ (zmm2 & (zmm30 ^ zmm1))
-; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
-; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm2 & (zmm11 ^ zmm1))
-; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm25 = zmm25 ^ (zmm1 & (zmm25 ^ zmm14))
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm1 & (zmm20 ^ zmm5))
-; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
-; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload
-; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm1 ^ (zmm5 & (zmm2 ^ zmm1))
+; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
+; AVX512DQ-NEXT: vmovdqa64 (%rax), %zmm27
+; AVX512DQ-NEXT: vpermd %zmm27, %zmm1, %zmm7
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm3 & (zmm7 ^ zmm0))
+; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (zmm0 & (zmm20 ^ zmm2))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm31 = zmm31 ^ (zmm0 & (zmm31 ^ zmm10))
+; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
+; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm2 & (zmm1 ^ zmm0))
+; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm22, %zmm3, %zmm22
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm22 = zmm1 ^ (zmm5 & (zmm22 ^ zmm1))
-; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0]
-; AVX512DQ-NEXT: vpermd 64(%rax), %zmm14, %zmm5
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm29, %zmm3, %zmm29
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm29 = zmm0 ^ (zmm2 & (zmm29 ^ zmm0))
+; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,0,4,0,0,0,5,0,0,5,0,0,0,6,0,0]
+; AVX512DQ-NEXT: vpermd %zmm16, %zmm10, %zmm2
; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm3 & (zmm5 ^ zmm1))
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm24, %zmm26, %zmm1
-; AVX512DQ-NEXT: vpermd (%rax), %zmm14, %zmm14
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm3 & (zmm14 ^ zmm1))
-; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm1 & (zmm5 ^ zmm2))
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm1 & (zmm14 ^ zmm22))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm3 & (zmm2 ^ zmm0))
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm15, %zmm0
+; AVX512DQ-NEXT: vpermd %zmm27, %zmm10, %zmm10
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm3 & (zmm10 ^ zmm0))
+; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm0 & (zmm2 ^ zmm1))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm0 & (zmm10 ^ zmm29))
+; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
+; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm9 # 32-byte Folded Reload
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (zmm8 & (zmm9 ^ zmm0))
+; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; AVX512DQ-NEXT: # ymm0 = mem[0,1,1,3]
+; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 32-byte Folded Reload
-; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm4 & (zmm2 ^ zmm1))
-; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX512DQ-NEXT: # ymm1 = mem[0,1,1,3]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm8 & (zmm1 ^ zmm0))
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm18[0,1,1,3]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm26, %zmm0
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm14, %zmm19, %zmm18
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm18 = zmm0 ^ (zmm8 & (zmm18 ^ zmm0))
+; AVX512DQ-NEXT: vpermq $64, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; AVX512DQ-NEXT: # ymm0 = mem[0,0,0,1]
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1
-; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm22 # 32-byte Folded Reload
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm22 = zmm1 ^ (zmm4 & (zmm22 ^ zmm1))
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm12[0,1,1,3]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm21, %zmm1
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm8, %zmm15, %zmm8
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm1 ^ (zmm4 & (zmm8 ^ zmm1))
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[1,1,2,2]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm9[0],xmm4[1],xmm9[2,3],xmm4[4],xmm9[5,6],xmm4[7]
-; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
-; AVX512DQ-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
-; AVX512DQ-NEXT: # ymm7 = mem[2,2,2,3]
-; AVX512DQ-NEXT: vpshuflw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
-; AVX512DQ-NEXT: # xmm9 = mem[0,1,3,2,4,5,6,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,1,3]
-; AVX512DQ-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
-; AVX512DQ-NEXT: # ymm10 = mem[2,3,3,3,6,7,7,7]
-; AVX512DQ-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Folded Reload
-; AVX512DQ-NEXT: # ymm16 = mem[0,0,2,1]
-; AVX512DQ-NEXT: vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
-; AVX512DQ-NEXT: # xmm12 = mem[2,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,5,4]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,3]
-; AVX512DQ-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload
-; AVX512DQ-NEXT: # ymm21 = mem[0,0,1,1]
-; AVX512DQ-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
-; AVX512DQ-NEXT: # xmm15 = mem[0,2,3,3,4,5,6,7]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,2,1]
-; AVX512DQ-NEXT: vpermq $234, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload
-; AVX512DQ-NEXT: # ymm24 = mem[2,2,2,3]
-; AVX512DQ-NEXT: vmovdqa64 %xmm27, %xmm3
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,2,4,5,6,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,1,3]
-; AVX512DQ-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload
-; AVX512DQ-NEXT: # ymm26 = mem[2,3,3,3,6,7,7,7]
-; AVX512DQ-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm27 # 32-byte Folded Reload
-; AVX512DQ-NEXT: # ymm27 = mem[0,0,2,1]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0
+; AVX512DQ-NEXT: vpermq $182, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
+; AVX512DQ-NEXT: # ymm8 = mem[2,1,3,2]
+; AVX512DQ-NEXT: vpbroadcastd 96(%rax), %ymm14
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm14, %zmm8, %zmm14
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm4 & (zmm14 ^ zmm0))
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm22[0,0,0,1]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm24, %zmm0
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm23[2,1,3,2]
+; AVX512DQ-NEXT: vpbroadcastd 32(%rax), %ymm19
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm19, %zmm8, %zmm8
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm4 & (zmm8 ^ zmm0))
+; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm0 & (zmm14 ^ zmm1))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (zmm0 & (zmm8 ^ zmm18))
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm6[1,1,2,2]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm12[0],xmm1[1],xmm12[2,3],xmm1[4],xmm12[5,6],xmm1[7]
+; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7]
+; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
+; AVX512DQ-NEXT: # ymm11 = mem[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT: vpshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
+; AVX512DQ-NEXT: # ymm12 = mem[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,0,0,0,4,4,4,4]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7,8,9],ymm12[10],ymm11[11,12],ymm12[13],ymm11[14,15]
+; AVX512DQ-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
+; AVX512DQ-NEXT: # ymm12 = mem[1,1,1,1,5,5,5,5]
+; AVX512DQ-NEXT: vpshuflw $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
+; AVX512DQ-NEXT: # ymm13 = mem[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[0,0,2,1,4,4,6,5]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7,8,9],ymm12[10],ymm13[11,12],ymm12[13],ymm13[14,15]
+; AVX512DQ-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
+; AVX512DQ-NEXT: # ymm13 = mem[0,0,2,1,4,4,6,5]
+; AVX512DQ-NEXT: vpshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
+; AVX512DQ-NEXT: # ymm15 = mem[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[0,0,0,0,4,4,4,4]
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3],ymm15[4,5],ymm13[6],ymm15[7,8,9,10],ymm13[11],ymm15[12,13],ymm13[14],ymm15[15]
+; AVX512DQ-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
+; AVX512DQ-NEXT: # ymm15 = mem[0,0,2,1]
+; AVX512DQ-NEXT: vpshuflw $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
+; AVX512DQ-NEXT: # xmm4 = mem[2,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,5,4]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,3]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm15, %zmm4
+; AVX512DQ-NEXT: vpermq $80, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
+; AVX512DQ-NEXT: # ymm15 = mem[0,0,1,1]
+; AVX512DQ-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
+; AVX512DQ-NEXT: # xmm3 = mem[0,2,3,3,4,5,6,7]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
+; AVX512DQ-NEXT: vpermq $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Folded Reload
+; AVX512DQ-NEXT: # ymm18 = mem[0,0,2,1]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,4]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,3,3,4,5,6,7]
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm7, %zmm7
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm10[2,1,3,2]
-; AVX512DQ-NEXT: vpbroadcastd 96(%rax), %ymm10
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (zmm29 & (zmm9 ^ zmm7))
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm24, %zmm3
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm26[2,1,3,2]
-; AVX512DQ-NEXT: vpbroadcastd 32(%rax), %ymm10
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm7, %zmm7
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm29 & (zmm7 ^ zmm3))
-; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (zmm3 & (zmm9 ^ zmm22))
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm3 & (zmm7 ^ zmm8))
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm16, %zmm3
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm15, %zmm21, %zmm8
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm3 ^ (zmm28 & (zmm8 ^ zmm3))
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm27, %zmm1
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm3
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm1 ^ (zmm28 & (zmm3 ^ zmm1))
-; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm31 = zmm31 ^ (zmm1 & (zmm31 ^ zmm8))
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (zmm1 & (zmm13 ^ zmm3))
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm17))
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (mem & (zmm11 ^ zmm2))
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,3]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm15, %zmm3
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm4 ^ (zmm25 & (zmm3 ^ zmm4))
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm18, %zmm0
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm1, %zmm1
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm25 & (zmm1 ^ zmm0))
+; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm0 & (zmm21 ^ zmm3))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm30 = zmm30 ^ (zmm0 & (zmm30 ^ zmm1))
+; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm0
+; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm1, %zmm1
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (mem & (zmm1 ^ zmm0))
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [11,0,0,11,0,0,0,12,8,9,12,13,12,13,13,15]
+; AVX512DQ-NEXT: vpermd %zmm16, %zmm0, %zmm3
+; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
+; AVX512DQ-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm3 # 64-byte Folded Reload
+; AVX512DQ-NEXT: # zmm3 = zmm3 ^ (zmm6 & (zmm3 ^ mem))
+; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm13[0,1,2,3]
+; AVX512DQ-NEXT: vpermd %zmm27, %zmm0, %zmm0
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm6 & (zmm0 ^ zmm4))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm9))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT: vmovdqa64 %zmm0, 320(%rax)
-; AVX512DQ-NEXT: vmovdqa64 %zmm13, 256(%rax)
-; AVX512DQ-NEXT: vmovdqa64 %zmm7, 192(%rax)
-; AVX512DQ-NEXT: vmovdqa64 %zmm14, 128(%rax)
-; AVX512DQ-NEXT: vmovdqa64 %zmm23, 64(%rax)
-; AVX512DQ-NEXT: vmovdqa64 %zmm20, (%rax)
-; AVX512DQ-NEXT: vmovdqa64 %zmm25, 448(%rax)
-; AVX512DQ-NEXT: vmovdqa64 %zmm31, 704(%rax)
-; AVX512DQ-NEXT: vmovdqa64 %zmm9, 640(%rax)
-; AVX512DQ-NEXT: vmovdqa64 %zmm5, 576(%rax)
-; AVX512DQ-NEXT: vmovdqa64 %zmm18, 512(%rax)
-; AVX512DQ-NEXT: vmovdqa64 %zmm11, 384(%rax)
-; AVX512DQ-NEXT: vmovdqa64 %zmm19, 768(%rax)
-; AVX512DQ-NEXT: vmovdqa64 %zmm30, 832(%rax)
-; AVX512DQ-NEXT: addq $2840, %rsp # imm = 0xB18
+; AVX512DQ-NEXT: vmovdqa64 %zmm30, 256(%rax)
+; AVX512DQ-NEXT: vmovdqa64 %zmm8, 192(%rax)
+; AVX512DQ-NEXT: vmovdqa64 %zmm10, 128(%rax)
+; AVX512DQ-NEXT: vmovdqa64 %zmm5, 64(%rax)
+; AVX512DQ-NEXT: vmovdqa64 %zmm31, (%rax)
+; AVX512DQ-NEXT: vmovdqa64 %zmm20, 448(%rax)
+; AVX512DQ-NEXT: vmovdqa64 %zmm21, 704(%rax)
+; AVX512DQ-NEXT: vmovdqa64 %zmm14, 640(%rax)
+; AVX512DQ-NEXT: vmovdqa64 %zmm2, 576(%rax)
+; AVX512DQ-NEXT: vmovdqa64 %zmm17, 512(%rax)
+; AVX512DQ-NEXT: vmovdqa64 %zmm7, 384(%rax)
+; AVX512DQ-NEXT: vmovdqa64 %zmm3, 768(%rax)
+; AVX512DQ-NEXT: vmovdqa64 %zmm28, 832(%rax)
+; AVX512DQ-NEXT: addq $2648, %rsp # imm = 0xA58
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: store_i16_stride7_vf64:
; AVX512DQ-FCP: # %bb.0:
-; AVX512DQ-FCP-NEXT: subq $1576, %rsp # imm = 0x628
-; AVX512DQ-FCP-NEXT: vmovdqa 96(%rcx), %ymm4
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdx), %ymm5
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm5, %ymm1
-; AVX512DQ-FCP-NEXT: vporq %ymm0, %ymm1, %ymm16
+; AVX512DQ-FCP-NEXT: subq $1288, %rsp # imm = 0x508
+; AVX512DQ-FCP-NEXT: vmovdqa 96(%rcx), %ymm13
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm13, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdx), %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm2
+; AVX512DQ-FCP-NEXT: vporq %ymm0, %ymm2, %ymm16
; AVX512DQ-FCP-NEXT: vmovdqa 96(%rsi), %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm3
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm3
+; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm0
; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm1
-; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm7
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
-; AVX512DQ-FCP-NEXT: vmovdqa 64(%r9), %ymm1
-; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm25
-; AVX512DQ-FCP-NEXT: vmovdqa 64(%r8), %ymm13
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm13, %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm13, %ymm26
-; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa 64(%rcx), %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm14
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm28
-; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm15
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm27
-; AVX512DQ-FCP-NEXT: vpor %ymm14, %ymm15, %ymm14
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm14
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm23
-; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm15
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm29
-; AVX512DQ-FCP-NEXT: vpor %ymm14, %ymm15, %ymm14
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm14
-; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm15
-; AVX512DQ-FCP-NEXT: vpor %ymm14, %ymm15, %ymm14
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm14
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm15
-; AVX512DQ-FCP-NEXT: vpor %ymm14, %ymm15, %ymm14
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm14
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm15
-; AVX512DQ-FCP-NEXT: vpor %ymm14, %ymm15, %ymm14
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %ymm15
-; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm15, %ymm8
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm13
-; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm13, %ymm9
-; AVX512DQ-FCP-NEXT: vpor %ymm8, %ymm9, %ymm8
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm8
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm1, %ymm9
-; AVX512DQ-FCP-NEXT: vpor %ymm8, %ymm9, %ymm8
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm8
-; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm8, %ymm10
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %ymm9
-; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm9, %ymm11
-; AVX512DQ-FCP-NEXT: vporq %ymm11, %ymm10, %ymm20
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm4[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm5[0,1,1,3,4,5,5,7]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0,1],ymm10[2],ymm12[3,4],ymm10[5],ymm12[6,7,8,9],ymm10[10],ymm12[11,12],ymm10[13],ymm12[14,15]
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,2]
-; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
-; AVX512DQ-FCP-NEXT: # ymm14 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm12
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm14, %ymm17
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm3[1,1,1,1,5,5,5,5]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm14[2],ymm12[3,4],ymm14[5],ymm12[6,7,8,9],ymm14[10],ymm12[11,12],ymm14[13],ymm12[14,15]
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,3]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm16, %zmm10
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm7, %zmm12
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm10 ^ (mem & (zmm12 ^ zmm10))
-; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm14 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT: vmovdqa 96(%r8), %ymm7
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm7[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[16,17,u,u,u,u],zero,zero
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm10 | (ymm12 & ymm14)
-; AVX512DQ-FCP-NEXT: vmovdqa 96(%r9), %ymm11
-; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm11, %ymm6
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 | (ymm10 & ymm16)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm21
-; AVX512DQ-FCP-NEXT: vextracti64x4 $1, %zmm12, %ymm10
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [0,4,0,0,0,5,0,0]
-; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm12, %ymm12
-; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm12 ^ (ymm16 & (ymm12 ^ ymm10))
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm18
-; AVX512DQ-FCP-NEXT: vprold $16, %ymm11, %ymm10
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,2]
-; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm10 = ymm10 ^ (ymm16 & (ymm10 ^ ymm12))
-; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm10[0,1,2,3]
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,1,4,5,4,5,5,7]
-; AVX512DQ-FCP-NEXT: vmovdqa 96(%rax), %ymm6
-; AVX512DQ-FCP-NEXT: vpermd %ymm6, %ymm10, %ymm10
-; AVX512DQ-FCP-NEXT: vpandn %ymm10, %ymm14, %ymm10
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm16
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm6, %ymm12
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm12, %zmm10
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm2, %ymm10
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm3[3,3,3,3,7,7,7,7]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0,1,2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7,8,9,10],ymm10[11],ymm12[12,13],ymm10[14],ymm12[15]
-; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm12 = [151522058,0,421010202,421010202]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm12, %ymm24
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,2,2,6,6,6,6]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15]
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,2,2,3,10,9,11,11]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm3, %zmm2
-; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm12 = [218894094,0,488382238,488382238]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm4, %ymm10
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm12, %ymm22
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm5[3,3,3,3,7,7,7,7]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0],ymm10[1],ymm12[2,3],ymm10[4],ymm12[5,6,7,8],ymm10[9],ymm12[10,11],ymm10[12],ymm12[13,14,15]
-; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
-; AVX512DQ-FCP-NEXT: # ymm12 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm4, %ymm4
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,2,2,2,6,6,6,6]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7,8,9],ymm5[10],ymm4[11,12],ymm5[13],ymm4[14,15]
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm30 = [2,2,2,3,8,10,10,11]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm30, %zmm4
-; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm2 ^ (zmm19 & (zmm4 ^ zmm2))
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm4
+; AVX512DQ-FCP-NEXT: vporq %ymm3, %ymm4, %ymm17
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128]
+; AVX512DQ-FCP-NEXT: vmovdqa 64(%r9), %ymm3
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm3
+; AVX512DQ-FCP-NEXT: vmovdqa 64(%r8), %ymm4
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm4
+; AVX512DQ-FCP-NEXT: vpor %ymm3, %ymm4, %ymm3
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa 64(%rcx), %ymm3
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm3, %ymm4
+; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %ymm3
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm3, %ymm6
+; AVX512DQ-FCP-NEXT: vpor %ymm4, %ymm6, %ymm4
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %ymm3
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm14
+; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm3
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm15
+; AVX512DQ-FCP-NEXT: vpor %ymm14, %ymm15, %ymm8
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm3
+; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm14
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm21
+; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm3
+; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm3, %ymm15
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm24
+; AVX512DQ-FCP-NEXT: vpor %ymm14, %ymm15, %ymm8
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm3
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm3, %ymm14
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm3
+; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm3, %ymm15
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm30
+; AVX512DQ-FCP-NEXT: vpor %ymm14, %ymm15, %ymm8
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm3
+; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm14
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm25
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm3
+; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm15
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm18
+; AVX512DQ-FCP-NEXT: vpor %ymm14, %ymm15, %ymm8
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %ymm3
+; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm3, %ymm14
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm23
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm12
+; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm12, %ymm7
+; AVX512DQ-FCP-NEXT: vpor %ymm7, %ymm14, %ymm7
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm3
+; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm7
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm4
+; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm9
+; AVX512DQ-FCP-NEXT: vpor %ymm7, %ymm9, %ymm7
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm11
+; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm11, %ymm7
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %ymm14
+; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm14, %ymm9
+; AVX512DQ-FCP-NEXT: vporq %ymm9, %ymm7, %ymm19
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm13[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm1[0,1,1,3,4,5,5,7]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0,1],ymm7[2],ymm9[3,4],ymm7[5],ymm9[6,7,8,9],ymm7[10],ymm9[11,12],ymm7[13],ymm9[14,15]
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,2]
+; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
+; AVX512DQ-FCP-NEXT: # ymm10 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm9
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm10, %ymm29
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm0[1,1,1,1,5,5,5,5]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm15[2],ymm9[3,4],ymm15[5],ymm9[6,7,8,9],ymm15[10],ymm9[11,12],ymm15[13],ymm9[14,15]
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm16, %zmm7
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm17, %zmm15
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm7 ^ (mem & (zmm15 ^ zmm7))
+; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT: vmovdqa 96(%r8), %ymm9
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm9[u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[14,15,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[16,17,u,u,u,u],zero,zero
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm15 & ymm22)
+; AVX512DQ-FCP-NEXT: vmovdqa 96(%r9), %ymm6
+; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm6, %ymm5
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm5 = ymm5 | (ymm7 & ymm10)
+; AVX512DQ-FCP-NEXT: vextracti64x4 $1, %zmm15, %ymm7
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,4,0,0,0,5,0,0]
+; AVX512DQ-FCP-NEXT: vpermd %ymm9, %ymm15, %ymm15
+; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm15 = ymm15 ^ (ymm10 & (ymm15 ^ ymm7))
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm17
+; AVX512DQ-FCP-NEXT: vprold $16, %ymm6, %ymm7
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,2]
+; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (ymm10 & (ymm7 ^ ymm15))
+; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm7[0,1,2,3]
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm2, %ymm5
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[3,3,3,3,7,7,7,7]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1,2],ymm5[3],ymm7[4,5],ymm5[6],ymm7[7,8,9,10],ymm5[11],ymm7[12,13],ymm5[14],ymm7[15]
+; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm6 = [151522058,0,421010202,421010202]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6,7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13,14,15]
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm15 = [0,2,2,3,10,9,11,11]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm15, %zmm2
+; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} ymm5 = [218894094,0,488382238,488382238]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm13, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[3,3,3,3,7,7,7,7]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm7[0],ymm0[1],ymm7[2,3],ymm0[4],ymm7[5,6,7,8],ymm0[9],ymm7[10,11],ymm0[12],ymm7[13,14,15]
+; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27,22,23,26,27,0,0,24,25,26,27,0,0,26,27,26,27]
+; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm13, %ymm7
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,2,2,6,6,6,6]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm1[2],ymm7[3,4],ymm1[5],ymm7[6,7,8,9],ymm1[10],ymm7[11,12],ymm1[13],ymm7[14,15]
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [2,2,2,3,8,10,10,11]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm7
+; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm2 ^ (zmm20 & (zmm7 ^ zmm2))
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [5,0,0,0,6,0,0,6]
-; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm2, %ymm2
+; AVX512DQ-FCP-NEXT: vpermd %ymm9, %ymm2, %ymm2
; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [6,5,0,0,7,6,0,7,6,5,0,0,7,6,0,7]
-; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vpermd %zmm6, %zmm2, %zmm4
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
-; AVX512DQ-FCP-NEXT: vpbroadcastd 72(%rax), %ymm4
-; AVX512DQ-FCP-NEXT: vpandn %ymm4, %ymm6, %ymm4
-; AVX512DQ-FCP-NEXT: vmovdqa 64(%rax), %ymm5
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm5, %ymm5
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4
+; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
+; AVX512DQ-FCP-NEXT: vpbroadcastd 72(%rax), %ymm2
+; AVX512DQ-FCP-NEXT: vpandnq %ymm2, %ymm16, %ymm9
+; AVX512DQ-FCP-NEXT: vmovdqa 64(%rax), %ymm2
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [12,13,128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm13
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm9, %zmm9
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %ymm13
+; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm13, %ymm9
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm13, %ymm28
+; AVX512DQ-FCP-NEXT: vpbroadcastd 8(%rax), %ymm10
+; AVX512DQ-FCP-NEXT: vpandnq %ymm10, %ymm16, %ymm10
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm10, %zmm9
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm9
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm4[3,3,3,3,7,7,7,7]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7,8,9,10],ymm9[11],ymm10[12,13],ymm9[14],ymm10[15]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm10
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm4[2,2,2,2,6,6,6,6]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm13[0],ymm10[1],ymm13[2,3],ymm10[4],ymm13[5,6,7,8],ymm10[9],ymm13[10,11],ymm10[12],ymm13[13,14,15]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm15, %zmm10
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm8
+; AVX512DQ-FCP-NEXT: vmovdqu64 %ymm23, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm8, %ymm9
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm31
+; AVX512DQ-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm12[3,3,3,3,7,7,7,7]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm13[0],ymm9[1],ymm13[2,3],ymm9[4],ymm13[5,6,7,8],ymm9[9],ymm13[10,11],ymm9[12],ymm13[13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm8, %ymm13
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm12[2,2,2,2,6,6,6,6]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm13[0,1],ymm15[2],ymm13[3,4],ymm15[5],ymm13[6,7,8,9],ymm15[10],ymm13[11,12],ymm15[13],ymm13[14,15]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm3
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm10 ^ (zmm20 & (zmm3 ^ zmm10))
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm23
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm11[3,3,3,3,7,7,7,7]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7,8,9],ymm10[10],ymm9[11,12],ymm10[13],ymm9[14,15]
+; AVX512DQ-FCP-NEXT: vprold $16, %ymm14, %ymm10
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm11[1,2,2,3,5,6,6,7]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm13[0,1],ymm10[2],ymm13[3,4],ymm10[5],ymm13[6,7,8,9],ymm10[10],ymm13[11,12],ymm10[13],ymm13[14,15]
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [2,1,3,2,10,10,10,11]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm13, %zmm10
+; AVX512DQ-FCP-NEXT: vmovdqa64 (%rax), %zmm5
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [12,13,10,10,14,14,14,14]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [14,15,11,11,15,15,14,15]
+; AVX512DQ-FCP-NEXT: vpermd %zmm5, %zmm3, %zmm9
+; AVX512DQ-FCP-NEXT: vpermd %zmm5, %zmm4, %zmm15
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm9, %zmm27
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm27 = zmm27 ^ (zmm17 & (zmm27 ^ zmm10))
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm3
+; AVX512DQ-FCP-NEXT: vmovdqu64 %ymm21, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vprold $16, %ymm21, %ymm9
+; AVX512DQ-FCP-NEXT: vmovdqu64 %ymm24, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm24[1,2,2,3,5,6,6,7]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7,8,9],ymm9[10],ymm10[11,12],ymm9[13],ymm10[14,15]
+; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm9 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm15
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm24[0,0,2,1,4,4,6,5]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm15[0,1,2],ymm12[3],ymm15[4,5],ymm12[6],ymm15[7,8,9,10],ymm12[11],ymm15[12,13],ymm12[14],ymm15[15]
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [2,2,3,3,10,9,11,10]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm16, %zmm12
+; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,5,4,0,0,6,5,0,0,5,4,0,0,6,5,0]
+; AVX512DQ-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-FCP-NEXT: vpermd %ymm28, %ymm17, %ymm10
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm10, %zmm3
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (zmm22 & (zmm3 ^ zmm12))
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm4
+; AVX512DQ-FCP-NEXT: vmovdqu64 %ymm25, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm10
+; AVX512DQ-FCP-NEXT: vmovdqu64 %ymm18, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm18[2,2,2,2,6,6,6,6]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0],ymm10[1],ymm12[2,3],ymm10[4],ymm12[5,6,7,8],ymm10[9],ymm12[10,11],ymm10[12],ymm12[13,14,15]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm3
+; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm4, %ymm12
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm18[1,1,1,1,5,5,5,5]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0,1],ymm15[2],ymm12[3,4],ymm15[5],ymm12[6,7,8,9],ymm15[10],ymm12[11,12],ymm15[13],ymm12[14,15]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm12
+; AVX512DQ-FCP-NEXT: vmovdqu (%rsp), %ymm4 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm10
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm30[2,2,2,2,6,6,6,6]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm15[2],ymm10[3,4],ymm15[5],ymm10[6,7,8,9],ymm15[10],ymm10[11,12],ymm15[13],ymm10[14,15]
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm4[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm30[0,1,1,3,4,5,5,7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm29
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm8[0,1],ymm15[2],ymm8[3,4],ymm15[5],ymm8[6,7,8,9],ymm15[10],ymm8[11,12],ymm15[13],ymm8[14,15]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm13, %zmm4
+; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm12 ^ (zmm8 & (zmm4 ^ zmm12))
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpbroadcastd 8(%rax), %ymm4
-; AVX512DQ-FCP-NEXT: vpandn %ymm4, %ymm6, %ymm4
-; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %ymm5
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm5, %ymm5
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4
+; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm14, %ymm10
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[0,0,2,1,4,4,6,5]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7,8,9,10],ymm11[11],ymm10[12,13],ymm11[14],ymm10[15]
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,0,3,10,10,11,11]
+; AVX512DQ-FCP-NEXT: vpermi2q %zmm10, %zmm19, %zmm4
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpbroadcastd {{.*#+}} ymm10 = [18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm4
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm8[0,0,2,1,4,4,6,5]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8,9,10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15]
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,1,0,3,10,10,11,11]
-; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm20, %zmm5
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm4
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm1[3,3,3,3,7,7,7,7]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8,9,10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm11
-; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm5
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm1[2,2,2,2,6,6,6,6]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13,14,15]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm3, %zmm5
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm7
-; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm15, %ymm3
-; AVX512DQ-FCP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm13[3,3,3,3,7,7,7,7]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm15, %ymm4
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm13[2,2,2,2,6,6,6,6]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7,8,9],ymm6[10],ymm4[11,12],ymm6[13],ymm4[14,15]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm30, %zmm0
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm19 & (zmm0 ^ zmm5))
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm8[3,3,3,3,7,7,7,7]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
-; AVX512DQ-FCP-NEXT: vprold $16, %ymm9, %ymm4
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm8[1,2,2,3,5,6,6,7]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15]
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [2,1,3,2,10,10,10,11]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm13, %zmm4
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rax), %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm2, %zmm0
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm18 & (zmm0 ^ zmm4))
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm0, %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm19
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm29[2,2,2,2,6,6,6,6]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm15
-; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm0, %ymm3
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm18
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm29[1,1,1,1,5,5,5,5]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm30, %zmm3
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm0, %ymm2
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm27[2,2,2,2,6,6,6,6]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15]
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm27[0,1,1,3,4,5,5,7]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7,8,9],ymm4[10],ymm5[11,12],ymm4[13],ymm5[14,15]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm13, %zmm6
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm3 ^ (mem & (zmm6 ^ zmm3))
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm12
-; AVX512DQ-FCP-NEXT: vprold $16, %ymm25, %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm23
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm26[1,2,2,3,5,6,6,7]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm12, %ymm3
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm28
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm26[0,0,2,1,4,4,6,5]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5],ymm5[6],ymm3[7,8,9,10],ymm5[11],ymm3[12,13],ymm5[14],ymm3[15]
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [2,2,3,3,10,9,11,10]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm1, %zmm3
-; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [0,5,4,0,0,6,5,0,0,5,4,0,0,6,5,0]
-; AVX512DQ-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vpermd 64(%rax), %zmm20, %zmm1
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm16 & (zmm1 ^ zmm3))
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm6))
+; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm12, %ymm5
+; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm14[2,2,2,2,6,6,6,6]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0],ymm5[1],ymm10[2,3],ymm5[4],ymm10[5,6,7,8],ymm5[9],ymm10[10,11],ymm5[12],ymm10[13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm12, %ymm10
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm14[1,1,1,1,5,5,5,5]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7,8,9],ymm11[10],ymm10[11,12],ymm11[13],ymm10[14,15]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm10
+; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[2,2,2,2,6,6,6,6]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7,8,9],ymm1[10],ymm0[11,12],ymm1[13],ymm0[14,15]
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm4[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm6[0,1,1,3,4,5,5,7]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7,8,9],ymm1[10],ymm5[11,12],ymm1[13],ymm5[14,15]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm13, %zmm1
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm10 ^ (zmm8 & (zmm1 ^ zmm10))
+; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vprold $16, %ymm10, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm11[1,2,2,3,5,6,6,7]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1],ymm0[2],ymm5[3,4],ymm0[5],ymm5[6,7,8,9],ymm0[10],ymm5[11,12],ymm0[13],ymm5[14,15]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm10, %ymm5
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm11[0,0,2,1,4,4,6,5]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm8[3],ymm5[4,5],ymm8[6],ymm5[7,8,9,10],ymm8[11],ymm5[12,13],ymm8[14],ymm5[15]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm16, %zmm5
+; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rax), %zmm20
+; AVX512DQ-FCP-NEXT: vpermd %zmm20, %zmm17, %zmm21
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = zmm21 ^ (zmm22 & (zmm21 ^ zmm5))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = zmm21 ^ (mem & (zmm21 ^ zmm1))
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[3,3,3,3,7,7,7,7]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15]
+; AVX512DQ-FCP-NEXT: vmovdqa 96(%rcx), %xmm0
+; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdx), %xmm1
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm5, %xmm5
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm17
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,2,2,3,8,9,9,11]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm4, %zmm3
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm14[3,3,3,3,7,7,7,7]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8,9,10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
+; AVX512DQ-FCP-NEXT: vmovdqa 96(%rsi), %xmm5
+; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm6
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm4, %xmm4
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm12 = [2,1,3,3,8,8,9,9]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm12, %zmm9
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm3 ^ (zmm23 & (zmm9 ^ zmm3))
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm11[3,3,3,3,7,7,7,7]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm13 = [2,2,2,3,8,8,8,9]
+; AVX512DQ-FCP-NEXT: vmovdqa 96(%r9), %xmm3
+; AVX512DQ-FCP-NEXT: vmovdqa 96(%r8), %xmm4
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm8, %xmm12
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm8, %xmm16
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm15, %xmm18
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm13, %zmm11
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [6,7,3,3,7,7,6,7]
+; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm12, %ymm2
+; AVX512DQ-FCP-NEXT: vpbroadcastd 96(%rax), %ymm12
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm2, %zmm30
+; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm30 = zmm30 ^ (zmm2 & (zmm30 ^ zmm11))
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm11
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm30 = zmm30 ^ (mem & (zmm30 ^ zmm9))
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
+; AVX512DQ-FCP-NEXT: vprold $16, %xmm5, %xmm5
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,2,3]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2],xmm6[3,4],xmm5[5],xmm6[6,7]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,0,2,1,8,8,9,11]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm5
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm9 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,2]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1],xmm1[2,3],xmm0[4],xmm1[5,6],xmm0[7]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[0,2,3,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm10 = [0,0,1,1,8,8,10,9]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm10, %zmm1
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (mem & (zmm1 ^ zmm5))
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm27[3,3,3,3,7,7,7,7]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6,7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13,14,15]
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; AVX512DQ-FCP-NEXT: vmovdqa 64(%rcx), %xmm0
-; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %xmm3
-; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm9 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm1
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,2,2]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0],xmm1[1],xmm6[2,3],xmm1[4],xmm6[5,6],xmm1[7]
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm0, %xmm0
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm16 = [0,1,1,3,8,8,9,9]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm16, %zmm0
-; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm6
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %xmm7
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm7, (%rsp) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vprold $16, %xmm7, %xmm1
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[1,1,2,3]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm25 = [0,0,1,1,8,8,10,9]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm3
-; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm0 ^ (zmm1 & (zmm3 ^ zmm0))
+; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %xmm1
+; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm2
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,2,2]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6],xmm2[7]
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm17, %xmm6
+; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,1,1,3,8,8,9,9]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm24, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm1
+; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %xmm2
+; AVX512DQ-FCP-NEXT: vprold $16, %xmm2, %xmm3
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,2,3]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2],xmm4[3,4],xmm3[5],xmm4[6,7]
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm14, %xmm8
+; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm10, %zmm1
+; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm2 & (zmm1 ^ zmm0))
; AVX512DQ-FCP-NEXT: vmovdqa 64(%r9), %xmm0
-; AVX512DQ-FCP-NEXT: vmovdqa 64(%r8), %xmm6
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm0, %xmm6
+; AVX512DQ-FCP-NEXT: vmovdqa 64(%r8), %xmm3
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm31
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm0, %xmm3
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm18, %xmm4
; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm0
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm7 = [0,0,0,1,8,9,9,11]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm7, %zmm0
-; AVX512DQ-FCP-NEXT: vpbroadcastd 64(%rax), %ymm6
-; AVX512DQ-FCP-NEXT: vpbroadcastd 68(%rax), %ymm8
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm6, %zmm29
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm29 = zmm29 ^ (zmm21 & (zmm29 ^ zmm0))
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm18 = [0,0,0,1,8,9,9,11]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm18, %zmm0
+; AVX512DQ-FCP-NEXT: vpbroadcastd 64(%rax), %ymm3
+; AVX512DQ-FCP-NEXT: vpbroadcastd 68(%rax), %ymm5
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm3, %zmm17
+; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm22 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 ^ (zmm22 & (zmm17 ^ zmm0))
; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm0 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm29 = zmm29 ^ (zmm0 & (zmm29 ^ zmm3))
-; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm8
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm10
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm8, %xmm3
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm10[1,1,2,2]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2,3],xmm3[4],xmm6[5,6],xmm3[7]
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm6, %xmm6
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm16, %zmm6
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm9
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm10
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vprold $16, %xmm10, %xmm3
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[1,1,2,3]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1],xmm3[2],xmm8[3,4],xmm3[5],xmm8[6,7]
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm8, %xmm8
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm31
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm25, %zmm8
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm6 ^ (zmm1 & (zmm8 ^ zmm6))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 ^ (zmm0 & (zmm17 ^ zmm1))
+; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm5
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm12
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm5, %xmm1
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[1,1,2,2]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3],xmm1[4],xmm3[5,6],xmm1[7]
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, %xmm9
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm24, %zmm3
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm6
+; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm12
+; AVX512DQ-FCP-NEXT: vprold $16, %xmm12, %xmm1
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[1,1,2,3]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0,1],xmm1[2],xmm5[3,4],xmm1[5],xmm5[6,7]
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm12, %xmm26
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm6, %xmm25
+; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm5, %xmm5
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm5
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm3 ^ (zmm2 & (zmm5 ^ zmm3))
; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm1
-; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm3
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm1, %xmm3
+; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm2
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm19
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm1, %xmm2
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm13, %xmm10
; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm1, %xmm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm27
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm7, %zmm1
-; AVX512DQ-FCP-NEXT: vpbroadcastd (%rax), %ymm3
-; AVX512DQ-FCP-NEXT: vpbroadcastd 4(%rax), %ymm6
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm3, %zmm26
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm26 = zmm26 ^ (zmm21 & (zmm26 ^ zmm1))
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm26 = zmm26 ^ (zmm0 & (zmm26 ^ zmm8))
-; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm9, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Reload
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm22[2,2,2,2,6,6,6,6]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6,7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13,14,15]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm15, %ymm9, %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, %ymm11
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm22[1,1,1,1,5,5,5,5]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7,8,9],ymm3[10],ymm1[11,12],ymm3[13],ymm1[14,15]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm1
-; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Reload
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm21[2,2,2,2,6,6,6,6]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7,8,9],ymm3[10],ymm0[11,12],ymm3[13],ymm0[14,15]
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm15[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm21[0,1,1,3,4,5,5,7]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1],ymm0[2],ymm6[3,4],ymm0[5],ymm6[6,7,8,9],ymm0[10],ymm6[11,12],ymm0[13],ymm6[14,15]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm13, %zmm0
-; AVX512DQ-FCP-NEXT: vmovdqa 96(%rcx), %xmm10
-; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdx), %xmm13
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,2,2,3,8,9,9,11]
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm24, %zmm5
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm3
-; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm3, %ymm3
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm19[3,3,3,3,7,7,7,7]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm7[0,1,2],ymm3[3],ymm7[4,5],ymm3[6],ymm7[7,8,9,10],ymm3[11],ymm7[12,13],ymm3[14],ymm7[15]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm1 ^ (mem & (zmm0 ^ zmm1))
-; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX512DQ-FCP-NEXT: vprold $16, %ymm4, %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm16[1,2,2,3,5,6,6,7]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7,8,9],ymm1[10],ymm3[11,12],ymm1[13],ymm3[14,15]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,18,19,20,21,18,19,20,21]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm17
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm16[0,0,2,1,4,4,6,5]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8,9,10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15]
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm2 = [2,2,3,3,10,9,11,10]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm3
-; AVX512DQ-FCP-NEXT: vmovdqa 96(%rsi), %xmm4
-; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm6
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm9
-; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm1
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [2,1,3,3,8,8,9,9]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm8
-; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm30 = [65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm5 ^ (zmm30 & (zmm8 ^ zmm5))
-; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm18 = [26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
-; AVX512DQ-FCP-NEXT: # ymm18 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm2
-; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm23[3,3,3,3,7,7,7,7]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15]
-; AVX512DQ-FCP-NEXT: vpermd (%rax), %zmm20, %zmm20
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (mem & (zmm20 ^ zmm3))
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm20 = zmm20 ^ (mem & (zmm20 ^ zmm0))
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %xmm1
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm0
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm14, %xmm0, %xmm3
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm15[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm21[3,3,3,3,7,7,7,7]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm24, %zmm0
-; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm11, %ymm1
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm22[3,3,3,3,7,7,7,7]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm12
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm11
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm1, %xmm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm24
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm19, %zmm15
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm19 = [2,2,2,3,8,8,8,9]
-; AVX512DQ-FCP-NEXT: vmovdqa 96(%r9), %xmm5
-; AVX512DQ-FCP-NEXT: vmovdqa 96(%r8), %xmm3
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm21
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm1
-; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm9, %xmm14
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm19, %zmm7
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm0 ^ (zmm30 & (zmm15 ^ zmm0))
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm27 = [6,7,3,3,7,7,6,7]
-; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm27, %ymm0 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: vpbroadcastd 96(%rax), %ymm23
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm23, %zmm0, %zmm23
-; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm30 = [65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm23 = zmm23 ^ (zmm30 & (zmm23 ^ zmm7))
-; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm23 = zmm23 ^ (zmm31 & (zmm23 ^ zmm8))
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
-; AVX512DQ-FCP-NEXT: vprold $16, %xmm4, %xmm0
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[1,1,2,3]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3,4],xmm0[5],xmm4[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm4
-; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm1
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm16[3,3,3,3,7,7,7,7]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15]
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %xmm7
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %xmm4
-; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm21, %xmm8
-; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm6, %xmm8
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm6, %xmm16
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm19, %zmm1
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm22 = [0,0,2,1,8,8,9,11]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm22, %zmm0
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7]
-; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} xmm18 = [6,7,4,5,0,0,8,9,6,7,4,5,0,0,8,9]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm18, %xmm8
-; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm10, %xmm10
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[1,1,2,2]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm10[1],xmm13[2,3],xmm10[4],xmm13[5,6],xmm10[7]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm25, %zmm13
-; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm28 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm13 = zmm0 ^ (zmm28 & (zmm13 ^ zmm0))
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm14 = ymm14 ^ (ymm30 & (ymm14 ^ ymm0))
-; AVX512DQ-FCP-NEXT: vextracti64x4 $1, %zmm0, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm24, %xmm8
-; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} xmm21 = [0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm21, %xmm2
-; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm9, %xmm6
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm24 = [0,1,1,3,8,8,9,9]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm24, %zmm6
-; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm2[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14]
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[3,3,3,3]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u]
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2]
-; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm2, %xmm10
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm10[0,0,1,1]
-; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
-; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm10, %xmm10
-; AVX512DQ-FCP-NEXT: vmovdqa %xmm8, %xmm5
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm10[0,0,1,1]
-; AVX512DQ-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm27, %ymm10 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: vpbroadcastd 32(%rax), %ymm17
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm17, %zmm10, %zmm10
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm30 & (zmm10 ^ zmm1))
-; AVX512DQ-FCP-NEXT: vpbroadcastd 100(%rax), %ymm1
-; AVX512DQ-FCP-NEXT: vpbroadcastd 104(%rax), %ymm17
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm17, %zmm1, %zmm19
-; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm17 & (zmm19 ^ zmm6))
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm31 & (zmm10 ^ zmm15))
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm23
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm18, %zmm1
+; AVX512DQ-FCP-NEXT: vpbroadcastd (%rax), %ymm2
+; AVX512DQ-FCP-NEXT: vpbroadcastd 4(%rax), %ymm3
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm14
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm22 & (zmm14 ^ zmm1))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm0 & (zmm14 ^ zmm5))
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %xmm6
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm5
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm18
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm13 = ymm13 ^ (ymm18 & (ymm13 ^ ymm7))
+; AVX512DQ-FCP-NEXT: vextracti64x4 $1, %zmm7, %ymm11
+; AVX512DQ-FCP-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14]
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm2[3,3,3,3]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u]
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm2[2,2,2,2]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm29[3,3,3,3,7,7,7,7]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15]
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm1 = [0,2,2,3,8,9,9,11]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
+; AVX512DQ-FCP-NEXT: vmovdqa %xmm8, %xmm4
+; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm15, %xmm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm16, %xmm1
+; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm1, %xmm7
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm7
+; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29]
+; AVX512DQ-FCP-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # ymm1 = mem[3,3,3,3,7,7,7,7]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8,9,10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm3
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm10
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm8, %xmm8
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm22
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm0 = [2,1,3,3,8,8,9,9]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm1
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm2 ^ (mem & (zmm1 ^ zmm2))
+; AVX512DQ-FCP-NEXT: vpbroadcastd 100(%rax), %ymm2
+; AVX512DQ-FCP-NEXT: vpbroadcastd 104(%rax), %ymm8
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm2, %zmm29
+; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm29 = zmm29 ^ (zmm16 & (zmm29 ^ zmm7))
+; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,26,27,28,29,26,27,28,29,30,31,30,31]
+; AVX512DQ-FCP-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # ymm7 = mem[3,3,3,3,7,7,7,7]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7,8,9],ymm7[10],ymm2[11,12],ymm7[13],ymm2[14,15]
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %xmm8
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %xmm2
+; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm23, %xmm4
+; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm7, %xmm15
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [2,2,2,3,8,8,8,9]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm4, %zmm0
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [6,7,3,3,7,7,6,7]
+; AVX512DQ-FCP-NEXT: vpermd %ymm28, %ymm4, %ymm15
+; AVX512DQ-FCP-NEXT: vpbroadcastd 32(%rax), %ymm28
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm28, %zmm15, %zmm28
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm28 = zmm28 ^ (zmm18 & (zmm28 ^ zmm0))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm28 = zmm28 ^ (mem & (zmm28 ^ zmm1))
; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm19 = zmm19 ^ (zmm1 & (zmm19 ^ zmm13))
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
-; AVX512DQ-FCP-NEXT: vprold $16, %xmm11, %xmm11
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,1,2,3]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1],xmm11[2],xmm12[3,4],xmm11[5],xmm12[6,7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm6, %xmm6
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm22, %zmm11
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm18, %xmm6
-; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
-; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm12, %xmm6
-; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
-; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[1,1,2,2]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm13[0],xmm6[1],xmm13[2,3],xmm6[4],xmm13[5,6],xmm6[7]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,2,3,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm25, %zmm6
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm11 ^ (zmm28 & (zmm6 ^ zmm11))
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm4
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm21, %xmm5
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm16, %xmm7
-; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm7, %xmm5
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm24, %zmm5
-; AVX512DQ-FCP-NEXT: vpbroadcastd 36(%rax), %ymm4
-; AVX512DQ-FCP-NEXT: vpbroadcastd 40(%rax), %ymm7
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm4
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm17 & (zmm4 ^ zmm5))
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm1 & (zmm4 ^ zmm6))
+; AVX512DQ-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm29 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # zmm29 = zmm29 ^ (zmm1 & (zmm29 ^ mem))
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm10[4],xmm3[4],xmm10[5],xmm3[5],xmm10[6],xmm3[6],xmm10[7],xmm3[7]
+; AVX512DQ-FCP-NEXT: vprold $16, %xmm10, %xmm10
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm10[2],xmm3[3,4],xmm10[5],xmm3[6,7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,2,1,8,8,9,11]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm3
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm6[u,u,4,5,u,u,u,u,6,7,u,u,u,u,8,9]
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
+; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm0[1],xmm5[2,3],xmm0[4],xmm5[5,6],xmm0[7]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[0,2,3,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,1,1,8,8,10,9]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm4, %zmm5
+; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535,65535,0,0,65535,65535,65535,65535]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm3 ^ (zmm15 & (zmm5 ^ zmm3))
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm3
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm2
+; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm4
+; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm4, %xmm8
+; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm24, %zmm3
+; AVX512DQ-FCP-NEXT: vpbroadcastd 36(%rax), %ymm0
+; AVX512DQ-FCP-NEXT: vpbroadcastd 40(%rax), %ymm4
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm16 & (zmm0 ^ zmm3))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm1 & (zmm0 ^ zmm5))
; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} ymm0 = mem ^ (ymm1 & (ymm0 ^ mem))
-; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (ymm1 & (ymm3 ^ ymm0))
-; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 ^ (ymm5 & (ymm9 ^ ymm14))
-; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm9[0,1,2,3],zmm3[0,1,2,3]
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm0))
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm0 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm0 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # zmm0 = (zmm0 & zmm5) | mem
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm2 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm2 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # zmm2 = (zmm2 & zmm5) | mem
-; AVX512DQ-FCP-NEXT: vmovdqa (%rsp), %xmm3 # 16-byte Reload
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7]
+; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} ymm11 = mem ^ (ymm1 & (ymm11 ^ mem))
+; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm12 = ymm12 ^ (ymm3 & (ymm12 ^ ymm11))
+; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm11 = [65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 ^ (ymm11 & (ymm9 ^ ymm13))
+; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm9[0,1,2,3],zmm12[0,1,2,3]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [12,13,10,10,14,14,14,14]
+; AVX512DQ-FCP-NEXT: vpermd %zmm20, %zmm1, %zmm1
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [14,15,11,11,15,15,14,15]
+; AVX512DQ-FCP-NEXT: vpermd %zmm20, %zmm5, %zmm5
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm4))
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm13 = zmm4 ^ (mem & (zmm13 ^ zmm4))
+; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm26, %xmm5
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm25, %xmm6
+; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm6, %xmm6
+; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm6, %xmm6
+; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm5, %xmm5
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm27 = zmm27 ^ (mem & (zmm27 ^ zmm7))
; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm7[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
-; AVX512DQ-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm8 = mem[0,1,1,3,4,5,5,7]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7,8,9],ymm7[10],ymm8[11,12],ymm7[13],ymm8[14,15]
-; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
-; AVX512DQ-FCP-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm9 = mem[1,1,1,1,5,5,5,5]
-; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7,8,9],ymm9[10],ymm8[11,12],ymm9[13],ymm8[14,15]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,1,4,5,4,5,5,7]
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpermd %ymm11, %ymm9, %ymm9
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[16,17],zero,zero
-; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm13 = [65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535]
-; AVX512DQ-FCP-NEXT: vpandn %ymm9, %ymm13, %ymm9
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm11, %zmm9
-; AVX512DQ-FCP-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # xmm11 = mem[0,2,3,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1]
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,3]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,3,3,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1]
+; AVX512DQ-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0,1],ymm7[2],ymm9[3,4],ymm7[5],ymm9[6,7,8,9],ymm7[10],ymm9[11,12],ymm7[13],ymm9[14,15]
+; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,20,21,24,25,26,27,22,23,22,23]
+; AVX512DQ-FCP-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # ymm10 = mem[1,1,1,1,5,5,5,5]
+; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm10[2],ymm9[3,4],ymm10[5],ymm9[6,7,8,9],ymm10[10],ymm9[11,12],ymm10[13],ymm9[14,15]
+; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 128(%rax)
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm2, %zmm2 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm2 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # zmm2 = (zmm2 & zmm11) | mem
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm8, %zmm8 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vpshuflw $248, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # xmm10 = mem[0,2,3,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,2,1]
; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,3]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,3,3,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1]
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,3]
; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,3,2]
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3]
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (mem & (zmm14 ^ zmm13))
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm11 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm3 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm11 ^ (zmm28 & (zmm3 ^ zmm11))
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3]
+; AVX512DQ-FCP-NEXT: vpternlogq $234, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm8 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # zmm8 = (zmm8 & zmm11) | mem
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 32-byte Folded Reload
; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm6, %zmm6 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm5 ^ (zmm28 & (zmm6 ^ zmm5))
-; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm0 ^ (zmm5 & (zmm3 ^ zmm0))
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm2 ^ (zmm5 & (zmm6 ^ zmm2))
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm0
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm10 ^ (zmm15 & (zmm6 ^ zmm10))
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm4 ^ (zmm15 & (zmm5 ^ zmm4))
+; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm2 ^ (zmm4 & (zmm6 ^ zmm2))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm8 ^ (zmm4 & (zmm5 ^ zmm8))
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm2, %zmm2
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm0 ^ (mem & (zmm2 ^ zmm0))
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm0 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # zmm0 = zmm0 | (zmm1 & mem)
-; AVX512DQ-FCP-NEXT: vpternlogq $248, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm9 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # zmm9 = zmm9 | (zmm1 & mem)
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (mem & (zmm9 ^ zmm2))
-; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 320(%rax)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 256(%rax)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 192(%rax)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, 128(%rax)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, (%rax)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, 448(%rax)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 704(%rax)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, 640(%rax)
-; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vmovaps %zmm1, 576(%rax)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 384(%rax)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 64(%rax)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 512(%rax)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 832(%rax)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 768(%rax)
-; AVX512DQ-FCP-NEXT: addq $1576, %rsp # imm = 0x628
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm2, %zmm2
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [11,0,0,11,0,0,0,12]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm2 ^ (mem & (zmm4 ^ zmm2))
+; AVX512DQ-FCP-NEXT: vpermd %zmm20, %zmm7, %zmm2
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [8,9,12,13,12,13,13,15]
+; AVX512DQ-FCP-NEXT: vpermd %zmm20, %zmm8, %zmm9
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm2, %zmm2
+; AVX512DQ-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm2 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # zmm2 = zmm2 ^ (zmm3 & (zmm2 ^ mem))
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpermd %zmm9, %zmm7, %zmm7
+; AVX512DQ-FCP-NEXT: vpermd %zmm9, %zmm8, %zmm8
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7
+; AVX512DQ-FCP-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm3, %zmm7 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # zmm7 = zmm7 ^ (zmm3 & (zmm7 ^ mem))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm4))
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 320(%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 256(%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, 192(%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, (%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 448(%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, 704(%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, 640(%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 576(%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, 384(%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 64(%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 512(%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 832(%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 768(%rax)
+; AVX512DQ-FCP-NEXT: addq $1288, %rsp # imm = 0x508
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll
index 78b07e5671e5a..9c8928c40d75c 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll
@@ -1047,7 +1047,8 @@ define void @store_i32_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0],ymm12[1],ymm13[2],ymm12[3],ymm13[4],ymm12[5],ymm13[6],ymm12[7]
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm11[4,5],ymm12[6,7]
-; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm13 = mem[0],zero,mem[1],zero
+; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} xmm13 = [4,5]
+; AVX2-FCP-NEXT: vpermd %ymm2, %ymm13, %ymm13
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5,6,7]
; AVX2-FCP-NEXT: vpbroadcastd 20(%r9), %ymm13
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm13[3],ymm12[4,5,6,7]
@@ -2016,159 +2017,162 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: subq $232, %rsp
; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm12
-; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %xmm1
-; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm15
-; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm15[2],xmm12[2],xmm15[3],xmm12[3]
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm0
+; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %xmm2
+; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm3
+; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm12[2],xmm0[3],xmm12[3]
+; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm8
-; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %xmm6
-; AVX2-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[1,2,2,3]
+; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %xmm7
+; AVX2-FCP-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[1,2,2,3]
; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm9
; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %xmm10
; AVX2-FCP-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[1,2,2,3]
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7]
-; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm0
-; AVX2-FCP-NEXT: vmovdqa 32(%r8), %xmm4
-; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5,6,7]
-; AVX2-FCP-NEXT: vpbroadcastd 4(%r9), %ymm5
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[1,2,2,3]
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[1,2,2,3]
+; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
+; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm1[4,5],ymm4[6,7]
+; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm1
+; AVX2-FCP-NEXT: vmovdqa 32(%r8), %xmm5
+; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm1[0],zero,xmm1[1],zero
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT: vpbroadcastd 4(%r9), %ymm6
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[1,2,2,3]
; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[1,2,2,3]
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5],ymm5[6,7]
-; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5,6,7]
-; AVX2-FCP-NEXT: vpbroadcastd 36(%r9), %ymm5
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpbroadcastd 32(%rcx), %xmm3
-; AVX2-FCP-NEXT: vpbroadcastd 32(%rdx), %xmm5
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
-; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %ymm10
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm10[0],xmm6[0],xmm10[1],xmm6[1]
+; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,1]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5],ymm6[6,7]
+; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT: vpbroadcastd 36(%r9), %ymm6
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpbroadcastd 32(%rcx), %xmm4
+; AVX2-FCP-NEXT: vpbroadcastd 32(%rdx), %xmm6
+; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
+; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %ymm13
+; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %ymm11
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT: vpbroadcastq %xmm4, %ymm2
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
-; AVX2-FCP-NEXT: vpbroadcastd 32(%r9), %ymm2
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[0,1,2,2,4,5,6,6]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[1,1,2,3,5,5,6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
-; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm2
+; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT: vpbroadcastq %xmm5, %ymm3
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7]
+; AVX2-FCP-NEXT: vpbroadcastd 32(%r9), %ymm3
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
-; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %ymm14
-; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm2[2],ymm14[2],ymm2[3],ymm14[3],ymm2[6],ymm14[6],ymm2[7],ymm14[7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
-; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7]
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[0,1,2,2,4,5,6,6]
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm13[1,1,2,3,5,5,6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7]
+; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm3
+; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3]
+; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %ymm15
+; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm4 = ymm3[2],ymm15[2],ymm3[3],ymm15[3],ymm3[6],ymm15[6],ymm3[7],ymm15[7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5],ymm2[6,7]
+; AVX2-FCP-NEXT: vmovdqa 32(%r8), %ymm14
+; AVX2-FCP-NEXT: vpmovsxbq {{.*#+}} xmm7 = [4,5]
+; AVX2-FCP-NEXT: vpermd %ymm14, %ymm7, %ymm3
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7]
; AVX2-FCP-NEXT: vpbroadcastd 52(%r9), %ymm3
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpbroadcastd (%rcx), %xmm1
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpbroadcastd (%rcx), %xmm2
; AVX2-FCP-NEXT: vpbroadcastd (%rdx), %xmm3
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm15[0],xmm12[0],xmm15[1],xmm12[1]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7]
-; AVX2-FCP-NEXT: vpbroadcastq %xmm0, %ymm0
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
+; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1]
+; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT: vpbroadcastq %xmm1, %ymm1
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX2-FCP-NEXT: vpbroadcastd (%r9), %ymm1
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm5
-; AVX2-FCP-NEXT: vmovdqa (%rcx), %ymm1
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,1,2,2,4,5,6,6]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[1,1,2,3,5,5,6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm3[2,1,2,3]
-; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm13
-; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm4
-; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm13[2],ymm4[2],ymm13[3],ymm4[3],ymm13[6],ymm4[6],ymm13[7],ymm4[7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm3[4,5],ymm15[6,7]
-; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm12 = mem[0],zero,mem[1],zero
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3],ymm15[4,5,6,7]
-; AVX2-FCP-NEXT: vpbroadcastd 20(%r9), %ymm15
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2],ymm15[3],ymm12[4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm6, (%rsp) # 32-byte Spill
-; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm9[2],xmm8[2],xmm9[3],xmm8[3]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,3,2,3]
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm8, %ymm7, %ymm7
-; AVX2-FCP-NEXT: vmovdqa (%r8), %ymm8
-; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [2,2,3,3,2,2,3,3]
-; AVX2-FCP-NEXT: # ymm12 = mem[0,1,0,1]
-; AVX2-FCP-NEXT: vpermd %ymm8, %ymm12, %ymm9
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3,4,5],ymm9[6,7]
-; AVX2-FCP-NEXT: vmovdqa (%r9), %ymm9
-; AVX2-FCP-NEXT: vpermd %ymm9, %ymm12, %ymm15
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm15[1],ymm7[2,3,4,5,6],ymm15[7]
-; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm15 = ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[6],ymm11[6],ymm10[7],ymm11[7]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,3,2,3,6,7,6,7]
-; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[2,3],ymm15[2,3]
-; AVX2-FCP-NEXT: vmovdqa 32(%r8), %ymm15
-; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [6,5,3,3,6,5,7,7]
-; AVX2-FCP-NEXT: vpermd %ymm15, %ymm6, %ymm7
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm2[2,3,4,5],ymm7[6,7]
-; AVX2-FCP-NEXT: vmovdqa 32(%r9), %ymm7
-; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [4,6,2,3,4,6,6,7]
-; AVX2-FCP-NEXT: vpermd %ymm7, %ymm0, %ymm6
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2,3,4,5,6],ymm6[7]
-; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-FCP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload
-; AVX2-FCP-NEXT: # xmm6 = xmm2[2],mem[2],xmm2[3],mem[3]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm2, %ymm6
-; AVX2-FCP-NEXT: vpermd %ymm15, %ymm12, %ymm2
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm6[2,3,4,5],ymm2[6,7]
-; AVX2-FCP-NEXT: vpermd %ymm7, %ymm12, %ymm6
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2,3,4,5,6],ymm6[7]
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm6 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[4],ymm11[4],ymm10[5],ymm11[5]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm7 = ymm7[0],ymm14[0],ymm7[1],ymm14[1],ymm7[4],ymm14[4],ymm7[5],ymm14[5]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm15[4,5],ymm6[6,7]
-; AVX2-FCP-NEXT: vpbroadcastd 48(%r9), %ymm7
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7]
-; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm7 = ymm5[2],ymm1[2],ymm5[3],ymm1[3],ymm5[6],ymm1[6],ymm5[7],ymm1[7]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,3,2,3,6,7,6,7]
-; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm7[2,3]
+; AVX2-FCP-NEXT: vmovdqa (%rcx), %ymm2
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[0,1,2,2,4,5,6,6]
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[1,1,2,3,5,5,6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7]
+; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
+; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm10
+; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm6
+; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm10[2],ymm6[2],ymm10[3],ymm6[3],ymm10[6],ymm6[6],ymm10[7],ymm6[7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7]
+; AVX2-FCP-NEXT: vmovdqa (%r8), %ymm0
+; AVX2-FCP-NEXT: vpermd %ymm0, %ymm7, %ymm7
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm12[0,1],ymm7[2,3],ymm12[4,5,6,7]
+; AVX2-FCP-NEXT: vpbroadcastd 20(%r9), %ymm12
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm12[3],ymm7[4,5,6,7]
+; AVX2-FCP-NEXT: vmovdqu %ymm7, (%rsp) # 32-byte Spill
+; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm9[2],xmm8[2],xmm9[3],xmm8[3]
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,3,2,3]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FCP-NEXT: vinserti128 $1, %xmm7, %ymm8, %ymm7
+; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,2,3,3,2,2,3,3]
+; AVX2-FCP-NEXT: # ymm9 = mem[0,1,0,1]
+; AVX2-FCP-NEXT: vpermd %ymm0, %ymm9, %ymm8
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5],ymm8[6,7]
+; AVX2-FCP-NEXT: vmovdqa (%r9), %ymm8
+; AVX2-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm12
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm12[1],ymm7[2,3,4,5,6],ymm12[7]
+; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm12 = ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[6],ymm11[6],ymm13[7],ymm11[7]
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,3,2,3,6,7,6,7]
+; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm4[2,3],ymm12[2,3]
; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [6,5,3,3,6,5,7,7]
-; AVX2-FCP-NEXT: vpermd %ymm8, %ymm7, %ymm7
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm3[2,3,4,5],ymm7[6,7]
-; AVX2-FCP-NEXT: vpermd %ymm9, %ymm0, %ymm7
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm7[1],ymm3[2,3,4,5,6],ymm7[7]
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm5[0],ymm1[0],ymm5[1],ymm1[1],ymm5[4],ymm1[4],ymm5[5],ymm1[5]
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm13[0],ymm4[0],ymm13[1],ymm4[1],ymm13[4],ymm4[4],ymm13[5],ymm4[5]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5],ymm0[6,7]
+; AVX2-FCP-NEXT: vpermd %ymm14, %ymm7, %ymm7
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3,4,5],ymm7[6,7]
+; AVX2-FCP-NEXT: vmovdqa 32(%r9), %ymm7
+; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [4,6,2,3,4,6,6,7]
+; AVX2-FCP-NEXT: vpermd %ymm7, %ymm1, %ymm12
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm12[1],ymm4[2,3,4,5,6],ymm12[7]
+; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX2-FCP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm12 # 16-byte Folded Reload
+; AVX2-FCP-NEXT: # xmm12 = xmm4[2],mem[2],xmm4[3],mem[3]
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,3,2,3]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FCP-NEXT: vinserti128 $1, %xmm12, %ymm4, %ymm12
+; AVX2-FCP-NEXT: vpermd %ymm14, %ymm9, %ymm4
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm12[2,3,4,5],ymm4[6,7]
+; AVX2-FCP-NEXT: vpermd %ymm7, %ymm9, %ymm7
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm7[1],ymm4[2,3,4,5,6],ymm7[7]
+; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm7 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[4],ymm11[4],ymm13[5],ymm11[5]
+; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm9[0],ymm15[0],ymm9[1],ymm15[1],ymm9[4],ymm15[4],ymm9[5],ymm15[5]
+; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,2]
+; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3],ymm9[4,5,6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm14[4,5],ymm7[6,7]
+; AVX2-FCP-NEXT: vpbroadcastd 48(%r9), %ymm9
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm9[5],ymm7[6,7]
+; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm9 = ymm5[2],ymm2[2],ymm5[3],ymm2[3],ymm5[6],ymm2[6],ymm5[7],ymm2[7]
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7]
+; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm9[2,3]
+; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [6,5,3,3,6,5,7,7]
+; AVX2-FCP-NEXT: vpermd %ymm0, %ymm9, %ymm9
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1],ymm3[2,3,4,5],ymm9[6,7]
+; AVX2-FCP-NEXT: vpermd %ymm8, %ymm1, %ymm8
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm8[1],ymm3[2,3,4,5,6],ymm8[7]
+; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[4],ymm2[4],ymm5[5],ymm2[5]
+; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm2 = ymm10[0],ymm6[0],ymm10[1],ymm6[1],ymm10[4],ymm6[4],ymm10[5],ymm6[5]
+; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
+; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vpbroadcastd 16(%r9), %ymm1
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FCP-NEXT: vmovdqa %ymm0, 96(%rax)
; AVX2-FCP-NEXT: vmovdqa %ymm3, 160(%rax)
-; AVX2-FCP-NEXT: vmovdqa %ymm6, 288(%rax)
-; AVX2-FCP-NEXT: vmovdqa %ymm2, 256(%rax)
+; AVX2-FCP-NEXT: vmovdqa %ymm7, 288(%rax)
+; AVX2-FCP-NEXT: vmovdqa %ymm4, 256(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 352(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -4389,338 +4393,344 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
;
; AVX2-FCP-LABEL: store_i32_stride6_vf32:
; AVX2-FCP: # %bb.0:
-; AVX2-FCP-NEXT: subq $872, %rsp # imm = 0x368
-; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm2
-; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %xmm4
-; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm7
+; AVX2-FCP-NEXT: subq $936, %rsp # imm = 0x3A8
+; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm3
+; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %xmm9
+; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm8
; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm1
-; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm7[2],xmm2[2],xmm7[3],xmm2[3]
+; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm8[2],xmm3[2],xmm8[3],xmm3[3]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm3
-; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %xmm9
-; AVX2-FCP-NEXT: vmovdqa %xmm9, (%rsp) # 16-byte Spill
+; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm2
+; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %xmm7
+; AVX2-FCP-NEXT: vmovdqa %xmm7, (%rsp) # 16-byte Spill
; AVX2-FCP-NEXT: vmovdqa 64(%rcx), %xmm6
; AVX2-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3]
-; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm5
-; AVX2-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,2,2,3]
+; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm4
+; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %xmm10
; AVX2-FCP-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,2,2,3]
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7]
-; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm8
-; AVX2-FCP-NEXT: vmovdqa 32(%r8), %xmm13
-; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm8[0],zero,xmm8[1],zero
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT: vpbroadcastd 4(%r9), %ymm3
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,2,3]
+; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7]
+; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm4
+; AVX2-FCP-NEXT: vmovdqa 32(%r8), %xmm2
+; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT: vpbroadcastd 4(%r9), %ymm5
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[1,2,2,3]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[1,2,2,3]
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm3
+; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm1[2],xmm9[2],xmm1[3],xmm9[3]
+; AVX2-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[1,2,2,3]
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm10[1,2,2,3]
+; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
+; AVX2-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm5
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7]
-; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm13[0],zero,xmm13[1],zero
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT: vpbroadcastd 36(%r9), %ymm3
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7]
+; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT: vpbroadcastd 36(%r9), %ymm5
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa 64(%rdx), %xmm3
-; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FCP-NEXT: vmovdqa 64(%rdx), %xmm5
+; AVX2-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[1,2,2,3]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3]
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,2,2,3]
+; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1]
; AVX2-FCP-NEXT: vmovdqa 64(%rsi), %xmm6
-; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm9
-; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm9[2],xmm6[2],xmm9[3],xmm6[3]
-; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7]
-; AVX2-FCP-NEXT: vmovdqa 64(%r8), %xmm14
-; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm14[0],zero,xmm14[1],zero
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT: vpbroadcastd 68(%r9), %ymm3
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm7
+; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm7[2],xmm6[2],xmm7[3],xmm6[3]
+; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7]
+; AVX2-FCP-NEXT: vmovdqa 64(%r8), %xmm13
+; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm13[0],zero,xmm13[1],zero
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT: vpbroadcastd 68(%r9), %ymm5
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa 96(%rcx), %xmm15
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[1,2,2,3]
-; AVX2-FCP-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT: vmovdqa 96(%rdx), %xmm3
-; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3]
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; AVX2-FCP-NEXT: vmovdqa 96(%rcx), %xmm0
+; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,3]
+; AVX2-FCP-NEXT: vmovdqa 96(%rdx), %xmm5
+; AVX2-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,2,2,3]
+; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm0[0,1,2,1]
-; AVX2-FCP-NEXT: vmovdqa 96(%rsi), %xmm3
-; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm0
-; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm11
+; AVX2-FCP-NEXT: vmovdqa 96(%rsi), %xmm15
+; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm5
+; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm5[2],xmm15[2],xmm5[3],xmm15[3]
+; AVX2-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5],ymm10[6,7]
-; AVX2-FCP-NEXT: vmovdqa 96(%r8), %xmm11
-; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm11[0],zero,xmm11[1],zero
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3],ymm10[4,5,6,7]
-; AVX2-FCP-NEXT: vpbroadcastd 100(%r9), %ymm12
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2],ymm12[3],ymm10[4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpbroadcastd (%rcx), %xmm10
-; AVX2-FCP-NEXT: vpbroadcastd (%rdx), %xmm12
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1]
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm10[2,3],ymm2[4,5,6,7]
-; AVX2-FCP-NEXT: vpbroadcastq %xmm8, %ymm7
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5],ymm2[6,7]
-; AVX2-FCP-NEXT: vpbroadcastd (%r9), %ymm7
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5],ymm2[6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm7
-; AVX2-FCP-NEXT: vmovdqa (%rcx), %ymm5
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm5[0,1,2,2,4,5,6,6]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm7[1,1,2,3,5,5,6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0],ymm8[1],ymm10[2],ymm8[3],ymm10[4],ymm8[5],ymm10[6],ymm8[7]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,2,3]
-; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm2
-; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm10
+; AVX2-FCP-NEXT: vmovdqa 96(%r8), %xmm14
+; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm14[0],zero,xmm14[1],zero
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3],ymm10[4,5,6,7]
+; AVX2-FCP-NEXT: vpbroadcastd 100(%r9), %ymm11
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2],ymm11[3],ymm10[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm2[2],ymm10[2],ymm2[3],ymm10[3],ymm2[6],ymm10[6],ymm2[7],ymm10[7]
-; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm2[4,5],ymm8[6,7]
-; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm12 = mem[0],zero,mem[1],zero
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3],ymm8[4,5,6,7]
-; AVX2-FCP-NEXT: vpbroadcastd 20(%r9), %ymm12
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm12[3],ymm8[4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpbroadcastd 32(%rcx), %xmm8
-; AVX2-FCP-NEXT: vpbroadcastd 32(%rdx), %xmm12
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm12[0],xmm8[0],xmm12[1],xmm8[1]
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; AVX2-FCP-NEXT: vpbroadcastd (%rcx), %xmm10
+; AVX2-FCP-NEXT: vpbroadcastd (%rdx), %xmm11
+; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
+; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1]
+; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm10[2,3],ymm3[4,5,6,7]
+; AVX2-FCP-NEXT: vpbroadcastq %xmm4, %ymm4
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7]
+; AVX2-FCP-NEXT: vpbroadcastd (%r9), %ymm4
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7]
+; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovaps (%rdx), %ymm12
+; AVX2-FCP-NEXT: vmovaps (%rcx), %ymm11
+; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm3 = ymm11[0,1,2,2,4,5,6,6]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm4 = ymm12[1,1,2,3,5,5,6,7]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7]
+; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,1,2,3]
+; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm4
+; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovaps (%rsi), %ymm8
+; AVX2-FCP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm4 = ymm4[2],ymm8[2],ymm4[3],ymm8[3],ymm4[6],ymm8[6],ymm4[7],ymm8[7]
+; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7]
+; AVX2-FCP-NEXT: vmovaps (%r8), %ymm4
+; AVX2-FCP-NEXT: vmovaps {{.*#+}} xmm0 = [4,u,5,u]
+; AVX2-FCP-NEXT: vpermps %ymm4, %ymm0, %ymm8
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm8[2,3],ymm3[4,5,6,7]
+; AVX2-FCP-NEXT: vbroadcastss 20(%r9), %ymm8
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm8[3],ymm3[4,5,6,7]
+; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpbroadcastd 32(%rcx), %xmm3
+; AVX2-FCP-NEXT: vpbroadcastd 32(%rdx), %xmm8
+; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1]
+; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1]
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm8[2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT: vpbroadcastq %xmm13, %ymm4
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7]
-; AVX2-FCP-NEXT: vpbroadcastd 32(%r9), %ymm4
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %ymm10
-; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %ymm8
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[0,1,2,2,4,5,6,6]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm10[1,1,2,3,5,5,6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2],ymm1[3],ymm4[4],ymm1[5],ymm4[6],ymm1[7]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
-; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm2
-; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %ymm4
-; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7]
-; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7]
+; AVX2-FCP-NEXT: vpbroadcastq %xmm2, %ymm2
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
-; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT: vpbroadcastd 52(%r9), %ymm4
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3],ymm1[4,5,6,7]
+; AVX2-FCP-NEXT: vpbroadcastd 32(%r9), %ymm2
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovaps 32(%rdx), %ymm10
+; AVX2-FCP-NEXT: vmovaps 32(%rcx), %ymm9
+; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,1,2,2,4,5,6,6]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm10[1,1,2,3,5,5,6,7]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
+; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3]
+; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm2
+; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovaps 32(%rsi), %ymm3
+; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7]
+; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
+; AVX2-FCP-NEXT: vmovaps 32(%r8), %ymm3
+; AVX2-FCP-NEXT: vpermps %ymm3, %ymm0, %ymm2
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7]
+; AVX2-FCP-NEXT: vbroadcastss 52(%r9), %ymm2
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6,7]
+; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpbroadcastd 64(%rcx), %xmm1
-; AVX2-FCP-NEXT: vpbroadcastd 64(%rdx), %xmm4
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm9[0],xmm6[0],xmm9[1],xmm6[1]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3],ymm4[4,5,6,7]
-; AVX2-FCP-NEXT: vpbroadcastq %xmm14, %ymm4
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7]
-; AVX2-FCP-NEXT: vpbroadcastd 64(%r9), %ymm4
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa 64(%rdx), %ymm6
-; AVX2-FCP-NEXT: vmovdqa 64(%rcx), %ymm4
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm4[0,1,2,2,4,5,6,6]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm6[1,1,2,3,5,5,6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0],ymm1[1],ymm9[2],ymm1[3],ymm9[4],ymm1[5],ymm9[6],ymm1[7]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
-; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm2
-; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa 64(%rsi), %ymm12
-; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm2[2],ymm12[2],ymm2[3],ymm12[3],ymm2[6],ymm12[6],ymm2[7],ymm12[7]
-; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpbroadcastd 64(%rdx), %xmm2
+; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
+; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,1]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT: vpbroadcastq %xmm13, %ymm2
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
-; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm9 = mem[0],zero,mem[1],zero
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm9[2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT: vpbroadcastd 84(%r9), %ymm9
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm9[3],ymm1[4,5,6,7]
+; AVX2-FCP-NEXT: vpbroadcastd 64(%r9), %ymm2
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpbroadcastd %xmm15, %xmm1
-; AVX2-FCP-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm1[0],xmm9[1],xmm1[1]
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; AVX2-FCP-NEXT: vmovaps 64(%rdx), %ymm8
+; AVX2-FCP-NEXT: vmovaps 64(%rcx), %ymm7
+; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,1,2,2,4,5,6,6]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm8[1,1,2,3,5,5,6,7]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
+; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3]
+; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm2
+; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovaps 64(%rsi), %ymm6
+; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm6[2],ymm2[3],ymm6[3],ymm2[6],ymm6[6],ymm2[7],ymm6[7]
+; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
+; AVX2-FCP-NEXT: vmovaps 64(%r8), %ymm2
+; AVX2-FCP-NEXT: vpermps %ymm2, %ymm0, %ymm6
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm6[2,3],ymm1[4,5,6,7]
+; AVX2-FCP-NEXT: vbroadcastss 84(%r9), %ymm6
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm6[3],ymm1[4,5,6,7]
+; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; AVX2-FCP-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
+; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1]
+; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm15[0],xmm5[1],xmm15[1]
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,2,1]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT: vpbroadcastq %xmm11, %ymm1
+; AVX2-FCP-NEXT: vpbroadcastq %xmm14, %ymm1
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
; AVX2-FCP-NEXT: vpbroadcastd 96(%r9), %ymm1
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa 96(%rdx), %ymm3
-; AVX2-FCP-NEXT: vmovdqa 96(%rcx), %ymm2
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[0,1,2,2,4,5,6,6]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm3[1,1,2,3,5,5,6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0],ymm0[1],ymm9[2],ymm0[3],ymm9[4],ymm0[5],ymm9[6],ymm0[7]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
-; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm1
-; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa 96(%rsi), %ymm9
-; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm9[2],ymm1[3],ymm9[3],ymm1[6],ymm9[6],ymm1[7],ymm9[7]
-; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm13 = mem[0],zero,mem[1],zero
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2,3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT: vpbroadcastd 116(%r9), %ymm13
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm13[3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-FCP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX2-FCP-NEXT: vmovaps 96(%rdx), %ymm6
+; AVX2-FCP-NEXT: vmovaps 96(%rcx), %ymm5
+; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm5[0,1,2,2,4,5,6,6]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm6[1,1,2,3,5,5,6,7]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
+; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,2,3]
+; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm1
+; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovaps 96(%rsi), %ymm13
+; AVX2-FCP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm13[2],ymm1[3],ymm13[3],ymm1[6],ymm13[6],ymm1[7],ymm13[7]
+; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-FCP-NEXT: vmovaps 96(%r8), %ymm1
+; AVX2-FCP-NEXT: vmovaps {{.*#+}} xmm13 = [4,u,5,u]
+; AVX2-FCP-NEXT: vpermps %ymm1, %ymm13, %ymm15
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT: vbroadcastss 116(%r9), %ymm15
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm13
-; AVX2-FCP-NEXT: vmovdqa (%r8), %ymm0
-; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,2,3,3,2,2,3,3]
-; AVX2-FCP-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX2-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm14
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3,4,5],ymm14[6,7]
-; AVX2-FCP-NEXT: vmovdqa (%r9), %ymm14
-; AVX2-FCP-NEXT: vpermd %ymm14, %ymm1, %ymm11
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0],ymm11[1],ymm13[2,3,4,5,6],ymm11[7]
-; AVX2-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-FCP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm11 = ymm11[0],mem[0],ymm11[1],mem[1],ymm11[4],mem[4],ymm11[5],mem[5]
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm13 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[4],ymm5[4],ymm7[5],ymm5[5]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,2]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1],ymm13[2,3],ymm11[4,5,6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm0[4,5],ymm11[6,7]
-; AVX2-FCP-NEXT: vpbroadcastd 16(%r9), %ymm13
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm13[5],ymm11[6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[6],ymm5[6],ymm7[7],ymm5[7]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7]
-; AVX2-FCP-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm5 = mem[2,3],ymm5[2,3]
-; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [6,5,3,3,6,5,7,7]
-; AVX2-FCP-NEXT: vpermd %ymm0, %ymm7, %ymm0
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3,4,5],ymm0[6,7]
-; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [4,6,2,3,4,6,6,7]
-; AVX2-FCP-NEXT: vpermd %ymm14, %ymm15, %ymm5
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4,5,6],ymm5[7]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX2-FCP-NEXT: vpunpckhdq (%rsp), %xmm5, %xmm5 # 16-byte Folded Reload
-; AVX2-FCP-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm11, %ymm5
-; AVX2-FCP-NEXT: vmovdqa 32(%r8), %ymm11
-; AVX2-FCP-NEXT: vpermd %ymm11, %ymm1, %ymm13
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1],ymm5[2,3,4,5],ymm13[6,7]
-; AVX2-FCP-NEXT: vmovdqa 32(%r9), %ymm14
-; AVX2-FCP-NEXT: vpermd %ymm14, %ymm1, %ymm13
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm13[1],ymm5[2,3,4,5,6],ymm13[7]
-; AVX2-FCP-NEXT: vmovdqu %ymm5, (%rsp) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm5 = ymm5[0],mem[0],ymm5[1],mem[1],ymm5[4],mem[4],ymm5[5],mem[5]
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm13 = ymm10[0],ymm8[0],ymm10[1],ymm8[1],ymm10[4],ymm8[4],ymm10[5],ymm8[5]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,2]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,2,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3],ymm5[4,5,6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm11[4,5],ymm5[6,7]
-; AVX2-FCP-NEXT: vpbroadcastd 48(%r9), %ymm13
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1,2,3,4],ymm13[5],ymm5[6,7]
-; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm10[2],ymm8[2],ymm10[3],ymm8[3],ymm10[6],ymm8[6],ymm10[7],ymm8[7]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7]
-; AVX2-FCP-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm13, %ymm15
+; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [2,2,3,3,2,2,3,3]
+; AVX2-FCP-NEXT: # ymm0 = mem[0,1,0,1]
+; AVX2-FCP-NEXT: vpermps %ymm4, %ymm0, %ymm14
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3,4,5],ymm14[6,7]
+; AVX2-FCP-NEXT: vmovaps (%r9), %ymm15
+; AVX2-FCP-NEXT: vpermps %ymm15, %ymm0, %ymm13
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2,3,4,5,6],ymm13[7]
+; AVX2-FCP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX2-FCP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm13 = ymm13[0],mem[0],ymm13[1],mem[1],ymm13[4],mem[4],ymm13[5],mem[5]
+; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm14 = ymm12[0],ymm11[0],ymm12[1],ymm11[1],ymm12[4],ymm11[4],ymm12[5],ymm11[5]
+; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2]
+; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,1,2,3]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm14[2,3],ymm13[4,5,6,7]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm4[4,5],ymm13[6,7]
+; AVX2-FCP-NEXT: vbroadcastss 16(%r9), %ymm14
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5],ymm13[6,7]
+; AVX2-FCP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm11 = ymm12[2],ymm11[2],ymm12[3],ymm11[3],ymm12[6],ymm11[6],ymm12[7],ymm11[7]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,3,2,3,6,7,6,7]
+; AVX2-FCP-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm12 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm12 = mem[2,3],ymm11[2,3]
+; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm11 = [6,5,3,3,6,5,7,7]
+; AVX2-FCP-NEXT: vpermps %ymm4, %ymm11, %ymm4
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm12[2,3,4,5],ymm4[6,7]
+; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm12 = [4,6,2,3,4,6,6,7]
+; AVX2-FCP-NEXT: vpermps %ymm15, %ymm12, %ymm13
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm15 = ymm4[0],ymm13[1],ymm4[2,3,4,5,6],ymm13[7]
+; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX2-FCP-NEXT: vunpckhps (%rsp), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX2-FCP-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,3,2,3]
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX2-FCP-NEXT: vinsertf128 $1, %xmm4, %ymm13, %ymm4
+; AVX2-FCP-NEXT: vpermps %ymm3, %ymm0, %ymm13
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1],ymm4[2,3,4,5],ymm13[6,7]
+; AVX2-FCP-NEXT: vmovaps 32(%r9), %ymm13
+; AVX2-FCP-NEXT: vpermps %ymm13, %ymm0, %ymm14
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm14[1],ymm4[2,3,4,5,6],ymm14[7]
+; AVX2-FCP-NEXT: vmovups %ymm4, (%rsp) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FCP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm4 = ymm4[0],mem[0],ymm4[1],mem[1],ymm4[4],mem[4],ymm4[5],mem[5]
+; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm14 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[4],ymm9[4],ymm10[5],ymm9[5]
+; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2]
+; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,2,3]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm14[2,3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7]
+; AVX2-FCP-NEXT: vbroadcastss 48(%r9), %ymm14
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm14[5],ymm4[6,7]
+; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm9 = ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[6],ymm9[6],ymm10[7],ymm9[7]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7]
+; AVX2-FCP-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm9 = mem[2,3],ymm9[2,3]
+; AVX2-FCP-NEXT: vpermps %ymm3, %ymm11, %ymm3
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm9[2,3,4,5],ymm3[6,7]
+; AVX2-FCP-NEXT: vpermps %ymm13, %ymm12, %ymm9
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm9[1],ymm3[2,3,4,5,6],ymm9[7]
+; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
+; AVX2-FCP-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm9 = xmm9[2,3,2,3]
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX2-FCP-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9
+; AVX2-FCP-NEXT: vpermps %ymm2, %ymm0, %ymm10
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3,4,5],ymm10[6,7]
+; AVX2-FCP-NEXT: vmovaps 64(%r9), %ymm13
+; AVX2-FCP-NEXT: vpermps %ymm13, %ymm0, %ymm10
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3,4,5,6],ymm10[7]
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX2-FCP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm10 = ymm10[0],mem[0],ymm10[1],mem[1],ymm10[4],mem[4],ymm10[5],mem[5]
+; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm14 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[4],ymm7[4],ymm8[5],ymm7[5]
+; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2]
+; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,1,2,3]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm14[2,3],ymm10[4,5,6,7]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm2[4,5],ymm10[6,7]
+; AVX2-FCP-NEXT: vbroadcastss 80(%r9), %ymm14
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm14[5],ymm10[6,7]
+; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm7 = ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[6],ymm7[6],ymm8[7],ymm7[7]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,3,2,3,6,7,6,7]
+; AVX2-FCP-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm7 = mem[2,3],ymm7[2,3]
+; AVX2-FCP-NEXT: vpermps %ymm2, %ymm11, %ymm2
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm7[2,3,4,5],ymm2[6,7]
+; AVX2-FCP-NEXT: vpermps %ymm13, %ymm12, %ymm7
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm7[1],ymm2[2,3,4,5,6],ymm7[7]
+; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload
+; AVX2-FCP-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm7 = xmm7[2,3,2,3]
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FCP-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7
+; AVX2-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm8
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5],ymm8[6,7]
+; AVX2-FCP-NEXT: vmovaps 96(%r9), %ymm8
+; AVX2-FCP-NEXT: vpermps %ymm8, %ymm0, %ymm0
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0],ymm0[1],ymm7[2,3,4,5,6],ymm0[7]
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX2-FCP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm7 = ymm7[0],mem[0],ymm7[1],mem[1],ymm7[4],mem[4],ymm7[5],mem[5]
+; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm13 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[4],ymm5[4],ymm6[5],ymm5[5]
+; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,2]
+; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,2,3]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm13[2,3],ymm7[4,5,6,7]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm1[4,5],ymm7[6,7]
+; AVX2-FCP-NEXT: vbroadcastss 112(%r9), %ymm13
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm13[5],ymm7[6,7]
+; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm5 = ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[6],ymm5[6],ymm6[7],ymm5[7]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7]
+; AVX2-FCP-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm5 = mem[2,3],ymm5[2,3]
-; AVX2-FCP-NEXT: vpermd %ymm11, %ymm7, %ymm8
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3,4,5],ymm8[6,7]
-; AVX2-FCP-NEXT: vpermd %ymm14, %ymm15, %ymm8
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm8[1],ymm5[2,3,4,5,6],ymm8[7]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX2-FCP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
-; AVX2-FCP-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,3,2,3]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm8, %ymm10, %ymm8
-; AVX2-FCP-NEXT: vmovdqa 64(%r8), %ymm10
-; AVX2-FCP-NEXT: vpermd %ymm10, %ymm1, %ymm11
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3,4,5],ymm11[6,7]
-; AVX2-FCP-NEXT: vmovdqa 64(%r9), %ymm11
-; AVX2-FCP-NEXT: vpermd %ymm11, %ymm1, %ymm14
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0],ymm14[1],ymm8[2,3,4,5,6],ymm14[7]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm12 = ymm14[0],ymm12[0],ymm14[1],ymm12[1],ymm14[4],ymm12[4],ymm14[5],ymm12[5]
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm14 = ymm6[0],ymm4[0],ymm6[1],ymm4[1],ymm6[4],ymm4[4],ymm6[5],ymm4[5]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,2,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm14[2,3],ymm12[4,5,6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm10[4,5],ymm12[6,7]
-; AVX2-FCP-NEXT: vpbroadcastd 80(%r9), %ymm14
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm14[5],ymm12[6,7]
-; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm4 = ymm6[2],ymm4[2],ymm6[3],ymm4[3],ymm6[6],ymm4[6],ymm6[7],ymm4[7]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7]
-; AVX2-FCP-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm4 = mem[2,3],ymm4[2,3]
-; AVX2-FCP-NEXT: vpermd %ymm10, %ymm7, %ymm6
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3,4,5],ymm6[6,7]
-; AVX2-FCP-NEXT: vpermd %ymm11, %ymm15, %ymm6
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3,4,5,6],ymm6[7]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX2-FCP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
-; AVX2-FCP-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,3,2,3]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm10, %ymm6
-; AVX2-FCP-NEXT: vmovdqa 96(%r8), %ymm10
-; AVX2-FCP-NEXT: vpermd %ymm10, %ymm1, %ymm11
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1],ymm6[2,3,4,5],ymm11[6,7]
-; AVX2-FCP-NEXT: vmovdqa 96(%r9), %ymm11
-; AVX2-FCP-NEXT: vpermd %ymm11, %ymm1, %ymm1
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0],ymm1[1],ymm6[2,3,4,5,6],ymm1[7]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm6 = ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[4],ymm9[4],ymm6[5],ymm9[5]
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm9 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,2]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm9[2,3],ymm6[4,5,6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm10[4,5],ymm6[6,7]
-; AVX2-FCP-NEXT: vpbroadcastd 112(%r9), %ymm9
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm9[5],ymm6[6,7]
-; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7]
-; AVX2-FCP-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm2 = mem[2,3],ymm2[2,3]
-; AVX2-FCP-NEXT: vpermd %ymm10, %ymm7, %ymm3
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5],ymm3[6,7]
-; AVX2-FCP-NEXT: vpermd %ymm11, %ymm15, %ymm3
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6],ymm3[7]
+; AVX2-FCP-NEXT: vpermps %ymm1, %ymm11, %ymm1
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3,4,5],ymm1[6,7]
+; AVX2-FCP-NEXT: vpermps %ymm8, %ymm12, %ymm5
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3,4,5,6],ymm5[7]
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-FCP-NEXT: vmovdqa %ymm2, 736(%rax)
-; AVX2-FCP-NEXT: vmovdqa %ymm6, 672(%rax)
-; AVX2-FCP-NEXT: vmovdqa %ymm1, 640(%rax)
-; AVX2-FCP-NEXT: vmovdqa %ymm4, 544(%rax)
-; AVX2-FCP-NEXT: vmovdqa %ymm12, 480(%rax)
-; AVX2-FCP-NEXT: vmovdqa %ymm8, 448(%rax)
-; AVX2-FCP-NEXT: vmovdqa %ymm5, 352(%rax)
-; AVX2-FCP-NEXT: vmovdqa %ymm13, 288(%rax)
-; AVX2-FCP-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT: vmovaps %ymm1, 256(%rax)
-; AVX2-FCP-NEXT: vmovdqa %ymm0, 160(%rax)
+; AVX2-FCP-NEXT: vmovaps %ymm1, 736(%rax)
+; AVX2-FCP-NEXT: vmovaps %ymm7, 672(%rax)
+; AVX2-FCP-NEXT: vmovaps %ymm0, 640(%rax)
+; AVX2-FCP-NEXT: vmovaps %ymm2, 544(%rax)
+; AVX2-FCP-NEXT: vmovaps %ymm10, 480(%rax)
+; AVX2-FCP-NEXT: vmovaps %ymm9, 448(%rax)
+; AVX2-FCP-NEXT: vmovaps %ymm3, 352(%rax)
+; AVX2-FCP-NEXT: vmovaps %ymm4, 288(%rax)
+; AVX2-FCP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT: vmovaps %ymm0, 256(%rax)
+; AVX2-FCP-NEXT: vmovaps %ymm15, 160(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 96(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -4749,7 +4759,7 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FCP-NEXT: vmovaps %ymm0, 224(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%rax)
-; AVX2-FCP-NEXT: addq $872, %rsp # imm = 0x368
+; AVX2-FCP-NEXT: addq $936, %rsp # imm = 0x3A8
; AVX2-FCP-NEXT: vzeroupper
; AVX2-FCP-NEXT: retq
;
@@ -9031,33 +9041,33 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
;
; AVX2-FCP-LABEL: store_i32_stride6_vf64:
; AVX2-FCP: # %bb.0:
-; AVX2-FCP-NEXT: subq $2376, %rsp # imm = 0x948
-; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm9
+; AVX2-FCP-NEXT: subq $2600, %rsp # imm = 0xA28
+; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm7
; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %xmm1
; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm7
+; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm6
; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm10
; AVX2-FCP-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm7[2],xmm9[2],xmm7[3],xmm9[3]
+; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm2
; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %xmm6
-; AVX2-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %xmm8
+; AVX2-FCP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovdqa 64(%rcx), %xmm5
; AVX2-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,2,2,3]
; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm2
; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %xmm8
-; AVX2-FCP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %xmm9
+; AVX2-FCP-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,2,2,3]
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7]
; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm2
-; AVX2-FCP-NEXT: vmovdqa 32(%r8), %xmm11
+; AVX2-FCP-NEXT: vmovdqa 32(%r8), %xmm14
; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7]
; AVX2-FCP-NEXT: vpbroadcastd 4(%r9), %ymm4
@@ -9065,13 +9075,13 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm10[2],xmm1[2],xmm10[3],xmm1[3]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[1,2,2,3]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[1,2,2,3]
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[1,2,2,3]
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[1,2,2,3]
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm4
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7]
-; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm11[0],zero,xmm11[1],zero
+; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm14[0],zero,xmm14[1],zero
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7]
; AVX2-FCP-NEXT: vpbroadcastd 36(%r9), %ymm4
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7]
@@ -9082,657 +9092,671 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,2,2,3]
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1]
-; AVX2-FCP-NEXT: vmovdqa 64(%rsi), %xmm15
-; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm13
-; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm13[2],xmm15[2],xmm13[3],xmm15[3]
+; AVX2-FCP-NEXT: vmovdqa 64(%rsi), %xmm11
+; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm4
+; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm4[2],xmm11[2],xmm4[3],xmm11[3]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm4
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7]
-; AVX2-FCP-NEXT: vmovdqa 64(%r8), %xmm4
-; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5,6,7]
-; AVX2-FCP-NEXT: vpbroadcastd 68(%r9), %ymm5
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm5
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2,3],ymm5[4,5],ymm3[6,7]
+; AVX2-FCP-NEXT: vmovdqa 64(%r8), %xmm3
+; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm3[0],zero,xmm3[1],zero
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm10[2,3],ymm5[4,5,6,7]
+; AVX2-FCP-NEXT: vpbroadcastd 68(%r9), %ymm10
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm10[3],ymm5[4,5,6,7]
+; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa 96(%rcx), %xmm0
; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,2,2,3]
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,2,2,3]
; AVX2-FCP-NEXT: vmovdqa 96(%rdx), %xmm0
; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,2,2,3]
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1]
-; AVX2-FCP-NEXT: vmovdqa 96(%rsi), %xmm10
-; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm12
-; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm12[2],xmm10[2],xmm12[3],xmm10[3]
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[1,2,2,3]
+; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1]
+; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm5[0,1,2,1]
+; AVX2-FCP-NEXT: vmovdqa 96(%rsi), %xmm15
+; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm5
+; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm5[2],xmm15[2],xmm5[3],xmm15[3]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm5
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5],ymm3[6,7]
-; AVX2-FCP-NEXT: vmovdqa 96(%r8), %xmm8
-; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm8[0],zero,xmm8[1],zero
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3],ymm3[4,5,6,7]
-; AVX2-FCP-NEXT: vpbroadcastd 100(%r9), %ymm5
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3],ymm3[4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm12
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm10[0,1,2,3],ymm12[4,5],ymm10[6,7]
+; AVX2-FCP-NEXT: vmovdqa 96(%r8), %xmm10
+; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm10[0],zero,xmm10[1],zero
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5,6,7]
+; AVX2-FCP-NEXT: vpbroadcastd 100(%r9), %ymm13
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2],ymm13[3],ymm12[4,5,6,7]
+; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa 128(%rcx), %xmm0
; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,2,2,3]
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[1,2,2,3]
; AVX2-FCP-NEXT: vmovdqa 128(%rdx), %xmm0
; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,2,2,3]
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1]
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[1,2,2,3]
+; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
+; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,2,1]
; AVX2-FCP-NEXT: vmovdqa 128(%rsi), %xmm0
; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %xmm1
; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm5
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5],ymm3[6,7]
-; AVX2-FCP-NEXT: vmovdqa 128(%r8), %xmm5
-; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3],ymm3[4,5,6,7]
-; AVX2-FCP-NEXT: vpbroadcastd 132(%r9), %ymm6
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3],ymm3[4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm13
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm12[0,1,2,3],ymm13[4,5],ymm12[6,7]
+; AVX2-FCP-NEXT: vmovdqa 128(%r8), %xmm12
+; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm12[0],zero,xmm12[1],zero
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1],ymm9[2,3],ymm13[4,5,6,7]
+; AVX2-FCP-NEXT: vpbroadcastd 132(%r9), %ymm13
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2],ymm13[3],ymm9[4,5,6,7]
+; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa 160(%rcx), %xmm0
; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,2,2,3]
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[1,2,2,3]
; AVX2-FCP-NEXT: vmovdqa 160(%rdx), %xmm0
; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,2,2,3]
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1]
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[1,2,2,3]
+; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm13[0],xmm9[0],xmm13[1],xmm9[1]
+; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,2,1]
; AVX2-FCP-NEXT: vmovdqa 160(%rsi), %xmm0
; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %xmm1
-; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %xmm13
+; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm13[2],xmm0[2],xmm13[3],xmm0[3]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5],ymm3[6,7]
-; AVX2-FCP-NEXT: vmovdqa 160(%r8), %xmm6
-; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm6[0],zero,xmm6[1],zero
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm14[2,3],ymm3[4,5,6,7]
-; AVX2-FCP-NEXT: vpbroadcastd 164(%r9), %ymm14
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm14[3],ymm3[4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm8
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5],ymm9[6,7]
+; AVX2-FCP-NEXT: vmovdqa 160(%r8), %xmm0
+; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm0[0],zero,xmm0[1],zero
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7]
+; AVX2-FCP-NEXT: vpbroadcastd 164(%r9), %ymm9
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3],ymm8[4,5,6,7]
+; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa 192(%rcx), %xmm0
; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,2,2,3]
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[1,2,2,3]
; AVX2-FCP-NEXT: vmovdqa 192(%rdx), %xmm0
; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[1,2,2,3]
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm14[0],xmm3[0],xmm14[1],xmm3[1]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1]
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[1,2,2,3]
+; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
+; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1]
; AVX2-FCP-NEXT: vmovdqa 192(%rsi), %xmm0
; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %xmm1
; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm14
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5],ymm3[6,7]
+; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm9
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7]
; AVX2-FCP-NEXT: vmovdqa 192(%r8), %xmm0
; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm0[0],zero,xmm0[1],zero
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm14[2,3],ymm3[4,5,6,7]
-; AVX2-FCP-NEXT: vpbroadcastd 196(%r9), %ymm14
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm14[3],ymm3[4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa 224(%rcx), %xmm3
-; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3]
+; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm0[0],zero,xmm0[1],zero
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7]
+; AVX2-FCP-NEXT: vpbroadcastd 196(%r9), %ymm9
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3],ymm8[4,5,6,7]
+; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovdqa 224(%rcx), %xmm8
+; AVX2-FCP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,2,2,3]
; AVX2-FCP-NEXT: vmovdqa 224(%rdx), %xmm0
; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[1,2,2,3]
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm14[0],xmm3[0],xmm14[1],xmm3[1]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1]
+; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[1,2,2,3]
+; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
+; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1]
; AVX2-FCP-NEXT: vmovdqa 224(%rsi), %xmm0
; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %xmm1
; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm14
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5],ymm3[6,7]
+; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm9
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7]
; AVX2-FCP-NEXT: vmovdqa 224(%r8), %xmm0
; AVX2-FCP-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
-; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm0[0],zero,xmm0[1],zero
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm14[2,3],ymm3[4,5,6,7]
-; AVX2-FCP-NEXT: vpbroadcastd 228(%r9), %ymm14
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm14[3],ymm3[4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpbroadcastd (%rcx), %xmm3
-; AVX2-FCP-NEXT: vpbroadcastd (%rdx), %xmm14
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm14[0],xmm3[0],xmm14[1],xmm3[1]
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm9[0],xmm7[1],xmm9[1]
+; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm0[0],zero,xmm0[1],zero
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7]
+; AVX2-FCP-NEXT: vpbroadcastd 228(%r9), %ymm9
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3],ymm8[4,5,6,7]
+; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpbroadcastd (%rcx), %xmm8
+; AVX2-FCP-NEXT: vpbroadcastd (%rdx), %xmm9
+; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
+; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm8[2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vpbroadcastq %xmm2, %ymm0
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vpbroadcastd (%r9), %ymm1
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm14
-; AVX2-FCP-NEXT: vmovdqa (%rcx), %ymm0
-; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,2,4,5,6,6]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm14[1,1,2,3,5,5,6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
-; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm1
-; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm2
+; AVX2-FCP-NEXT: vmovaps (%rdx), %ymm1
+; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovaps (%rcx), %ymm0
+; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,2,4,5,6,6]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,3,5,5,6,7]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
+; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,2,3]
+; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm1
+; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovaps (%rsi), %ymm2
+; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7]
+; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-FCP-NEXT: vmovaps (%r8), %ymm1
+; AVX2-FCP-NEXT: vmovaps {{.*#+}} xmm0 = [4,u,5,u]
+; AVX2-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm8
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm8[2,3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT: vbroadcastss 20(%r9), %ymm8
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm8[3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpbroadcastd 32(%rcx), %xmm2
+; AVX2-FCP-NEXT: vpbroadcastd 32(%rdx), %xmm8
+; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm8[0],xmm2[0],xmm8[1],xmm2[1]
+; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX2-FCP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm8 # 16-byte Folded Reload
+; AVX2-FCP-NEXT: # xmm8 = xmm6[0],mem[0],xmm6[1],mem[1]
+; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,1]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1],ymm2[2,3],ymm8[4,5,6,7]
+; AVX2-FCP-NEXT: vpbroadcastq %xmm14, %ymm7
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5],ymm2[6,7]
+; AVX2-FCP-NEXT: vpbroadcastd 32(%r9), %ymm7
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5],ymm2[6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7]
-; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT: vpbroadcastd 20(%r9), %ymm1
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpbroadcastd 32(%rcx), %xmm0
-; AVX2-FCP-NEXT: vpbroadcastd 32(%rdx), %xmm1
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-FCP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; AVX2-FCP-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT: vpbroadcastq %xmm11, %ymm1
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-FCP-NEXT: vpbroadcastd 32(%r9), %ymm1
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %ymm1
-; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %ymm0
-; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,2,4,5,6,6]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,2,3,5,5,6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
-; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %ymm2
+; AVX2-FCP-NEXT: vmovaps 32(%rdx), %ymm6
+; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovaps 32(%rcx), %ymm2
+; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,2,4,5,6,6]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm7 = ymm6[1,1,2,3,5,5,6,7]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0],ymm2[1],ymm7[2],ymm2[3],ymm7[4],ymm2[5],ymm7[6],ymm2[7]
+; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3]
+; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm6
+; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovaps 32(%rsi), %ymm7
+; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm6 = ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[6],ymm7[6],ymm6[7],ymm7[7]
+; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5],ymm2[6,7]
+; AVX2-FCP-NEXT: vmovaps 32(%r8), %ymm9
+; AVX2-FCP-NEXT: vpermps %ymm9, %ymm0, %ymm7
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm7[2,3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT: vbroadcastss 52(%r9), %ymm7
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm7[3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpbroadcastd 64(%rcx), %xmm2
+; AVX2-FCP-NEXT: vpbroadcastd 64(%rdx), %xmm7
+; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1]
+; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1]
+; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT: vpbroadcastq %xmm3, %ymm3
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7]
+; AVX2-FCP-NEXT: vpbroadcastd 64(%r9), %ymm3
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7]
-; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT: vpbroadcastd 52(%r9), %ymm1
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpbroadcastd 64(%rcx), %xmm0
-; AVX2-FCP-NEXT: vpbroadcastd 64(%rdx), %xmm1
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm13[0],xmm15[0],xmm13[1],xmm15[1]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT: vpbroadcastq %xmm4, %ymm1
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-FCP-NEXT: vpbroadcastd 64(%r9), %ymm1
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa 64(%rdx), %ymm1
-; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa 64(%rcx), %ymm0
-; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,2,4,5,6,6]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,2,3,5,5,6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
-; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm1
-; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa 64(%rsi), %ymm2
+; AVX2-FCP-NEXT: vmovaps 64(%rdx), %ymm3
+; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovaps 64(%rcx), %ymm2
+; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,2,4,5,6,6]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,1,2,3,5,5,6,7]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7]
+; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3]
+; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm3
+; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovaps 64(%rsi), %ymm4
+; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm3 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7]
+; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7]
+; AVX2-FCP-NEXT: vmovaps 64(%r8), %ymm8
+; AVX2-FCP-NEXT: vpermps %ymm8, %ymm0, %ymm3
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT: vbroadcastss 84(%r9), %ymm3
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
+; AVX2-FCP-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
+; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm15[0],xmm5[1],xmm15[1]
+; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7]
+; AVX2-FCP-NEXT: vpbroadcastq %xmm10, %ymm3
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7]
+; AVX2-FCP-NEXT: vpbroadcastd 96(%r9), %ymm3
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7]
-; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT: vpbroadcastd 84(%r9), %ymm1
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX2-FCP-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm12[0],xmm10[0],xmm12[1],xmm10[1]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT: vpbroadcastq %xmm8, %ymm1
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-FCP-NEXT: vpbroadcastd 96(%r9), %ymm1
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa 96(%rdx), %ymm1
-; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa 96(%rcx), %ymm2
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[0,1,2,2,4,5,6,6]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[1,1,2,3,5,5,6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4],ymm0[5],ymm4[6],ymm0[7]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
-; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm1
-; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa 96(%rsi), %ymm3
-; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7]
-; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT: vpbroadcastd 116(%r9), %ymm4
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpbroadcastd 128(%rcx), %xmm0
-; AVX2-FCP-NEXT: vpbroadcastd 128(%rdx), %xmm4
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-FCP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload
-; AVX2-FCP-NEXT: # xmm4 = xmm1[0],mem[0],xmm1[1],mem[1]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7]
-; AVX2-FCP-NEXT: vpbroadcastq %xmm5, %ymm4
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7]
-; AVX2-FCP-NEXT: vpbroadcastd 128(%r9), %ymm4
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa 128(%rdx), %ymm12
-; AVX2-FCP-NEXT: vmovdqa 128(%rcx), %ymm9
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm9[0,1,2,2,4,5,6,6]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm12[1,1,2,3,5,5,6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4],ymm0[5],ymm4[6],ymm0[7]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
-; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm1
-; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa 128(%rsi), %ymm3
-; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7]
-; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT: vpbroadcastd 148(%r9), %ymm4
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; AVX2-FCP-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-FCP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload
-; AVX2-FCP-NEXT: # xmm4 = xmm1[0],mem[0],xmm1[1],mem[1]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7]
-; AVX2-FCP-NEXT: vpbroadcastq %xmm6, %ymm4
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7]
-; AVX2-FCP-NEXT: vpbroadcastd 160(%r9), %ymm4
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa 160(%rdx), %ymm10
-; AVX2-FCP-NEXT: vmovdqa 160(%rcx), %ymm7
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm7[0,1,2,2,4,5,6,6]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm10[1,1,2,3,5,5,6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4],ymm0[5],ymm4[6],ymm0[7]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
-; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm1
-; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa 160(%rsi), %ymm3
-; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7]
-; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT: vpbroadcastd 180(%r9), %ymm4
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
-; AVX2-FCP-NEXT: vpbroadcastd %xmm11, %xmm0
+; AVX2-FCP-NEXT: vmovaps 96(%rdx), %ymm3
+; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovaps 96(%rcx), %ymm2
+; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,2,4,5,6,6]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,1,2,3,5,5,6,7]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7]
+; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3]
+; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm3
+; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovaps 96(%rsi), %ymm4
+; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm3 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7]
+; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7]
+; AVX2-FCP-NEXT: vmovaps 96(%r8), %ymm7
+; AVX2-FCP-NEXT: vpermps %ymm7, %ymm0, %ymm3
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT: vbroadcastss 116(%r9), %ymm3
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpbroadcastd 128(%rcx), %xmm2
+; AVX2-FCP-NEXT: vpbroadcastd 128(%rdx), %xmm3
+; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX2-FCP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
+; AVX2-FCP-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1]
+; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7]
+; AVX2-FCP-NEXT: vpbroadcastq %xmm12, %ymm3
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7]
+; AVX2-FCP-NEXT: vpbroadcastd 128(%r9), %ymm3
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
+; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovaps 128(%rdx), %ymm3
+; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovaps 128(%rcx), %ymm2
+; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,2,4,5,6,6]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,1,2,3,5,5,6,7]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7]
+; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3]
+; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm3
+; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovaps 128(%rsi), %ymm4
+; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm3 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7]
+; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7]
+; AVX2-FCP-NEXT: vmovaps 128(%r8), %ymm3
+; AVX2-FCP-NEXT: vpermps %ymm3, %ymm0, %ymm4
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT: vbroadcastss 148(%r9), %ymm4
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vpbroadcastd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-FCP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload
-; AVX2-FCP-NEXT: # xmm4 = xmm1[0],mem[0],xmm1[1],mem[1]
+; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; AVX2-FCP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm4 # 16-byte Folded Reload
+; AVX2-FCP-NEXT: # xmm4 = xmm13[0],mem[0],xmm13[1],mem[1]
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,1]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7]
; AVX2-FCP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 16-byte Folded Reload
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7]
-; AVX2-FCP-NEXT: vpbroadcastd 192(%r9), %ymm4
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa 192(%rdx), %ymm1
-; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa 192(%rcx), %ymm0
-; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,2,2,4,5,6,6]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[1,1,2,3,5,5,6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4],ymm0[5],ymm4[6],ymm0[7]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
-; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm1
-; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa 192(%rsi), %ymm3
-; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7]
-; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT: vpbroadcastd 212(%r9), %ymm4
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vbroadcastss 224(%rcx), %xmm0
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5],ymm2[6,7]
+; AVX2-FCP-NEXT: vpbroadcastd 160(%r9), %ymm4
+; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7]
+; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovaps 160(%rdx), %ymm4
+; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovaps 160(%rcx), %ymm2
+; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,2,4,5,6,6]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,1,2,3,5,5,6,7]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4],ymm2[5],ymm4[6],ymm2[7]
+; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3]
+; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm4
+; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovaps 160(%rsi), %ymm5
+; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm4 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7]
+; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5],ymm2[6,7]
+; AVX2-FCP-NEXT: vmovaps 160(%r8), %ymm6
+; AVX2-FCP-NEXT: vpermps %ymm6, %ymm0, %ymm4
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT: vbroadcastss 180(%r9), %ymm4
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; AVX2-FCP-NEXT: vbroadcastss %xmm12, %xmm2
+; AVX2-FCP-NEXT: vbroadcastss {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
+; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX2-FCP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX2-FCP-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1]
+; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 16-byte Folded Reload
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5],ymm2[6,7]
+; AVX2-FCP-NEXT: vbroadcastss 192(%r9), %ymm4
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7]
+; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovaps 192(%rdx), %ymm4
+; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovaps 192(%rcx), %ymm2
+; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,2,4,5,6,6]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,1,2,3,5,5,6,7]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4],ymm2[5],ymm4[6],ymm2[7]
+; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3]
+; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm4
+; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovaps 192(%rsi), %ymm5
+; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm4 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7]
+; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5],ymm2[6,7]
+; AVX2-FCP-NEXT: vmovaps 192(%r8), %ymm10
+; AVX2-FCP-NEXT: vpermps %ymm10, %ymm0, %ymm4
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT: vbroadcastss 212(%r9), %ymm4
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vbroadcastss 224(%rcx), %xmm2
; AVX2-FCP-NEXT: vbroadcastss 224(%rdx), %xmm4
-; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
-; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-FCP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload
-; AVX2-FCP-NEXT: # xmm4 = xmm1[0],mem[0],xmm1[1],mem[1]
+; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX2-FCP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX2-FCP-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7]
; AVX2-FCP-NEXT: vbroadcastsd (%rsp), %ymm4 # 16-byte Folded Reload
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5],ymm2[6,7]
; AVX2-FCP-NEXT: vbroadcastss 224(%r9), %ymm4
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7]
+; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovaps 224(%rdx), %ymm4
+; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovaps 224(%rcx), %ymm11
+; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,1,2,2,4,5,6,6]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,1,2,3,5,5,6,7]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4],ymm2[5],ymm4[6],ymm2[7]
+; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3]
+; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm4
+; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovaps 224(%rsi), %ymm5
+; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm4 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7]
+; AVX2-FCP-NEXT: vmovups %ymm4, (%rsp) # 32-byte Spill
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5],ymm2[6,7]
+; AVX2-FCP-NEXT: vmovaps 224(%r8), %ymm4
+; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vpermps %ymm4, %ymm0, %ymm0
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT: vbroadcastss 244(%r9), %ymm2
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa 224(%rdx), %ymm1
-; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa 224(%rcx), %ymm8
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm8[0,1,2,2,4,5,6,6]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[1,1,2,3,5,5,6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4],ymm0[5],ymm4[6],ymm0[7]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
-; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm1
-; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa 224(%rsi), %ymm3
-; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7]
-; AVX2-FCP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-FCP-NEXT: vpmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT: vpbroadcastd 244(%r9), %ymm4
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-FCP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-FCP-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm4
-; AVX2-FCP-NEXT: vmovdqa (%r8), %ymm0
-; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,2,3,3,2,2,3,3]
-; AVX2-FCP-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX2-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm15
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1],ymm4[2,3,4,5],ymm15[6,7]
-; AVX2-FCP-NEXT: vmovdqa (%r9), %ymm15
-; AVX2-FCP-NEXT: vpermd %ymm15, %ymm1, %ymm3
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4,5,6],ymm3[7]
-; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FCP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm3 = ymm3[0],mem[0],ymm3[1],mem[1],ymm3[4],mem[4],ymm3[5],mem[5]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm4 = ymm14[0],ymm5[0],ymm14[1],ymm5[1],ymm14[4],ymm5[4],ymm14[5],ymm5[5]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7]
-; AVX2-FCP-NEXT: vpbroadcastd 16(%r9), %ymm4
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm3 = ymm14[2],ymm5[2],ymm14[3],ymm5[3],ymm14[6],ymm5[6],ymm14[7],ymm5[7]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7]
-; AVX2-FCP-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm3 = mem[2,3],ymm3[2,3]
-; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [6,5,3,3,6,5,7,7]
-; AVX2-FCP-NEXT: vpermd %ymm0, %ymm6, %ymm0
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5],ymm0[6,7]
-; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [4,6,2,3,4,6,6,7]
-; AVX2-FCP-NEXT: vpermd %ymm15, %ymm5, %ymm15
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3,4,5,6],ymm15[7]
-; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-FCP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-FCP-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0
-; AVX2-FCP-NEXT: vmovdqa 32(%r8), %ymm15
-; AVX2-FCP-NEXT: vpermd %ymm15, %ymm1, %ymm13
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3,4,5],ymm13[6,7]
-; AVX2-FCP-NEXT: vmovdqa 32(%r9), %ymm13
-; AVX2-FCP-NEXT: vpermd %ymm13, %ymm1, %ymm14
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3,4,5,6],ymm14[7]
-; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm14 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[4],ymm3[4],ymm4[5],ymm3[5]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5],ymm0[6,7]
-; AVX2-FCP-NEXT: vpbroadcastd 48(%r9), %ymm14
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
-; AVX2-FCP-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm0 = mem[2,3],ymm0[2,3]
-; AVX2-FCP-NEXT: vpermd %ymm15, %ymm6, %ymm14
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3,4,5],ymm14[6,7]
-; AVX2-FCP-NEXT: vpermd %ymm13, %ymm5, %ymm13
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4,5,6],ymm13[7]
-; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-FCP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-FCP-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0
-; AVX2-FCP-NEXT: vmovdqa 64(%r8), %ymm13
-; AVX2-FCP-NEXT: vpermd %ymm13, %ymm1, %ymm14
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3,4,5],ymm14[6,7]
-; AVX2-FCP-NEXT: vmovdqa 64(%r9), %ymm14
-; AVX2-FCP-NEXT: vpermd %ymm14, %ymm1, %ymm15
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3,4,5,6],ymm15[7]
-; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm15 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[4],ymm3[4],ymm4[5],ymm3[5]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5],ymm0[6,7]
-; AVX2-FCP-NEXT: vpbroadcastd 80(%r9), %ymm15
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5],ymm0[6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
-; AVX2-FCP-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm0 = mem[2,3],ymm0[2,3]
-; AVX2-FCP-NEXT: vpermd %ymm13, %ymm6, %ymm13
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3,4,5],ymm13[6,7]
-; AVX2-FCP-NEXT: vpermd %ymm14, %ymm5, %ymm13
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4,5,6],ymm13[7]
-; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-FCP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-FCP-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0
-; AVX2-FCP-NEXT: vmovdqa 96(%r8), %ymm13
-; AVX2-FCP-NEXT: vpermd %ymm13, %ymm1, %ymm14
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3,4,5],ymm14[6,7]
-; AVX2-FCP-NEXT: vmovdqa 96(%r9), %ymm14
-; AVX2-FCP-NEXT: vpermd %ymm14, %ymm1, %ymm15
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm15[1],ymm0[2,3,4,5,6],ymm15[7]
-; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm15 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5],ymm0[6,7]
-; AVX2-FCP-NEXT: vpbroadcastd 112(%r9), %ymm15
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5],ymm0[6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
-; AVX2-FCP-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm0 = mem[2,3],ymm0[2,3]
-; AVX2-FCP-NEXT: vpermd %ymm13, %ymm6, %ymm2
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7]
-; AVX2-FCP-NEXT: vpermd %ymm14, %ymm5, %ymm2
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6],ymm2[7]
-; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-FCP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
-; AVX2-FCP-NEXT: vmovdqa 128(%r8), %ymm2
-; AVX2-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm13
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm0[2,3,4,5],ymm13[6,7]
-; AVX2-FCP-NEXT: vmovdqa 128(%r9), %ymm3
-; AVX2-FCP-NEXT: vpermd %ymm3, %ymm1, %ymm13
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4,5,6],ymm13[7]
-; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm14 = ymm12[0],ymm9[0],ymm12[1],ymm9[1],ymm12[4],ymm9[4],ymm12[5],ymm9[5]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,2]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7]
-; AVX2-FCP-NEXT: vpbroadcastd 144(%r9), %ymm14
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7]
-; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm12[2],ymm9[2],ymm12[3],ymm9[3],ymm12[6],ymm9[6],ymm12[7],ymm9[7]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
-; AVX2-FCP-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm0 = mem[2,3],ymm0[2,3]
-; AVX2-FCP-NEXT: vpermd %ymm2, %ymm6, %ymm2
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7]
-; AVX2-FCP-NEXT: vpermd %ymm3, %ymm5, %ymm2
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6],ymm2[7]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-FCP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX2-FCP-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
-; AVX2-FCP-NEXT: vmovdqa 160(%r8), %ymm3
-; AVX2-FCP-NEXT: vpermd %ymm3, %ymm1, %ymm2
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7]
-; AVX2-FCP-NEXT: vmovdqa 160(%r9), %ymm2
-; AVX2-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm4
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm0[0],ymm4[1],ymm0[2,3,4,5,6],ymm4[7]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm4 = ymm10[0],ymm7[0],ymm10[1],ymm7[1],ymm10[4],ymm7[4],ymm10[5],ymm7[5]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,2]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7]
-; AVX2-FCP-NEXT: vpbroadcastd 176(%r9), %ymm4
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1,2,3,4],ymm4[5],ymm0[6,7]
-; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm10[2],ymm7[2],ymm10[3],ymm7[3],ymm10[6],ymm7[6],ymm10[7],ymm7[7]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
-; AVX2-FCP-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm0 = mem[2,3],ymm0[2,3]
-; AVX2-FCP-NEXT: vpermd %ymm3, %ymm6, %ymm3
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5],ymm3[6,7]
-; AVX2-FCP-NEXT: vpermd %ymm2, %ymm5, %ymm2
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6],ymm2[7]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm11[2],xmm0[3],xmm11[3]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
-; AVX2-FCP-NEXT: vmovdqa 192(%r8), %ymm2
-; AVX2-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm3
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5],ymm3[6,7]
-; AVX2-FCP-NEXT: vmovdqa 192(%r9), %ymm3
-; AVX2-FCP-NEXT: vpermd %ymm3, %ymm1, %ymm10
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0],ymm10[1],ymm0[2,3,4,5,6],ymm10[7]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm15 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[4],ymm11[4],ymm13[5],ymm11[5]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,2,2]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7]
-; AVX2-FCP-NEXT: vpbroadcastd 208(%r9), %ymm15
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4],ymm15[5],ymm0[6,7]
-; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm15 = ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[6],ymm11[6],ymm13[7],ymm11[7]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,3,2,3,6,7,6,7]
-; AVX2-FCP-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm15 = mem[2,3],ymm15[2,3]
-; AVX2-FCP-NEXT: vpermd %ymm2, %ymm6, %ymm2
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm15[2,3,4,5],ymm2[6,7]
-; AVX2-FCP-NEXT: vpermd %ymm3, %ymm5, %ymm3
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4,5,6],ymm3[7]
-; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-FCP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
-; AVX2-FCP-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm11, %ymm3
-; AVX2-FCP-NEXT: vmovdqa 224(%r8), %ymm15
-; AVX2-FCP-NEXT: vpermd %ymm15, %ymm1, %ymm13
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1],ymm3[2,3,4,5],ymm13[6,7]
-; AVX2-FCP-NEXT: vmovdqa 224(%r9), %ymm13
-; AVX2-FCP-NEXT: vpermd %ymm13, %ymm1, %ymm1
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3,4,5,6],ymm1[7]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm3 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
-; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} ymm11 = ymm0[0],ymm8[0],ymm0[1],ymm8[1],ymm0[4],ymm8[4],ymm0[5],ymm8[5]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,2,2]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm11[2,3],ymm3[4,5,6,7]
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm15[4,5],ymm3[6,7]
-; AVX2-FCP-NEXT: vpbroadcastd 240(%r9), %ymm11
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm11[5],ymm3[6,7]
-; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} ymm8 = ymm0[2],ymm8[2],ymm0[3],ymm8[3],ymm0[6],ymm8[6],ymm0[7],ymm8[7]
-; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[2,3,2,3,6,7,6,7]
-; AVX2-FCP-NEXT: vperm2i128 $19, (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm2
+; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [2,2,3,3,2,2,3,3]
+; AVX2-FCP-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX2-FCP-NEXT: vpermps %ymm1, %ymm5, %ymm15
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1],ymm2[2,3,4,5],ymm15[6,7]
+; AVX2-FCP-NEXT: vmovaps (%r9), %ymm15
+; AVX2-FCP-NEXT: vpermps %ymm15, %ymm5, %ymm14
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm14[1],ymm2[2,3,4,5,6],ymm14[7]
+; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm2 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm14 = ymm13[0],ymm4[0],ymm13[1],ymm4[1],ymm13[4],ymm4[4],ymm13[5],ymm4[5]
+; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2]
+; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm14[2,3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7]
+; AVX2-FCP-NEXT: vbroadcastss 16(%r9), %ymm14
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm14[5],ymm2[6,7]
+; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm13[2],ymm4[2],ymm13[3],ymm4[3],ymm13[6],ymm4[6],ymm13[7],ymm4[7]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7]
+; AVX2-FCP-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm14 = mem[2,3],ymm2[2,3]
+; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm4 = [6,5,3,3,6,5,7,7]
+; AVX2-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm1
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1],ymm14[2,3,4,5],ymm1[6,7]
+; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm2 = [4,6,2,3,4,6,6,7]
+; AVX2-FCP-NEXT: vpermps %ymm15, %ymm2, %ymm15
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0],ymm15[1],ymm14[2,3,4,5,6],ymm15[7]
+; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm14 # 16-byte Folded Reload
+; AVX2-FCP-NEXT: # xmm14 = xmm1[2],mem[2],xmm1[3],mem[3]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm14 = xmm14[2,3,2,3]
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT: vinsertf128 $1, %xmm14, %ymm1, %ymm14
+; AVX2-FCP-NEXT: vpermps %ymm9, %ymm5, %ymm15
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5],ymm15[6,7]
+; AVX2-FCP-NEXT: vmovaps 32(%r9), %ymm15
+; AVX2-FCP-NEXT: vpermps %ymm15, %ymm5, %ymm13
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0],ymm13[1],ymm14[2,3,4,5,6],ymm13[7]
+; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm13 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm14 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
+; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2]
+; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,1,2,3]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm14[2,3],ymm13[4,5,6,7]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm9[4,5],ymm13[6,7]
+; AVX2-FCP-NEXT: vbroadcastss 48(%r9), %ymm14
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5],ymm13[6,7]
+; AVX2-FCP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm13 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,3,2,3,6,7,6,7]
+; AVX2-FCP-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm13 = mem[2,3],ymm13[2,3]
+; AVX2-FCP-NEXT: vpermps %ymm9, %ymm4, %ymm9
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm13[2,3,4,5],ymm9[6,7]
+; AVX2-FCP-NEXT: vpermps %ymm15, %ymm2, %ymm13
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0],ymm13[1],ymm9[2,3,4,5,6],ymm13[7]
+; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm9 # 16-byte Folded Reload
+; AVX2-FCP-NEXT: # xmm9 = xmm1[2],mem[2],xmm1[3],mem[3]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm9 = xmm9[2,3,2,3]
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT: vinsertf128 $1, %xmm9, %ymm1, %ymm9
+; AVX2-FCP-NEXT: vpermps %ymm8, %ymm5, %ymm13
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0,1],ymm9[2,3,4,5],ymm13[6,7]
+; AVX2-FCP-NEXT: vmovaps 64(%r9), %ymm13
+; AVX2-FCP-NEXT: vpermps %ymm13, %ymm5, %ymm14
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0],ymm14[1],ymm9[2,3,4,5,6],ymm14[7]
+; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm9 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm14 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
+; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2]
+; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,2,3]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm14[2,3],ymm9[4,5,6,7]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm8[4,5],ymm9[6,7]
+; AVX2-FCP-NEXT: vbroadcastss 80(%r9), %ymm14
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm14[5],ymm9[6,7]
+; AVX2-FCP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm9 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7]
+; AVX2-FCP-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm9 = mem[2,3],ymm9[2,3]
+; AVX2-FCP-NEXT: vpermps %ymm8, %ymm4, %ymm8
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3,4,5],ymm8[6,7]
+; AVX2-FCP-NEXT: vpermps %ymm13, %ymm2, %ymm9
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0],ymm9[1],ymm8[2,3,4,5,6],ymm9[7]
+; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm8 # 16-byte Folded Reload
+; AVX2-FCP-NEXT: # xmm8 = xmm1[2],mem[2],xmm1[3],mem[3]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm8 = xmm8[2,3,2,3]
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT: vinsertf128 $1, %xmm8, %ymm1, %ymm8
+; AVX2-FCP-NEXT: vpermps %ymm7, %ymm5, %ymm9
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3,4,5],ymm9[6,7]
+; AVX2-FCP-NEXT: vmovaps 96(%r9), %ymm9
+; AVX2-FCP-NEXT: vpermps %ymm9, %ymm5, %ymm13
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0],ymm13[1],ymm8[2,3,4,5,6],ymm13[7]
+; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm8 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm13 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
+; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,2]
+; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,1,2,3]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm13[2,3],ymm8[4,5,6,7]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm7[4,5],ymm8[6,7]
+; AVX2-FCP-NEXT: vbroadcastss 112(%r9), %ymm13
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm13[5],ymm8[6,7]
+; AVX2-FCP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm8 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,3,2,3,6,7,6,7]
+; AVX2-FCP-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm8 = mem[2,3],ymm8[2,3]
-; AVX2-FCP-NEXT: vpermd %ymm15, %ymm6, %ymm6
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3,4,5],ymm6[6,7]
-; AVX2-FCP-NEXT: vpermd %ymm13, %ymm5, %ymm5
-; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4,5,6],ymm5[7]
+; AVX2-FCP-NEXT: vpermps %ymm7, %ymm4, %ymm7
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3,4,5],ymm7[6,7]
+; AVX2-FCP-NEXT: vpermps %ymm9, %ymm2, %ymm8
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0],ymm8[1],ymm7[2,3,4,5,6],ymm8[7]
+; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm7 # 16-byte Folded Reload
+; AVX2-FCP-NEXT: # xmm7 = xmm1[2],mem[2],xmm1[3],mem[3]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm7 = xmm7[2,3,2,3]
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm7
+; AVX2-FCP-NEXT: vpermps %ymm3, %ymm5, %ymm8
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5],ymm8[6,7]
+; AVX2-FCP-NEXT: vmovaps 128(%r9), %ymm8
+; AVX2-FCP-NEXT: vpermps %ymm8, %ymm5, %ymm9
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0],ymm9[1],ymm7[2,3,4,5,6],ymm9[7]
+; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm7 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm9 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
+; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,2,2,2]
+; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,2,3]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm9[2,3],ymm7[4,5,6,7]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm3[4,5],ymm7[6,7]
+; AVX2-FCP-NEXT: vbroadcastss 144(%r9), %ymm9
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm9[5],ymm7[6,7]
+; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm7 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,3,2,3,6,7,6,7]
+; AVX2-FCP-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm7 = mem[2,3],ymm7[2,3]
+; AVX2-FCP-NEXT: vpermps %ymm3, %ymm4, %ymm3
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm7[2,3,4,5],ymm3[6,7]
+; AVX2-FCP-NEXT: vpermps %ymm8, %ymm2, %ymm7
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm3[0],ymm7[1],ymm3[2,3,4,5,6],ymm7[7]
+; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload
+; AVX2-FCP-NEXT: # xmm3 = xmm1[2],mem[2],xmm1[3],mem[3]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,3,2,3]
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm3
+; AVX2-FCP-NEXT: vpermps %ymm6, %ymm5, %ymm7
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1],ymm3[2,3,4,5],ymm7[6,7]
+; AVX2-FCP-NEXT: vmovaps 160(%r9), %ymm13
+; AVX2-FCP-NEXT: vpermps %ymm13, %ymm5, %ymm7
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm3[0],ymm7[1],ymm3[2,3,4,5,6],ymm7[7]
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm3 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm14 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
+; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2]
+; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,1,2,3]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm14[2,3],ymm3[4,5,6,7]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5],ymm3[6,7]
+; AVX2-FCP-NEXT: vbroadcastss 176(%r9), %ymm14
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm3[0,1,2,3,4],ymm14[5],ymm3[6,7]
+; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm14 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,2,3,6,7,6,7]
+; AVX2-FCP-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm14 = mem[2,3],ymm14[2,3]
+; AVX2-FCP-NEXT: vpermps %ymm6, %ymm4, %ymm6
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm14[2,3,4,5],ymm6[6,7]
+; AVX2-FCP-NEXT: vpermps %ymm13, %ymm2, %ymm13
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm13[1],ymm6[2,3,4,5,6],ymm13[7]
+; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm13 = xmm1[2],xmm12[2],xmm1[3],xmm12[3]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm13 = xmm13[2,3,2,3]
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT: vinsertf128 $1, %xmm13, %ymm1, %ymm13
+; AVX2-FCP-NEXT: vpermps %ymm10, %ymm5, %ymm14
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3,4,5],ymm14[6,7]
+; AVX2-FCP-NEXT: vmovaps 192(%r9), %ymm1
+; AVX2-FCP-NEXT: vpermps %ymm1, %ymm5, %ymm14
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm13[0],ymm14[1],ymm13[2,3,4,5,6],ymm14[7]
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm13 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm15 = ymm12[0],ymm0[0],ymm12[1],ymm0[1],ymm12[4],ymm0[4],ymm12[5],ymm0[5]
+; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2]
+; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,1,2,3]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3],ymm13[4,5,6,7]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm10[4,5],ymm13[6,7]
+; AVX2-FCP-NEXT: vbroadcastss 208(%r9), %ymm15
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm15 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7]
+; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm13 = ymm12[2],ymm0[2],ymm12[3],ymm0[3],ymm12[6],ymm0[6],ymm12[7],ymm0[7]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,3,2,3,6,7,6,7]
+; AVX2-FCP-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm13 = mem[2,3],ymm13[2,3]
+; AVX2-FCP-NEXT: vpermps %ymm10, %ymm4, %ymm10
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm13[2,3,4,5],ymm10[6,7]
+; AVX2-FCP-NEXT: vpermps %ymm1, %ymm2, %ymm1
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0],ymm1[1],ymm10[2,3,4,5,6],ymm1[7]
+; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload
+; AVX2-FCP-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm10 = xmm10[2,3,2,3]
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX2-FCP-NEXT: vinsertf128 $1, %xmm10, %ymm12, %ymm10
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT: vpermps %ymm1, %ymm5, %ymm13
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0,1],ymm10[2,3,4,5],ymm13[6,7]
+; AVX2-FCP-NEXT: vmovaps 224(%r9), %ymm13
+; AVX2-FCP-NEXT: vpermps %ymm13, %ymm5, %ymm0
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0],ymm0[1],ymm10[2,3,4,5,6],ymm0[7]
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm10 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm10 = ymm5[0],mem[0],ymm5[1],mem[1],ymm5[4],mem[4],ymm5[5],mem[5]
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm12 = ymm5[0],ymm11[0],ymm5[1],ymm11[1],ymm5[4],ymm11[4],ymm5[5],ymm11[5]
+; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[2,2,2,2]
+; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,1,2,3]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm12[2,3],ymm10[4,5,6,7]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm1[4,5],ymm10[6,7]
+; AVX2-FCP-NEXT: vbroadcastss 240(%r9), %ymm12
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5],ymm10[6,7]
+; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm11 = ymm5[2],ymm11[2],ymm5[3],ymm11[3],ymm5[6],ymm11[6],ymm5[7],ymm11[7]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,3,2,3,6,7,6,7]
+; AVX2-FCP-NEXT: vperm2f128 $19, (%rsp), %ymm11, %ymm11 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm11 = mem[2,3],ymm11[2,3]
+; AVX2-FCP-NEXT: vpermps %ymm1, %ymm4, %ymm4
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm11[2,3,4,5],ymm4[6,7]
+; AVX2-FCP-NEXT: vpermps %ymm13, %ymm2, %ymm2
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3,4,5,6],ymm2[7]
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-FCP-NEXT: vmovdqa %ymm5, 1504(%rax)
-; AVX2-FCP-NEXT: vmovdqa %ymm3, 1440(%rax)
-; AVX2-FCP-NEXT: vmovdqa %ymm1, 1408(%rax)
-; AVX2-FCP-NEXT: vmovdqa %ymm2, 1312(%rax)
-; AVX2-FCP-NEXT: vmovdqa %ymm4, 1248(%rax)
-; AVX2-FCP-NEXT: vmovdqa %ymm10, 1216(%rax)
-; AVX2-FCP-NEXT: vmovdqa %ymm7, 1120(%rax)
-; AVX2-FCP-NEXT: vmovdqa %ymm9, 1056(%rax)
-; AVX2-FCP-NEXT: vmovdqa %ymm12, 1024(%rax)
-; AVX2-FCP-NEXT: vmovdqa %ymm14, 928(%rax)
+; AVX2-FCP-NEXT: vmovaps %ymm2, 1504(%rax)
+; AVX2-FCP-NEXT: vmovaps %ymm10, 1440(%rax)
+; AVX2-FCP-NEXT: vmovaps %ymm0, 1408(%rax)
+; AVX2-FCP-NEXT: vmovaps %ymm3, 1312(%rax)
+; AVX2-FCP-NEXT: vmovaps %ymm15, 1248(%rax)
+; AVX2-FCP-NEXT: vmovaps %ymm14, 1216(%rax)
+; AVX2-FCP-NEXT: vmovaps %ymm6, 1120(%rax)
+; AVX2-FCP-NEXT: vmovaps %ymm7, 1056(%rax)
+; AVX2-FCP-NEXT: vmovaps %ymm8, 1024(%rax)
+; AVX2-FCP-NEXT: vmovaps %ymm9, 928(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 864(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -9809,7 +9833,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FCP-NEXT: vmovaps %ymm0, 224(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%rax)
-; AVX2-FCP-NEXT: addq $2376, %rsp # imm = 0x948
+; AVX2-FCP-NEXT: addq $2600, %rsp # imm = 0xA28
; AVX2-FCP-NEXT: vzeroupper
; AVX2-FCP-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
index 62e2aadd818c1..03f3952e016ff 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
@@ -52,9 +52,8 @@ define void @store_i32_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
+; AVX-NEXT: vmovddup {{.*#+}} xmm3 = mem[0,0]
; AVX-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero
; AVX-NEXT: vmovq {{.*#+}} xmm5 = mem[0],zero
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm5[0],xmm4[0]
@@ -64,13 +63,14 @@ define void @store_i32_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm1[3,0],ymm0[1,0],ymm1[7,4],ymm0[5,4]
; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,2],ymm0[2,1],ymm6[4,6],ymm0[6,5]
-; AVX-NEXT: vpalignr {{.*#+}} xmm5 = xmm2[12,13,14,15],xmm5[0,1,2,3,4,5,6,7,8,9,10,11]
+; AVX-NEXT: vpalignr {{.*#+}} xmm5 = xmm3[12,13,14,15],xmm5[0,1,2,3,4,5,6,7,8,9,10,11]
; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,2],xmm4[3,3]
; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3]
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,3],ymm1[4,6],ymm0[4,7]
-; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm1
-; AVX-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[u,u,0,2,u,u,u,5]
-; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2],ymm1[2,3],ymm2[4,6],ymm1[6,7]
+; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2
+; AVX-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[u,u,0,2,u,u,u,5]
+; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6],ymm1[7]
; AVX-NEXT: vmovaps %ymm0, (%rax)
; AVX-NEXT: vextractf128 $1, %ymm6, %xmm0
@@ -499,10 +499,10 @@ define void @store_i32_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3],xmm2[3,3]
; AVX-NEXT: vbroadcastss 12(%rcx), %xmm2
; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX-NEXT: vmovaps %xmm0, 96(%rax)
; AVX-NEXT: vmovaps %ymm3, (%rax)
; AVX-NEXT: vmovaps %ymm9, 64(%rax)
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
+; AVX-NEXT: vmovaps %xmm0, 96(%rax)
; AVX-NEXT: vmovaps %ymm6, 32(%rax)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
@@ -6182,7 +6182,7 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FCP-NEXT: vmovaps (%rax), %xmm0
; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovaps 32(%rax), %xmm3
-; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FCP-NEXT: vmovaps %xmm3, (%rsp) # 16-byte Spill
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FCP-NEXT: vmovaps (%r8), %xmm2
; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -6196,21 +6196,20 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3]
; AVX2-FCP-NEXT: vbroadcastsd %xmm1, %ymm1
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7]
-; AVX2-FCP-NEXT: vmovaps (%rcx), %xmm2
-; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FCP-NEXT: vmovaps (%rcx), %xmm9
; AVX2-FCP-NEXT: vmovaps 32(%rcx), %xmm6
; AVX2-FCP-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovaps (%rdx), %xmm1
; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1],xmm2[1],zero
-; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm9
+; AVX2-FCP-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm1[1],xmm9[1],zero
+; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm14
; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm7
; AVX2-FCP-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT: vmovaps (%rsi), %xmm11
+; AVX2-FCP-NEXT: vmovaps (%rsi), %xmm13
; AVX2-FCP-NEXT: vmovaps 32(%rsi), %xmm8
; AVX2-FCP-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm11[1,1,2,2]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm9[2],xmm2[3]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm13[1,1,2,2]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm14[2],xmm2[3]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7]
@@ -6257,9 +6256,9 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 96(%r8), %xmm1
; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FCP-NEXT: vmovaps 96(%r9), %xmm7
-; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm7[1,1,1,1]
-; AVX2-FCP-NEXT: vmovaps %xmm7, (%rsp) # 16-byte Spill
+; AVX2-FCP-NEXT: vmovaps 96(%r9), %xmm5
+; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm5[1,1,1,1]
+; AVX2-FCP-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
; AVX2-FCP-NEXT: vbroadcastsd %xmm0, %ymm0
; AVX2-FCP-NEXT: vmovaps 96(%rax), %xmm1
@@ -6281,34 +6280,34 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2],ymm1[3,4,5,6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm0
+; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm11
+; AVX2-FCP-NEXT: vmovaps (%rsi), %ymm0
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovaps (%rsi), %ymm13
-; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm13[2],ymm0[3],ymm13[3],ymm0[6],ymm13[6],ymm0[7],ymm13[7]
-; AVX2-FCP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm11[2],ymm0[2],ymm11[3],ymm0[3],ymm11[6],ymm0[6],ymm11[7],ymm0[7]
+; AVX2-FCP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
-; AVX2-FCP-NEXT: vmovaps (%rdx), %ymm6
+; AVX2-FCP-NEXT: vmovaps (%rdx), %ymm8
; AVX2-FCP-NEXT: vmovaps (%rcx), %ymm10
-; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm10[2],ymm6[3],ymm10[3],ymm6[6],ymm10[6],ymm6[7],ymm10[7]
+; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm8[2],ymm10[2],ymm8[3],ymm10[3],ymm8[6],ymm10[6],ymm8[7],ymm10[7]
; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT: vmovaps (%r8), %ymm8
-; AVX2-FCP-NEXT: vmovaps (%r9), %ymm14
-; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm14[1,1,2,2,5,5,6,6]
-; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm8[2],ymm1[3,4,5],ymm8[6],ymm1[7]
; AVX2-FCP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX2-FCP-NEXT: vmovaps (%r8), %ymm7
+; AVX2-FCP-NEXT: vmovaps (%r9), %ymm6
+; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm6[1,1,2,2,5,5,6,6]
+; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm7[2],ymm1[3,4,5],ymm7[6],ymm1[7]
+; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-FCP-NEXT: vmovaps 16(%rax), %xmm2
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm1
-; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovaps 32(%rsi), %ymm0
+; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm0
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7]
+; AVX2-FCP-NEXT: vmovaps 32(%rsi), %ymm1
+; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
; AVX2-FCP-NEXT: vmovaps 32(%rdx), %ymm1
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -6350,47 +6349,49 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm4
+; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm3
; AVX2-FCP-NEXT: vmovaps 96(%rsi), %ymm2
; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7]
-; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[2,2,2,2]
-; AVX2-FCP-NEXT: vmovaps 96(%rdx), %ymm5
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7]
+; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
+; AVX2-FCP-NEXT: vmovaps 96(%rdx), %ymm4
; AVX2-FCP-NEXT: vmovaps 96(%rcx), %ymm1
-; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm15 = ymm5[1,1],ymm1[1,1],ymm5[5,5],ymm1[5,5]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm15[5,6],ymm3[7]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm15 = ymm4[1,1],ymm1[1,1],ymm4[5,5],ymm1[5,5]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5,6],ymm0[7]
; AVX2-FCP-NEXT: vbroadcastsd 112(%r8), %ymm15
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0],ymm3[1,2,3,4,5,6],ymm15[7]
-; AVX2-FCP-NEXT: vbroadcastss 112(%r9), %xmm15
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm15[1],ymm3[2,3,4,5,6,7]
-; AVX2-FCP-NEXT: vbroadcastss 112(%rax), %ymm15
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm15[2],ymm3[3,4,5,6,7]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0],ymm0[1,2,3,4,5,6],ymm15[7]
+; AVX2-FCP-NEXT: vmovaps 96(%r9), %ymm15
+; AVX2-FCP-NEXT: vmovaps {{.*#+}} xmm12 = [4,4,5,5]
+; AVX2-FCP-NEXT: vpermps %ymm15, %ymm12, %ymm12
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4,5,6,7]
+; AVX2-FCP-NEXT: vbroadcastss 112(%rax), %ymm12
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm12[2],ymm0[3,4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vbroadcastss 112(%rdx), %ymm3
-; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm15 = ymm1[3,1,2,0,7,5,6,4]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1,2,3,4,5],ymm3[6],ymm15[7]
-; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm15 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[4],ymm2[4],ymm4[5],ymm2[5]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm15[4,5],ymm3[6,7]
-; AVX2-FCP-NEXT: vbroadcastss 108(%r8), %ymm15
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm15[1],ymm3[2,3,4,5,6,7]
-; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm15 = xmm7[2,2,3,3]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm15[2,3],ymm3[4,5,6,7]
-; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm15 = ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[6],ymm2[6],ymm4[7],ymm2[7]
+; AVX2-FCP-NEXT: vbroadcastss 112(%rdx), %ymm0
+; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm12 = ymm1[3,1,2,0,7,5,6,4]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6],ymm12[7]
+; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5],ymm0[6,7]
+; AVX2-FCP-NEXT: vbroadcastss 108(%r8), %ymm12
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4,5,6,7]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm12 = xmm5[2,2,3,3]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm12[2,3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm12 = [5,6,5,6,5,6,5,6]
+; AVX2-FCP-NEXT: vpermps %ymm15, %ymm12, %ymm12
+; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm15 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2]
-; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm5[2],ymm1[2],ymm5[3],ymm1[3],ymm5[6],ymm1[6],ymm5[7],ymm1[7]
+; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5],mem[6,7]
-; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm12 = [5,6,5,6,5,6,5,6]
-; AVX2-FCP-NEXT: vpermps 96(%r9), %ymm12, %ymm12
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm15[1,2,3,4,5,6],ymm12[7]
; AVX2-FCP-NEXT: vmovaps 96(%rax), %ymm15
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm15[3],ymm3[4,5,6,7]
-; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm15[3],ymm5[4,5,6,7]
+; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm15[2,3],ymm0[2,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0],ymm0[1],ymm12[2,3,4],ymm0[5],ymm12[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[6],ymm5[6],ymm1[7],ymm5[7]
-; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[6],ymm4[6],ymm2[7],ymm4[7]
+; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[6],ymm4[6],ymm1[7],ymm4[7]
+; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
@@ -6401,16 +6402,15 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FCP-NEXT: vbroadcastsd 120(%rax), %ymm1
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovaps %xmm9, %xmm15
+; AVX2-FCP-NEXT: vbroadcastss %xmm9, %xmm0
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
-; AVX2-FCP-NEXT: vbroadcastss %xmm12, %xmm0
-; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX2-FCP-NEXT: vbroadcastss %xmm7, %xmm1
+; AVX2-FCP-NEXT: vbroadcastss %xmm12, %xmm1
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX2-FCP-NEXT: vmovaps %xmm11, %xmm15
-; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm9[0],xmm11[0],xmm9[1],xmm11[1]
-; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm11 = [0,1,2,2,0,1,2,2]
-; AVX2-FCP-NEXT: # ymm11 = mem[0,1,0,1]
-; AVX2-FCP-NEXT: vpermps %ymm2, %ymm11, %ymm2
+; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
+; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm9 = [0,1,2,2,0,1,2,2]
+; AVX2-FCP-NEXT: # ymm9 = mem[0,1,0,1]
+; AVX2-FCP-NEXT: vpermps %ymm2, %ymm9, %ymm2
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
@@ -6420,26 +6420,25 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6],ymm1[7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm9[3,3],xmm15[3,3]
-; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm2 = xmm7[2],xmm12[2],xmm7[3],xmm12[3]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm14[3,3],xmm13[3,3]
+; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm2 = xmm12[2],xmm15[2],xmm12[3],xmm15[3]
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FCP-NEXT: vpermps %ymm2, %ymm11, %ymm2
+; AVX2-FCP-NEXT: vpermps %ymm2, %ymm9, %ymm2
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6],ymm2[7]
; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm5[2,2,2,2]
-; AVX2-FCP-NEXT: vmovaps %xmm5, %xmm9
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3]
; AVX2-FCP-NEXT: vbroadcastsd 8(%rax), %ymm3
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2,3,4],ymm1[5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,1,1,5,5,5,5]
-; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3,4],mem[5],ymm1[6,7]
+; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm1 = mem[1,1,1,1,5,5,5,5]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
-; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm6[1,1],ymm10[1,1],ymm6[5,5],ymm10[5,5]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm8[1,1],ymm10[1,1],ymm8[5,5],ymm10[5,5]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6],ymm1[7]
-; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm14[0,0,0,0,4,4,4,4]
-; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,1,0,1,4,5,4,5]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,0,0,0,4,4,4,4]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm3 = ymm7[0,1,0,1,4,5,4,5]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3]
; AVX2-FCP-NEXT: vbroadcastsd 16(%rax), %ymm3
@@ -6454,20 +6453,20 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; AVX2-FCP-NEXT: vpermps %ymm2, %ymm11, %ymm2
+; AVX2-FCP-NEXT: vpermps %ymm2, %ymm9, %ymm2
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload
+; AVX2-FCP-NEXT: vbroadcastsd (%rsp), %ymm3 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6],ymm1[7]
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm4[3,3],xmm5[3,3]
; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm2 = xmm6[2],xmm0[2],xmm6[3],xmm0[3]
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FCP-NEXT: vpermps %ymm2, %ymm11, %ymm2
+; AVX2-FCP-NEXT: vpermps %ymm2, %ymm9, %ymm2
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6],ymm2[7]
; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm7[2,2,2,2]
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm8[0,1,2],xmm2[3]
@@ -6494,125 +6493,126 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm1[3,4,5,6],ymm2[7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX2-FCP-NEXT: vbroadcastss %xmm8, %xmm1
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX2-FCP-NEXT: vbroadcastss %xmm7, %xmm2
+; AVX2-FCP-NEXT: vbroadcastss %xmm7, %xmm1
+; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX2-FCP-NEXT: vbroadcastss %xmm6, %xmm2
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
-; AVX2-FCP-NEXT: vpermps %ymm2, %ymm11, %ymm2
+; AVX2-FCP-NEXT: vpermps %ymm2, %ymm9, %ymm2
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7]
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
+; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm8[0],xmm5[1],xmm8[1]
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6],ymm1[7]
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm4[3,3],xmm0[3,3]
-; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm2 = xmm7[2],xmm8[2],xmm7[3],xmm8[3]
+; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm2 = xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-FCP-NEXT: vpermps %ymm2, %ymm11, %ymm2
+; AVX2-FCP-NEXT: vpermps %ymm2, %ymm9, %ymm2
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6],ymm2[7]
-; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm6[2,2,2,2]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm8[2,2,2,2]
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3]
; AVX2-FCP-NEXT: vbroadcastsd 72(%rax), %ymm3
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2,3,4],ymm1[5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,1,1,1,5,5,5,5]
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7]
-; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,1,1,5,5,5,5]
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[2,3,4],ymm15[5],ymm1[6,7]
+; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm13[1,1],ymm10[1,1],ymm13[5,5],ymm10[5,5]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm11[1,1],ymm10[1,1],ymm11[5,5],ymm10[5,5]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6],ymm1[7]
-; AVX2-FCP-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm2 = mem[0,0,0,0,4,4,4,4]
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
-; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm3 = ymm15[0,1,0,1,4,5,4,5]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7]
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
+; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm14[0,0,0,0,4,4,4,4]
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm4 = ymm12[0,1,0,1,4,5,4,5]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3,4],ymm2[5],ymm4[6,7]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3]
-; AVX2-FCP-NEXT: vbroadcastsd 80(%rax), %ymm3
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT: vbroadcastsd 80(%rax), %ymm4
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm1[3,4,5,6],ymm2[7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX2-FCP-NEXT: vbroadcastss %xmm7, %xmm1
-; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX2-FCP-NEXT: vbroadcastss %xmm6, %xmm3
-; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
+; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX2-FCP-NEXT: vbroadcastss %xmm5, %xmm4
+; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-FCP-NEXT: vpermps %ymm3, %ymm11, %ymm3
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7]
-; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX2-FCP-NEXT: vmovaps (%rsp), %xmm2 # 16-byte Reload
-; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm3 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
-; AVX2-FCP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 16-byte Folded Reload
-; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6],ymm8[7]
-; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3],xmm1[3,3]
-; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm4 = xmm6[2],xmm7[2],xmm6[3],xmm7[3]
-; AVX2-FCP-NEXT: vpermps %ymm4, %ymm11, %ymm4
-; AVX2-FCP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm4[0,1,2,3,4],ymm3[5,6],ymm4[7]
-; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm3 = xmm2[2,2,2,2]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3]
-; AVX2-FCP-NEXT: vbroadcastsd 104(%rax), %ymm4
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm3[2,3,4],ymm7[5,6,7]
-; AVX2-FCP-NEXT: vbroadcastss 16(%rdx), %ymm3
+; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-FCP-NEXT: vpermps %ymm4, %ymm9, %ymm4
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; AVX2-FCP-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
+; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 16-byte Folded Reload
+; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm6[0],ymm4[2],ymm6[2]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6],ymm8[7]
+; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm4 = xmm0[3,3],xmm1[3,3]
+; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm6 = xmm5[2],xmm7[2],xmm5[3],xmm7[3]
+; AVX2-FCP-NEXT: vpermps %ymm6, %ymm9, %ymm6
+; AVX2-FCP-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0,1,2,3,4],ymm4[5,6],ymm6[7]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm4 = xmm2[2,2,2,2]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3]
+; AVX2-FCP-NEXT: vbroadcastsd 104(%rax), %ymm6
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm4[2,3,4],ymm7[5,6,7]
+; AVX2-FCP-NEXT: vbroadcastss 16(%rdx), %ymm4
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm4 = ymm0[3,1,2,0,7,5,6,4]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6],ymm4[7]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm6 = ymm0[3,1,2,0,7,5,6,4]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6],ymm6[7]
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
-; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm4 = ymm1[0],ymm7[0],ymm1[1],ymm7[1],ymm1[4],ymm7[4],ymm1[5],ymm7[5]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7]
+; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm6 = ymm7[0],ymm1[0],ymm7[1],ymm1[1],ymm7[4],ymm1[4],ymm7[5],ymm1[5]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5],ymm4[6,7]
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm4 = xmm2[3,3],xmm9[3,3]
-; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
-; AVX2-FCP-NEXT: # xmm4 = xmm4[0,1,2],mem[3]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm4[1,2,3],ymm3[4,5,6,7]
-; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm3 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
-; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm4 = ymm7[2],ymm1[2],ymm7[3],ymm1[3],ymm7[6],ymm1[6],ymm7[7],ymm1[7]
-; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[3,3,3,3]
+; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload
+; AVX2-FCP-NEXT: # xmm6 = xmm2[3,3],mem[3,3]
+; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
+; AVX2-FCP-NEXT: # xmm6 = xmm6[0,1,2],mem[3]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0],ymm6[1,2,3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm4 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
+; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm6 = ymm1[2],ymm7[2],ymm1[3],ymm7[3],ymm1[6],ymm7[6],ymm1[7],ymm7[7]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7]
+; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[3,3,3,3]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3,4,5,6,7]
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm4 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7]
+; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm6 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7]
; AVX2-FCP-NEXT: vpermilps {{.*#+}} ymm7 = mem[2,3,2,3,6,7,6,7]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0],ymm4[1,2],ymm7[3,4],ymm4[5,6],ymm7[7]
-; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,2,3]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4],ymm4[5,6,7]
-; AVX2-FCP-NEXT: vbroadcastss 48(%rdx), %ymm4
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0],ymm6[1,2],ymm7[3,4],ymm6[5,6],ymm7[7]
+; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,2,3]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0],ymm4[1,2,3,4],ymm6[5,6,7]
+; AVX2-FCP-NEXT: vbroadcastss 48(%rdx), %ymm6
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm7 = ymm0[3,1,2,0,7,5,6,4]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6],ymm7[7]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6],ymm7[7]
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
-; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm7 = ymm9[0],ymm1[0],ymm9[1],ymm1[1],ymm9[4],ymm1[4],ymm9[5],ymm1[5]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5],ymm4[6,7]
-; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm7 # 16-byte Folded Reload
-; AVX2-FCP-NEXT: # xmm7 = xmm5[3,3],mem[3,3]
-; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
+; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm7 = ymm8[0],ymm1[0],ymm8[1],ymm1[1],ymm8[4],ymm1[4],ymm8[5],ymm1[5]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7]
+; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm7 # 16-byte Folded Reload
+; AVX2-FCP-NEXT: # xmm7 = xmm3[3,3],mem[3,3]
+; AVX2-FCP-NEXT: vblendps $8, (%rsp), %xmm7, %xmm7 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm7 = xmm7[0,1,2],mem[3]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm7[1,2,3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1,2,3],ymm6[4,5,6,7]
; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm7 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
-; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm8 = ymm1[2],ymm9[2],ymm1[3],ymm9[3],ymm1[6],ymm9[6],ymm1[7],ymm9[7]
+; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm8 = ymm1[2],ymm8[2],ymm1[3],ymm8[3],ymm1[6],ymm8[6],ymm1[7],ymm8[7]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[3,3,3,3]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7]
@@ -6626,7 +6626,7 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FCP-NEXT: vbroadcastss 80(%rdx), %ymm8
; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm9 = ymm10[3,1,2,0,7,5,6,4]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6],ymm9[7]
-; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm9 = ymm14[0],ymm12[0],ymm14[1],ymm12[1],ymm14[4],ymm12[4],ymm14[5],ymm12[5]
+; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm9 = ymm15[0],ymm13[0],ymm15[1],ymm13[1],ymm15[4],ymm13[4],ymm15[5],ymm13[5]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7]
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload
@@ -6634,13 +6634,12 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm9 = xmm9[0,1,2],mem[3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1,2,3],ymm8[4,5,6,7]
-; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm9 = ymm10[2],ymm13[2],ymm10[3],ymm13[3],ymm10[6],ymm13[6],ymm10[7],ymm13[7]
-; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm10 = ymm12[2],ymm14[2],ymm12[3],ymm14[3],ymm12[6],ymm14[6],ymm12[7],ymm14[7]
+; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm9 = ymm10[2],ymm11[2],ymm10[3],ymm11[3],ymm10[6],ymm11[6],ymm10[7],ymm11[7]
+; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm10 = ymm13[2],ymm15[2],ymm13[3],ymm15[3],ymm13[6],ymm15[6],ymm13[7],ymm15[7]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[3,3,3,3]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[3,3,3,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7]
-; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm10 = ymm15[3,3],mem[3,3],ymm15[7,7],mem[7,7]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm10 = ymm12[3,3],ymm14[3,3],ymm12[7,7],ymm14[7,7]
; AVX2-FCP-NEXT: vpermilps {{.*#+}} ymm11 = mem[2,3,2,3,6,7,6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0],ymm10[1,2],ymm11[3,4],ymm10[5,6],ymm11[7]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,1,2,3]
@@ -6649,10 +6648,10 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FCP-NEXT: vmovaps %ymm9, 640(%rax)
; AVX2-FCP-NEXT: vmovaps %ymm8, 544(%rax)
; AVX2-FCP-NEXT: vmovaps %ymm7, 416(%rax)
-; AVX2-FCP-NEXT: vmovaps %ymm4, 320(%rax)
-; AVX2-FCP-NEXT: vmovaps %ymm3, 192(%rax)
+; AVX2-FCP-NEXT: vmovaps %ymm6, 320(%rax)
+; AVX2-FCP-NEXT: vmovaps %ymm4, 192(%rax)
; AVX2-FCP-NEXT: vmovaps %ymm2, 96(%rax)
-; AVX2-FCP-NEXT: vmovaps %ymm6, 736(%rax)
+; AVX2-FCP-NEXT: vmovaps %ymm5, 736(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 672(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -13003,12 +13002,12 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT: vmovaps (%r8), %ymm15
-; AVX2-FCP-NEXT: vmovaps (%r9), %ymm13
-; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6]
+; AVX2-FCP-NEXT: vmovaps (%r8), %ymm13
+; AVX2-FCP-NEXT: vmovaps (%r9), %ymm1
+; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm13[2],ymm1[3,4,5],ymm13[6],ymm1[7]
; AVX2-FCP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm15[2],ymm1[3,4,5],ymm15[6],ymm1[7]
-; AVX2-FCP-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-FCP-NEXT: vmovaps 16(%rax), %xmm2
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7]
@@ -13060,11 +13059,11 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm0
-; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovaps 96(%rsi), %ymm1
+; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm1
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; AVX2-FCP-NEXT: vmovaps 96(%rsi), %ymm0
+; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
; AVX2-FCP-NEXT: vmovaps 96(%rdx), %ymm1
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -13106,11 +13105,11 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm0
-; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovaps 160(%rsi), %ymm1
+; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm1
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; AVX2-FCP-NEXT: vmovaps 160(%rsi), %ymm0
+; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
; AVX2-FCP-NEXT: vmovaps 160(%rdx), %ymm1
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -13129,17 +13128,17 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm10
-; AVX2-FCP-NEXT: vmovaps 192(%rsi), %ymm12
-; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[6],ymm12[6],ymm10[7],ymm12[7]
-; AVX2-FCP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm11
+; AVX2-FCP-NEXT: vmovaps 192(%rsi), %ymm8
+; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm11[2],ymm8[2],ymm11[3],ymm8[3],ymm11[6],ymm8[6],ymm11[7],ymm8[7]
+; AVX2-FCP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
; AVX2-FCP-NEXT: vmovaps 192(%rdx), %ymm7
-; AVX2-FCP-NEXT: vmovaps 192(%rcx), %ymm8
-; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[6],ymm8[6],ymm7[7],ymm8[7]
+; AVX2-FCP-NEXT: vmovaps 192(%rcx), %ymm12
+; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm7[2],ymm12[2],ymm7[3],ymm12[3],ymm7[6],ymm12[6],ymm7[7],ymm12[7]
; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovaps 192(%r8), %ymm2
; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -13157,99 +13156,101 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm1[1,1,2,2]
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2],xmm2[3]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1]
-; AVX2-FCP-NEXT: vmovaps 224(%rcx), %xmm3
+; AVX2-FCP-NEXT: vmovaps 224(%rcx), %xmm4
; AVX2-FCP-NEXT: vmovaps 224(%rdx), %xmm6
-; AVX2-FCP-NEXT: vinsertps {{.*#+}} xmm5 = zero,xmm6[1],xmm3[1],zero
+; AVX2-FCP-NEXT: vinsertps {{.*#+}} xmm5 = zero,xmm6[1],xmm4[1],zero
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1,2],ymm2[3,4,5,6,7]
-; AVX2-FCP-NEXT: vbroadcastss 228(%r8), %ymm4
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7]
-; AVX2-FCP-NEXT: vmovaps 224(%r9), %xmm4
-; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm11 = xmm4[1,1,1,1]
-; AVX2-FCP-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5],ymm2[6,7]
+; AVX2-FCP-NEXT: vbroadcastss 228(%r8), %ymm3
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT: vmovaps 224(%r9), %xmm3
+; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm14 = xmm3[1,1,1,1]
+; AVX2-FCP-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5],ymm2[6,7]
; AVX2-FCP-NEXT: vinsertf128 $1, 224(%rax), %ymm5, %ymm5
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4],ymm5[5],ymm2[6,7]
; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vbroadcastss %xmm3, %xmm2
+; AVX2-FCP-NEXT: vbroadcastss %xmm4, %xmm2
; AVX2-FCP-NEXT: vbroadcastss %xmm6, %xmm5
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
-; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm11 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm14 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-FCP-NEXT: vbroadcastf128 {{.*#+}} ymm9 = [0,1,2,2,0,1,2,2]
; AVX2-FCP-NEXT: # ymm9 = mem[0,1,0,1]
-; AVX2-FCP-NEXT: vpermps %ymm11, %ymm9, %ymm11
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3],ymm11[4,5,6,7]
-; AVX2-FCP-NEXT: vbroadcastsd 224(%r8), %ymm11
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5],ymm2[6,7]
-; AVX2-FCP-NEXT: vbroadcastss %xmm4, %ymm11
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm11[5],ymm2[6,7]
-; AVX2-FCP-NEXT: vbroadcastss 224(%rax), %ymm11
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm11[6],ymm2[7]
+; AVX2-FCP-NEXT: vpermps %ymm14, %ymm9, %ymm14
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1],ymm2[2,3],ymm14[4,5,6,7]
+; AVX2-FCP-NEXT: vbroadcastsd 224(%r8), %ymm14
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5],ymm2[6,7]
+; AVX2-FCP-NEXT: vbroadcastss %xmm3, %ymm14
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5],ymm2[6,7]
+; AVX2-FCP-NEXT: vbroadcastss 224(%rax), %ymm14
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm14[6],ymm2[7]
; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3]
-; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm1 = xmm6[2],xmm3[2],xmm6[3],xmm3[3]
+; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm1 = xmm6[2],xmm4[2],xmm6[3],xmm4[3]
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FCP-NEXT: vpermps %ymm1, %ymm9, %ymm1
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7]
; AVX2-FCP-NEXT: vmovaps 224(%r8), %ymm6
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,2,2,2]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vbroadcastss 232(%rax), %ymm1
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm2
-; AVX2-FCP-NEXT: vmovaps 224(%rsi), %ymm1
-; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1,1,1,5,5,5,5]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7]
-; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm11 = ymm0[2,2,2,2]
-; AVX2-FCP-NEXT: vmovaps 224(%rdx), %ymm3
+; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm1
+; AVX2-FCP-NEXT: vmovaps 224(%rsi), %ymm14
+; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,1,1,1,5,5,5,5]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7]
+; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[2,2,2,2]
+; AVX2-FCP-NEXT: vmovaps 224(%rdx), %ymm2
; AVX2-FCP-NEXT: vmovaps 224(%rcx), %ymm0
-; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm14 = ymm3[1,1],ymm0[1,1],ymm3[5,5],ymm0[5,5]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5,6],ymm11[7]
-; AVX2-FCP-NEXT: vbroadcastsd 240(%r8), %ymm14
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0],ymm11[1,2,3,4,5,6],ymm14[7]
-; AVX2-FCP-NEXT: vbroadcastss 240(%r9), %xmm14
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm14[1],ymm11[2,3,4,5,6,7]
-; AVX2-FCP-NEXT: vbroadcastss 240(%rax), %ymm14
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm14[2],ymm11[3,4,5,6,7]
-; AVX2-FCP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm11 = ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[6],ymm7[6],ymm8[7],ymm7[7]
-; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[3,3,3,3]
-; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm14 = ymm12[2],ymm10[2],ymm12[3],ymm10[3],ymm12[6],ymm10[6],ymm12[7],ymm10[7]
-; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[3,3,3,3]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0,1,2],ymm11[3,4,5,6,7]
-; AVX2-FCP-NEXT: vbroadcastss 220(%r8), %ymm14
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5],ymm11[6,7]
-; AVX2-FCP-NEXT: vbroadcastss 220(%r9), %ymm14
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm14[6,7]
-; AVX2-FCP-NEXT: vbroadcastsd 216(%rax), %ymm14
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0],ymm11[1,2,3,4,5,6],ymm14[7]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm10 = ymm2[1,1],ymm0[1,1],ymm2[5,5],ymm0[5,5]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm10[5,6],ymm4[7]
+; AVX2-FCP-NEXT: vbroadcastsd 240(%r8), %ymm10
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm4[1,2,3,4,5,6],ymm10[7]
+; AVX2-FCP-NEXT: vmovaps 224(%r9), %ymm4
+; AVX2-FCP-NEXT: vmovaps {{.*#+}} xmm15 = [4,4,5,5]
+; AVX2-FCP-NEXT: vpermps %ymm4, %ymm15, %ymm15
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm15[1],ymm10[2,3,4,5,6,7]
+; AVX2-FCP-NEXT: vbroadcastss 240(%rax), %ymm15
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm15[2],ymm10[3,4,5,6,7]
+; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm10 = ymm12[2],ymm7[2],ymm12[3],ymm7[3],ymm12[6],ymm7[6],ymm12[7],ymm7[7]
+; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[3,3,3,3]
+; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm15 = ymm8[2],ymm11[2],ymm8[3],ymm11[3],ymm8[6],ymm11[6],ymm8[7],ymm11[7]
+; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0,1,2],ymm10[3,4,5,6,7]
+; AVX2-FCP-NEXT: vbroadcastss 220(%r8), %ymm15
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm15[5],ymm10[6,7]
+; AVX2-FCP-NEXT: vbroadcastss 220(%r9), %ymm15
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm15[6,7]
+; AVX2-FCP-NEXT: vbroadcastsd 216(%rax), %ymm15
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm15[0],ymm10[1,2,3,4,5,6],ymm15[7]
; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vbroadcastss 240(%rdx), %ymm11
-; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm14 = ymm0[3,1,2,0,7,5,6,4]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0,1,2,3,4,5],ymm11[6],ymm14[7]
-; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm14 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5],ymm11[6,7]
-; AVX2-FCP-NEXT: vbroadcastss 236(%r8), %ymm14
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm14[1],ymm11[2,3,4,5,6,7]
-; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,2,3,3]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1],ymm4[2,3],ymm11[4,5,6,7]
-; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm11 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7]
-; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,2]
-; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm14 = ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[6],ymm0[6],ymm3[7],ymm0[7]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5],ymm6[6,7]
-; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm11 = [5,6,5,6,5,6,5,6]
-; AVX2-FCP-NEXT: vpermps 224(%r9), %ymm11, %ymm11
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0],ymm6[1,2,3,4,5,6],ymm11[7]
-; AVX2-FCP-NEXT: vmovaps 224(%rax), %ymm11
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm11[3],ymm4[4,5,6,7]
-; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm11[2,3],ymm14[2,3]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7]
-; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[6],ymm3[6],ymm0[7],ymm3[7]
-; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7]
+; AVX2-FCP-NEXT: vbroadcastss 240(%rdx), %ymm10
+; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm15 = ymm0[3,1,2,0,7,5,6,4]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0,1,2,3,4,5],ymm10[6],ymm15[7]
+; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm15 = ymm1[0],ymm14[0],ymm1[1],ymm14[1],ymm1[4],ymm14[4],ymm1[5],ymm14[5]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm15[4,5],ymm10[6,7]
+; AVX2-FCP-NEXT: vbroadcastss 236(%r8), %ymm15
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm15[1],ymm10[2,3,4,5,6,7]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,2,3,3]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1],ymm3[2,3],ymm10[4,5,6,7]
+; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm10 = ymm1[2],ymm14[2],ymm1[3],ymm14[3],ymm1[6],ymm14[6],ymm1[7],ymm14[7]
+; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,2,2,2]
+; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm15 = ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[6],ymm0[6],ymm2[7],ymm0[7]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm15[4,5,6,7]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1,2,3,4,5],ymm6[6,7]
+; AVX2-FCP-NEXT: vbroadcastsd {{.*#+}} ymm10 = [5,6,5,6,5,6,5,6]
+; AVX2-FCP-NEXT: vpermps %ymm4, %ymm10, %ymm4
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm6[1,2,3,4,5,6],ymm4[7]
+; AVX2-FCP-NEXT: vmovaps 224(%rax), %ymm6
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3],ymm3[4,5,6,7]
+; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm6[2,3],ymm15[2,3]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7]
+; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[6],ymm2[6],ymm0[7],ymm2[7]
+; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm1 = ymm14[2],ymm1[2],ymm14[3],ymm1[3],ymm14[6],ymm1[6],ymm14[7],ymm1[7]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
@@ -13266,24 +13267,24 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FCP-NEXT: vbroadcastss %xmm4, %xmm1
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
+; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm6[0],xmm8[1],xmm6[1]
; AVX2-FCP-NEXT: vpermps %ymm1, %ymm9, %ymm1
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm8[0],xmm3[1],xmm8[1]
+; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm7[0],xmm3[1],xmm7[1]
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm7[3,3],xmm6[3,3]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm8[3,3],xmm6[3,3]
; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm1 = xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FCP-NEXT: vpermps %ymm1, %ymm9, %ymm1
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7]
-; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm8[2,2,2,2]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2]
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
; AVX2-FCP-NEXT: vbroadcastsd 8(%rax), %ymm2
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
@@ -13298,39 +13299,40 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FCP-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm1 = ymm1[1,1],mem[1,1],ymm1[5,5],mem[5,5]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7]
-; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,0,0,0,4,4,4,4]
-; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm15[0,1,0,1,4,5,4,5]
+; AVX2-FCP-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm13[0,1,0,1,4,5,4,5]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-FCP-NEXT: vbroadcastsd 16(%rax), %ymm2
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-FCP-NEXT: vbroadcastss %xmm3, %xmm0
-; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX2-FCP-NEXT: vbroadcastss %xmm7, %xmm1
+; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX2-FCP-NEXT: vbroadcastss %xmm4, %xmm0
+; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX2-FCP-NEXT: vbroadcastss %xmm8, %xmm1
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; AVX2-FCP-NEXT: vpermps %ymm1, %ymm9, %ymm1
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm8[0],xmm4[1],xmm8[1]
+; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm7[0],xmm3[1],xmm7[1]
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm5[3,3],xmm6[3,3]
-; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm3[2],xmm7[3],xmm3[3]
+; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm1 = xmm8[2],xmm4[2],xmm8[3],xmm4[3]
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FCP-NEXT: vpermps %ymm1, %ymm9, %ymm1
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7]
-; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm8[2,2,2,2]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
; AVX2-FCP-NEXT: vbroadcastsd 40(%rax), %ymm2
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7]
@@ -13354,30 +13356,30 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX2-FCP-NEXT: vbroadcastss %xmm7, %xmm0
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX2-FCP-NEXT: vbroadcastss %xmm6, %xmm1
+; AVX2-FCP-NEXT: vbroadcastss %xmm6, %xmm0
+; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX2-FCP-NEXT: vbroadcastss %xmm5, %xmm1
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
+; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; AVX2-FCP-NEXT: vpermps %ymm1, %ymm9, %ymm1
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm4[0],xmm8[1],xmm4[1]
+; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm5[3,3],xmm3[3,3]
-; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm1 = xmm6[2],xmm7[2],xmm6[3],xmm7[3]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm4[3,3],xmm3[3,3]
+; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm6[2],xmm5[3],xmm6[3]
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FCP-NEXT: vpermps %ymm1, %ymm9, %ymm1
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7]
-; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,2,2,2]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2]
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3]
; AVX2-FCP-NEXT: vbroadcastsd 72(%rax), %ymm2
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
@@ -13412,9 +13414,9 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
; AVX2-FCP-NEXT: vpermps %ymm1, %ymm9, %ymm1
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
@@ -13425,16 +13427,16 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FCP-NEXT: vpermps %ymm1, %ymm9, %ymm1
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7]
-; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,2,2,2]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3]
; AVX2-FCP-NEXT: vbroadcastsd 104(%rax), %ymm2
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm13[1,1,1,1,5,5,5,5]
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7]
+; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5]
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
@@ -13460,9 +13462,9 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
; AVX2-FCP-NEXT: vpermps %ymm1, %ymm9, %ymm1
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX2-FCP-NEXT: vmovaps (%rsp), %xmm3 # 16-byte Reload
-; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX2-FCP-NEXT: vmovaps (%rsp), %xmm4 # 16-byte Reload
+; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
@@ -13473,20 +13475,20 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FCP-NEXT: vpermps %ymm1, %ymm9, %ymm1
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7]
-; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,2,2,2]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3]
; AVX2-FCP-NEXT: vbroadcastsd 136(%rax), %ymm2
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,1,1,1,5,5,5,5]
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3,4],ymm14[5],ymm0[6,7]
+; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4],mem[5],ymm0[6,7]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm1 = ymm1[1,1],mem[1,1],ymm1[5,5],mem[5,5]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm13[1,1],ymm1[5,5],ymm13[5,5]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7]
; AVX2-FCP-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4]
@@ -13508,9 +13510,9 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
; AVX2-FCP-NEXT: vpermps %ymm1, %ymm9, %ymm1
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
@@ -13521,16 +13523,16 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FCP-NEXT: vpermps %ymm1, %ymm9, %ymm1
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7]
-; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,2,2,2]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3]
; AVX2-FCP-NEXT: vbroadcastsd 168(%rax), %ymm2
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5]
-; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4],mem[5],ymm0[6,7]
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3,4],ymm14[5],ymm0[6,7]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
@@ -13556,9 +13558,9 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
; AVX2-FCP-NEXT: vpermps %ymm1, %ymm9, %ymm1
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
+; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
@@ -13569,213 +13571,215 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FCP-NEXT: vpermps %ymm1, %ymm9, %ymm1
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7]
-; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,2,2,2]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
; AVX2-FCP-NEXT: vbroadcastsd 200(%rax), %ymm2
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm12[1,1],ymm0[5,5],ymm12[5,5]
-; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm1 = mem[1,1,1,1,5,5,5,5]
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7]
-; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7]
-; AVX2-FCP-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4]
-; AVX2-FCP-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm2 = mem[0,1,0,1,4,5,4,5]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7]
-; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3]
-; AVX2-FCP-NEXT: vbroadcastsd 208(%rax), %ymm2
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7]
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm11[1,1],ymm0[5,5],ymm11[5,5]
+; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm2 = mem[1,1,1,1,5,5,5,5]
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm10[1],ymm2[2,3,4],ymm10[5],ymm2[6,7]
+; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6],ymm2[7]
+; AVX2-FCP-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm2 = mem[0,0,0,0,4,4,4,4]
+; AVX2-FCP-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm4 = mem[0,1,0,1,4,5,4,5]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3,4],ymm2[5],ymm4[6,7]
+; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3]
+; AVX2-FCP-NEXT: vbroadcastsd 208(%rax), %ymm4
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6],ymm2[7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vbroadcastss 16(%rdx), %ymm0
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm1 = ymm4[3,1,2,0,7,5,6,4]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7]
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
-; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
-; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; AVX2-FCP-NEXT: # xmm1 = xmm1[3,3],mem[3,3]
-; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; AVX2-FCP-NEXT: # xmm1 = xmm1[0,1,2],mem[3]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm4 = ymm3[3,1,2,0,7,5,6,4]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6],ymm4[7]
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm4 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7]
+; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX2-FCP-NEXT: # xmm4 = xmm4[3,3],mem[3,3]
+; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX2-FCP-NEXT: # xmm4 = xmm4[0,1,2],mem[3]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm0 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7]
-; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm5 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7]
-; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
+; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm4 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7]
+; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm5 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7]
+; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[3,3,3,3]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7]
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm5 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7]
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm5 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7]
; AVX2-FCP-NEXT: vpermilps {{.*#+}} ymm6 = mem[2,3,2,3,6,7,6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3,4],ymm5[5,6],ymm6[7]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,2,3]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4],ymm5[5,6,7]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm4[1,2,3,4],ymm5[5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vbroadcastss 48(%rdx), %ymm0
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm6 = ymm4[3,1,2,0,7,5,6,4]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6],ymm6[7]
+; AVX2-FCP-NEXT: vbroadcastss 48(%rdx), %ymm4
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm6 = ymm3[3,1,2,0,7,5,6,4]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6],ymm6[7]
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5],ymm0[6,7]
-; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm6 # 16-byte Folded Reload
-; AVX2-FCP-NEXT: # xmm6 = xmm3[3,3],mem[3,3]
+; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm6 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5],ymm4[6,7]
+; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload
+; AVX2-FCP-NEXT: # xmm6 = xmm2[3,3],mem[3,3]
; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm6 = xmm6[0,1,2],mem[3]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0],ymm6[1,2,3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm0 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7]
-; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm6 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7]
-; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm4[0],ymm6[1,2,3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm4 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7]
+; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm6 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[3,3,3,3]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7]
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm6 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3,4,5,6,7]
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm6 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7]
; AVX2-FCP-NEXT: vpermilps {{.*#+}} ymm7 = mem[2,3,2,3,6,7,6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0],ymm6[1,2],ymm7[3,4],ymm6[5,6],ymm7[7]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,2,3]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0],ymm0[1,2,3,4],ymm6[5,6,7]
-; AVX2-FCP-NEXT: vbroadcastss 80(%rdx), %ymm0
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm6 = ymm4[3,1,2,0,7,5,6,4]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6],ymm6[7]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm6[0],ymm4[1,2,3,4],ymm6[5,6,7]
+; AVX2-FCP-NEXT: vbroadcastss 80(%rdx), %ymm4
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm6 = ymm3[3,1,2,0,7,5,6,4]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6],ymm6[7]
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5],ymm0[6,7]
-; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm6 # 16-byte Folded Reload
-; AVX2-FCP-NEXT: # xmm6 = xmm3[3,3],mem[3,3]
+; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm6 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5],ymm4[6,7]
+; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload
+; AVX2-FCP-NEXT: # xmm6 = xmm2[3,3],mem[3,3]
; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm6 = xmm6[0,1,2],mem[3]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0],ymm6[1,2,3],ymm0[4,5,6,7]
-; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm0 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7]
-; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm6 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7]
-; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0],ymm6[1,2,3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm4 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7]
+; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm6 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[3,3,3,3]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7]
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm6 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7]
-; AVX2-FCP-NEXT: vpermilps {{.*#+}} ymm9 = mem[2,3,2,3,6,7,6,7]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0],ymm6[1,2],ymm9[3,4],ymm6[5,6],ymm9[7]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3,4,5,6,7]
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm6 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7]
+; AVX2-FCP-NEXT: vpermilps {{.*#+}} ymm8 = mem[2,3,2,3,6,7,6,7]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0],ymm6[1,2],ymm8[3,4],ymm6[5,6],ymm8[7]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,2,3]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm0[1,2,3,4],ymm6[5,6,7]
-; AVX2-FCP-NEXT: vbroadcastss 112(%rdx), %ymm1
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm4[1,2,3,4],ymm6[5,6,7]
+; AVX2-FCP-NEXT: vbroadcastss 112(%rdx), %ymm4
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm8 = ymm3[3,1,2,0,7,5,6,4]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6],ymm8[7]
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm9 = ymm0[3,1,2,0,7,5,6,4]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm1[6],ymm9[7]
-; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm9 = ymm10[0],ymm13[0],ymm10[1],ymm13[1],ymm10[4],ymm13[4],ymm10[5],ymm13[5]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5],ymm2[6,7]
+; AVX2-FCP-NEXT: vmovaps %ymm12, %ymm1
+; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm8 = ymm12[0],ymm0[0],ymm12[1],ymm0[1],ymm12[4],ymm0[4],ymm12[5],ymm0[5]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5],ymm4[6,7]
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm9 # 16-byte Folded Reload
-; AVX2-FCP-NEXT: # xmm9 = xmm8[3,3],mem[3,3]
-; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
-; AVX2-FCP-NEXT: # xmm9 = xmm9[0,1,2],mem[3]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm9[1,2,3],ymm2[4,5,6,7]
-; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm9 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
-; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm10 = ymm13[2],ymm10[2],ymm13[3],ymm10[3],ymm13[6],ymm10[6],ymm13[7],ymm10[7]
-; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[3,3,3,3]
-; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[3,3,3,3]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7]
+; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
+; AVX2-FCP-NEXT: # xmm8 = xmm8[3,3],mem[3,3]
+; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
+; AVX2-FCP-NEXT: # xmm8 = xmm8[0,1,2],mem[3]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm4[0],ymm8[1,2,3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm4 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7]
+; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm8 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
+; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3]
+; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1,2],ymm4[3,4,5,6,7]
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm10 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7]
-; AVX2-FCP-NEXT: vpermilps {{.*#+}} ymm13 = mem[2,3,2,3,6,7,6,7]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0],ymm10[1,2],ymm13[3,4],ymm10[5,6],ymm13[7]
-; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,1,2,3]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1,2,3,4],ymm10[5,6,7]
-; AVX2-FCP-NEXT: vbroadcastss 144(%rdx), %ymm10
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm13 = ymm1[3,1,2,0,7,5,6,4]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0,1,2,3,4,5],ymm10[6],ymm13[7]
-; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm13 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[4],ymm15[4],ymm14[5],ymm15[5]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5],ymm10[6,7]
+; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm8 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7]
+; AVX2-FCP-NEXT: vpermilps {{.*#+}} ymm9 = mem[2,3,2,3,6,7,6,7]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0],ymm8[1,2],ymm9[3,4],ymm8[5,6],ymm9[7]
+; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,1,2,3]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm8[0],ymm4[1,2,3,4],ymm8[5,6,7]
+; AVX2-FCP-NEXT: vbroadcastss 144(%rdx), %ymm4
+; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm8 = ymm13[3,1,2,0,7,5,6,4]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6],ymm8[7]
+; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm8 = ymm3[0],ymm15[0],ymm3[1],ymm15[1],ymm3[4],ymm15[4],ymm3[5],ymm15[5]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5],ymm4[6,7]
; AVX2-FCP-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload
-; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm13 # 16-byte Folded Reload
-; AVX2-FCP-NEXT: # xmm13 = xmm8[3,3],mem[3,3]
-; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
-; AVX2-FCP-NEXT: # xmm13 = xmm13[0,1,2],mem[3]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm13[1,2,3],ymm10[4,5,6,7]
-; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm13 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
-; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm14 = ymm15[2],ymm14[2],ymm15[3],ymm14[3],ymm15[6],ymm14[6],ymm15[7],ymm14[7]
+; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
+; AVX2-FCP-NEXT: # xmm8 = xmm8[3,3],mem[3,3]
+; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
+; AVX2-FCP-NEXT: # xmm8 = xmm8[0,1,2],mem[3]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm8[1,2,3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm8 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm8 = ymm13[2],mem[2],ymm13[3],mem[3],ymm13[6],mem[6],ymm13[7],mem[7]
+; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm13 = ymm15[2],ymm3[2],ymm15[3],ymm3[3],ymm15[6],ymm3[6],ymm15[7],ymm3[7]
+; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[3,3,3,3]
-; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[3,3,3,3]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2],ymm13[3,4,5,6,7]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1,2],ymm8[3,4,5,6,7]
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm14 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7]
+; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm13 = ymm0[3,3],mem[3,3],ymm0[7,7],mem[7,7]
; AVX2-FCP-NEXT: vpermilps {{.*#+}} ymm15 = mem[2,3,2,3,6,7,6,7]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3,4],ymm14[5,6],ymm15[7]
-; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,2,3]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1,2,3,4],ymm14[5,6,7]
-; AVX2-FCP-NEXT: vbroadcastss 176(%rdx), %ymm14
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0],ymm13[1,2],ymm15[3,4],ymm13[5,6],ymm15[7]
+; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,1,2,3]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0],ymm8[1,2,3,4],ymm13[5,6,7]
+; AVX2-FCP-NEXT: vbroadcastss 176(%rdx), %ymm13
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm15 = ymm1[3,1,2,0,7,5,6,4]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6],ymm15[7]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5],ymm13[6],ymm15[7]
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
-; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm15 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[4],ymm0[4],ymm4[5],ymm0[5]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7]
-; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm15 # 16-byte Folded Reload
-; AVX2-FCP-NEXT: # xmm15 = xmm8[3,3],mem[3,3]
+; AVX2-FCP-NEXT: vmovaps %ymm14, %ymm3
+; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm15 = ymm14[0],ymm0[0],ymm14[1],ymm0[1],ymm14[4],ymm0[4],ymm14[5],ymm0[5]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm15[4,5],ymm13[6,7]
+; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm15 # 16-byte Folded Reload
+; AVX2-FCP-NEXT: # xmm15 = xmm14[3,3],mem[3,3]
; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm15 = xmm15[0,1,2],mem[3]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1,2,3],ymm14[4,5,6,7]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm15[1,2,3],ymm13[4,5,6,7]
; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm15 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
-; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[6],ymm4[6],ymm0[7],ymm4[7]
+; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[6],ymm3[6],ymm0[7],ymm3[7]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3]
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7]
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm15 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7]
-; AVX2-FCP-NEXT: vpermilps {{.*#+}} ymm8 = mem[2,3,2,3,6,7,6,7]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm15[1,2],ymm8[3,4],ymm15[5,6],ymm8[7]
-; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,1,2,3]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0],ymm0[1,2,3,4],ymm8[5,6,7]
-; AVX2-FCP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm8 # 32-byte Folded Reload
-; AVX2-FCP-NEXT: # ymm8 = ymm11[0],mem[0],ymm11[1],mem[1],ymm11[4],mem[4],ymm11[5],mem[5]
-; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm15 = ymm12[3,1,2,0,7,5,6,4]
-; AVX2-FCP-NEXT: vbroadcastss 208(%rdx), %ymm4
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm15[0,1,2,3,4,5],ymm4[6],ymm15[7]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5],ymm4[6,7]
+; AVX2-FCP-NEXT: vpermilps {{.*#+}} ymm14 = mem[2,3,2,3,6,7,6,7]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1,2],ymm14[3,4],ymm15[5,6],ymm14[7]
+; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,2,3]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0],ymm0[1,2,3,4],ymm14[5,6,7]
+; AVX2-FCP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm14 # 32-byte Folded Reload
+; AVX2-FCP-NEXT: # ymm14 = ymm10[0],mem[0],ymm10[1],mem[1],ymm10[4],mem[4],ymm10[5],mem[5]
+; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm15 = ymm11[3,1,2,0,7,5,6,4]
+; AVX2-FCP-NEXT: vbroadcastss 208(%rdx), %ymm3
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1,2,3,4,5],ymm3[6],ymm15[7]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5],ymm3[6,7]
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm8 # 16-byte Folded Reload
-; AVX2-FCP-NEXT: # xmm8 = xmm1[3,3],mem[3,3]
-; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
-; AVX2-FCP-NEXT: # xmm8 = xmm8[0,1,2],mem[3]
-; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm8[1,2,3],ymm4[4,5,6,7]
+; AVX2-FCP-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm14 # 16-byte Folded Reload
+; AVX2-FCP-NEXT: # xmm14 = xmm1[3,3],mem[3,3]
+; AVX2-FCP-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
+; AVX2-FCP-NEXT: # xmm14 = xmm14[0,1,2],mem[3]
+; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm14[1,2,3],ymm3[4,5,6,7]
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-FCP-NEXT: vmovaps %ymm4, 1440(%rax)
+; AVX2-FCP-NEXT: vmovaps %ymm3, 1440(%rax)
; AVX2-FCP-NEXT: vmovaps %ymm0, 1312(%rax)
-; AVX2-FCP-NEXT: vmovaps %ymm14, 1216(%rax)
-; AVX2-FCP-NEXT: vmovaps %ymm13, 1088(%rax)
-; AVX2-FCP-NEXT: vmovaps %ymm10, 992(%rax)
+; AVX2-FCP-NEXT: vmovaps %ymm13, 1216(%rax)
+; AVX2-FCP-NEXT: vmovaps %ymm8, 1088(%rax)
+; AVX2-FCP-NEXT: vmovaps %ymm4, 992(%rax)
; AVX2-FCP-NEXT: vmovaps %ymm9, 864(%rax)
-; AVX2-FCP-NEXT: vmovaps %ymm2, 768(%rax)
+; AVX2-FCP-NEXT: vmovaps %ymm12, 768(%rax)
; AVX2-FCP-NEXT: vmovaps %ymm6, 640(%rax)
-; AVX2-FCP-NEXT: vmovaps %ymm3, 544(%rax)
+; AVX2-FCP-NEXT: vmovaps %ymm2, 544(%rax)
; AVX2-FCP-NEXT: vmovaps %ymm7, 416(%rax)
; AVX2-FCP-NEXT: vmovaps %ymm5, 320(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll
index fba3aa50af23d..7e9739cbdbd39 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll
@@ -464,7 +464,7 @@ define void @store_i32_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm8[3,0],ymm4[3,0],ymm8[7,4],ymm4[7,4]
; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0],ymm6[2,3],ymm4[6,4],ymm6[6,7]
; AVX-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,0],xmm0[3,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,3],xmm0[3,3]
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7]
; AVX-NEXT: vmovaps %ymm0, 96(%rax)
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll
index 05c111ae5049f..8362ff7d777c4 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll
@@ -767,8 +767,7 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm1[0],mem[0],ymm1[2],mem[2]
; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm3[2,3],ymm12[4,5,6,7]
-; AVX-NEXT: vmovapd 48(%rsi), %xmm13
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm10 = xmm13[1],xmm10[1]
+; AVX-NEXT: vmovlpd {{.*#+}} xmm10 = mem[0],xmm10[1]
; AVX-NEXT: vbroadcastsd 56(%rcx), %ymm13
; AVX-NEXT: vblendpd {{.*#+}} ymm10 = ymm10[0,1],ymm13[2,3]
; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm10[0,1,2],ymm5[3]
@@ -1703,7 +1702,7 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3]
; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm8[0],mem[0],ymm8[2],mem[2]
-; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX-NEXT: vmovaps 32(%rdx), %xmm0
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
@@ -1712,7 +1711,7 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vblendpd {{.*#+}} ymm13 = ymm1[0,1,2],ymm0[3]
; AVX-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3]
; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],mem[0],ymm5[2],mem[2]
-; AVX-NEXT: vblendpd {{.*#+}} ymm15 = ymm0[0,1],ymm1[2,3]
+; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm0[0,1],ymm1[2,3]
; AVX-NEXT: vmovaps 64(%rdi), %ymm0
; AVX-NEXT: vbroadcastsd 72(%rsi), %ymm1
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
@@ -1735,24 +1734,23 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm9[0,1],ymm12[2,3,4,5,6,7]
; AVX-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm9[2,3],ymm6[4,5,6,7]
-; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7]
-; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1],ymm9[2,3],ymm15[4,5,6,7]
+; AVX-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5],ymm8[6,7]
+; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vperm2f128 {{.*#+}} ymm5 = mem[2,3],ymm5[2,3]
-; AVX-NEXT: vmovapd 48(%rdx), %xmm6
-; AVX-NEXT: vshufpd {{.*#+}} ymm5 = ymm6[0],ymm5[0],ymm6[2],ymm5[3]
-; AVX-NEXT: vmovapd 48(%rsi), %xmm8
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm8[1],xmm6[1]
-; AVX-NEXT: vbroadcastsd 56(%rcx), %ymm8
-; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3]
-; AVX-NEXT: vmovapd 32(%r8), %ymm8
-; AVX-NEXT: vblendpd {{.*#+}} ymm9 = ymm8[0],ymm13[1,2,3]
-; AVX-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendpd {{.*#+}} ymm9 = ymm15[0],ymm8[1],ymm15[2,3]
-; AVX-NEXT: vmovupd %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendpd {{.*#+}} ymm11 = ymm5[0,1],ymm8[2],ymm5[3]
-; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm6[0,1,2],ymm8[3]
+; AVX-NEXT: vmovapd 48(%rdx), %xmm8
+; AVX-NEXT: vshufpd {{.*#+}} ymm5 = ymm8[0],ymm5[0],ymm8[2],ymm5[3]
+; AVX-NEXT: vmovlpd {{.*#+}} xmm8 = mem[0],xmm8[1]
+; AVX-NEXT: vbroadcastsd 56(%rcx), %ymm9
+; AVX-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3]
+; AVX-NEXT: vmovapd 32(%r8), %ymm9
+; AVX-NEXT: vblendpd {{.*#+}} ymm11 = ymm9[0],ymm13[1,2,3]
+; AVX-NEXT: vmovupd %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0],ymm9[1],ymm6[2,3]
+; AVX-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendpd {{.*#+}} ymm11 = ymm5[0,1],ymm9[2],ymm5[3]
+; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm8[0,1,2],ymm9[3]
; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm7[0],mem[0]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
@@ -1766,8 +1764,7 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3],ymm2[2,3]
; AVX-NEXT: vmovapd 112(%rdx), %xmm2
; AVX-NEXT: vshufpd {{.*#+}} ymm9 = ymm2[0],ymm1[0],ymm2[2],ymm1[3]
-; AVX-NEXT: vmovapd 112(%rsi), %xmm1
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
+; AVX-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm2[1]
; AVX-NEXT: vbroadcastsd 120(%rcx), %ymm2
; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm1[0,1],ymm2[2,3]
; AVX-NEXT: vmovapd 96(%r8), %ymm0
@@ -3506,16 +3503,16 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-LABEL: store_i64_stride5_vf32:
; AVX: # %bb.0:
; AVX-NEXT: subq $1048, %rsp # imm = 0x418
-; AVX-NEXT: vmovaps 192(%rdi), %ymm9
-; AVX-NEXT: vmovapd 160(%rdi), %ymm7
-; AVX-NEXT: vmovapd 96(%rdi), %ymm5
+; AVX-NEXT: vmovaps 192(%rdi), %ymm10
+; AVX-NEXT: vmovapd 160(%rdi), %ymm9
+; AVX-NEXT: vmovapd 96(%rdi), %ymm14
; AVX-NEXT: vmovaps 128(%rcx), %ymm0
; AVX-NEXT: vmovaps (%rcx), %ymm1
; AVX-NEXT: vmovaps 64(%rcx), %ymm2
; AVX-NEXT: vpermilps {{.*#+}} xmm3 = mem[2,3,2,3]
-; AVX-NEXT: vmovaps 16(%rdx), %xmm6
-; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm6[1],ymm1[3],ymm6[3]
+; AVX-NEXT: vmovaps 16(%rdx), %xmm5
+; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm5[1],ymm1[3],ymm5[3]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7]
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3]
@@ -3524,7 +3521,7 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm3[1],ymm2[3],ymm3[3]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7]
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],mem[0],ymm5[2],mem[2]
+; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm14[0],mem[0],ymm14[2],mem[2]
; AVX-NEXT: vmovapd 96(%rcx), %xmm2
; AVX-NEXT: vmovapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3]
@@ -3536,13 +3533,13 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm7[0],mem[0],ymm7[2],mem[2]
+; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],mem[0],ymm9[2],mem[2]
; AVX-NEXT: vmovapd 160(%rcx), %xmm1
; AVX-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],mem[0],ymm9[2],mem[2]
+; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],mem[0],ymm10[2],mem[2]
; AVX-NEXT: vmovaps 192(%rcx), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
@@ -3567,32 +3564,32 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm8[0],mem[0],ymm8[2],mem[2]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 32(%rdi), %ymm2
+; AVX-NEXT: vmovapd 32(%rdi), %ymm3
; AVX-NEXT: vbroadcastsd 40(%rsi), %ymm0
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3]
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3]
; AVX-NEXT: vmovaps 32(%rdx), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3]
-; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2]
+; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm3[0],mem[0],ymm3[2],mem[2]
; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 64(%rdi), %ymm14
+; AVX-NEXT: vmovaps 64(%rdi), %ymm2
; AVX-NEXT: vbroadcastsd 72(%rsi), %ymm0
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovaps 64(%rdx), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
; AVX-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3]
-; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm14[0],mem[0],ymm14[2],mem[2]
+; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],mem[0],ymm2[2],mem[2]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vbroadcastsd 104(%rsi), %ymm0
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3]
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3]
; AVX-NEXT: vmovaps 96(%rdx), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
@@ -3601,212 +3598,208 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vmovaps 128(%rdi), %ymm1
; AVX-NEXT: vbroadcastsd 136(%rsi), %ymm0
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX-NEXT: vmovaps 128(%rdx), %xmm3
-; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
+; AVX-NEXT: vmovaps 128(%rdx), %xmm4
+; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3]
-; AVX-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm1[0],mem[0],ymm1[2],mem[2]
-; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm1[0],mem[0],ymm1[2],mem[2]
+; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm4[4,5,6,7]
; AVX-NEXT: vbroadcastsd 168(%rsi), %ymm0
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3]
-; AVX-NEXT: vmovaps 160(%rdx), %xmm4
-; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm3
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3]
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm9[0,1],ymm0[2,3]
+; AVX-NEXT: vmovaps 160(%rdx), %xmm6
+; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm4
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3]
; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vbroadcastsd 200(%rsi), %ymm0
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
-; AVX-NEXT: vmovaps 192(%rdx), %xmm3
-; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm6
-; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3,4,5],ymm6[6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT: vmovaps 192(%rdx), %xmm4
+; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm5
+; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3,4,5],ymm5[6,7]
; AVX-NEXT: vmovapd 224(%rdi), %ymm0
-; AVX-NEXT: vbroadcastsd 232(%rsi), %ymm6
-; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm0[0,1],ymm6[2,3]
-; AVX-NEXT: vmovaps 224(%rdx), %xmm11
-; AVX-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11
-; AVX-NEXT: vblendpd {{.*#+}} ymm11 = ymm6[0,1,2],ymm11[3]
-; AVX-NEXT: vpermilps {{.*#+}} xmm6 = mem[2,3,2,3]
+; AVX-NEXT: vbroadcastsd 232(%rsi), %ymm5
+; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm0[0,1],ymm5[2,3]
+; AVX-NEXT: vmovaps 224(%rdx), %xmm5
+; AVX-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm11
+; AVX-NEXT: vblendpd {{.*#+}} ymm11 = ymm7[0,1,2],ymm11[3]
+; AVX-NEXT: vpermilps {{.*#+}} xmm7 = mem[2,3,2,3]
; AVX-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm0[0],mem[0],ymm0[2],mem[2]
-; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm15[2,3]
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
-; AVX-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm13[0],mem[0]
+; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm15[2,3]
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
+; AVX-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm5[0],mem[0]
; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1,2,3],ymm8[4,5,6,7]
; AVX-NEXT: vmovaps (%r8), %ymm15
-; AVX-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload
-; AVX-NEXT: # ymm13 = mem[0,1,2,3,4,5],ymm15[6,7]
-; AVX-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload
-; AVX-NEXT: # ymm13 = ymm15[0,1],mem[2,3,4,5,6,7]
-; AVX-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload
-; AVX-NEXT: # ymm13 = mem[0,1],ymm15[2,3],mem[4,5,6,7]
-; AVX-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm15[4,5],ymm8[6,7]
-; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3],ymm2[2,3]
+; AVX-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload
+; AVX-NEXT: # ymm5 = mem[0,1,2,3,4,5],ymm15[6,7]
+; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload
+; AVX-NEXT: # ymm5 = ymm15[0,1],mem[2,3,4,5,6,7]
+; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload
+; AVX-NEXT: # ymm5 = mem[0,1],ymm15[2,3],mem[4,5,6,7]
+; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm15[4,5],ymm8[6,7]
+; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = mem[2,3],ymm3[2,3]
; AVX-NEXT: vmovapd 48(%rdx), %xmm8
-; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm8[0],ymm2[0],ymm8[2],ymm2[3]
-; AVX-NEXT: vmovapd 48(%rsi), %xmm15
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm15[1],xmm8[1]
+; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm8[0],ymm3[0],ymm8[2],ymm3[3]
+; AVX-NEXT: vmovlpd {{.*#+}} xmm8 = mem[0],xmm8[1]
; AVX-NEXT: vbroadcastsd 56(%rcx), %ymm15
; AVX-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],ymm15[2,3]
; AVX-NEXT: vmovapd 32(%r8), %ymm15
-; AVX-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload
-; AVX-NEXT: # ymm13 = ymm15[0],mem[1,2,3]
-; AVX-NEXT: vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload
-; AVX-NEXT: # ymm13 = mem[0],ymm15[1],mem[2,3]
-; AVX-NEXT: vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm15[2],ymm2[3]
-; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm8[0,1,2],ymm15[3]
-; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0]
-; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7]
-; AVX-NEXT: vmovaps 64(%r8), %ymm8
-; AVX-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm13 # 32-byte Folded Reload
-; AVX-NEXT: # ymm13 = mem[0,1,2,3,4,5],ymm8[6,7]
-; AVX-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps $252, (%rsp), %ymm8, %ymm13 # 32-byte Folded Reload
-; AVX-NEXT: # ymm13 = ymm8[0,1],mem[2,3,4,5,6,7]
-; AVX-NEXT: vmovups %ymm13, (%rsp) # 32-byte Spill
-; AVX-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm13 # 32-byte Folded Reload
-; AVX-NEXT: # ymm13 = mem[0,1],ymm8[2,3],mem[4,5,6,7]
-; AVX-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5],ymm2[6,7]
+; AVX-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload
+; AVX-NEXT: # ymm5 = ymm15[0],mem[1,2,3]
+; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload
+; AVX-NEXT: # ymm5 = mem[0],ymm15[1],mem[2,3]
+; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm15[2],ymm3[3]
+; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm8[0,1,2],ymm15[3]
+; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],mem[0]
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
+; AVX-NEXT: vmovaps 64(%r8), %ymm3
+; AVX-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload
+; AVX-NEXT: # ymm5 = mem[0,1,2,3,4,5],ymm3[6,7]
+; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendps $252, (%rsp), %ymm3, %ymm5 # 32-byte Folded Reload
+; AVX-NEXT: # ymm5 = ymm3[0,1],mem[2,3,4,5,6,7]
+; AVX-NEXT: vmovups %ymm5, (%rsp) # 32-byte Spill
+; AVX-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload
+; AVX-NEXT: # ymm5 = mem[0,1],ymm3[2,3],mem[4,5,6,7]
+; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7]
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3],ymm5[2,3]
-; AVX-NEXT: vmovapd 112(%rdx), %xmm5
-; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm5[0],ymm2[0],ymm5[2],ymm2[3]
-; AVX-NEXT: vmovapd 112(%rsi), %xmm8
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm8[1],xmm5[1]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3],ymm14[2,3]
+; AVX-NEXT: vmovapd 112(%rdx), %xmm3
+; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[2],ymm2[3]
+; AVX-NEXT: vmovlpd {{.*#+}} xmm3 = mem[0],xmm3[1]
; AVX-NEXT: vbroadcastsd 120(%rcx), %ymm8
-; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm8[2,3]
+; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm8[2,3]
; AVX-NEXT: vmovapd 96(%r8), %ymm8
-; AVX-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm13 # 32-byte Folded Reload
-; AVX-NEXT: # ymm13 = mem[0],ymm8[1],mem[2,3]
-; AVX-NEXT: vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm13 # 32-byte Folded Reload
-; AVX-NEXT: # ymm13 = ymm8[0],mem[1,2,3]
-; AVX-NEXT: vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload
+; AVX-NEXT: # ymm5 = mem[0],ymm8[1],mem[2,3]
+; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm5 # 32-byte Folded Reload
+; AVX-NEXT: # ymm5 = ymm8[0],mem[1,2,3]
+; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm8[2],ymm2[3]
; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm5[0,1,2],ymm8[3]
+; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1,2],ymm8[3]
; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],mem[0]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX-NEXT: vmovaps 128(%r8), %ymm2
-; AVX-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload
-; AVX-NEXT: # ymm5 = mem[0,1,2,3,4,5],ymm2[6,7]
-; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload
-; AVX-NEXT: # ymm5 = ymm2[0,1],mem[2,3,4,5,6,7]
-; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1],ymm2[2,3],ymm10[4,5,6,7]
-; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload
+; AVX-NEXT: # ymm3 = mem[0,1,2,3,4,5],ymm2[6,7]
+; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload
+; AVX-NEXT: # ymm3 = ymm2[0,1],mem[2,3,4,5,6,7]
+; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm13[0,1],ymm2[2,3],ymm13[4,5,6,7]
+; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3],ymm7[2,3]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3],ymm9[2,3]
; AVX-NEXT: vmovapd 176(%rdx), %xmm2
; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[3]
-; AVX-NEXT: vmovapd 176(%rsi), %xmm5
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm5[1],xmm2[1]
-; AVX-NEXT: vbroadcastsd 184(%rcx), %ymm5
-; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm5[2,3]
-; AVX-NEXT: vmovapd 160(%r8), %ymm5
-; AVX-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload
-; AVX-NEXT: # ymm7 = mem[0],ymm5[1],mem[2,3]
-; AVX-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload
-; AVX-NEXT: # ymm7 = ymm5[0],mem[1,2,3]
-; AVX-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3]
+; AVX-NEXT: vmovlpd {{.*#+}} xmm2 = mem[0],xmm2[1]
+; AVX-NEXT: vbroadcastsd 184(%rcx), %ymm3
+; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3]
+; AVX-NEXT: vmovapd 160(%r8), %ymm8
+; AVX-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm3 # 32-byte Folded Reload
+; AVX-NEXT: # ymm3 = mem[0],ymm8[1],mem[2,3]
+; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm3 # 32-byte Folded Reload
+; AVX-NEXT: # ymm3 = ymm8[0],mem[1,2,3]
+; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm8[2],ymm1[3]
; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1,2],ymm5[3]
+; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1,2],ymm8[3]
; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0]
-; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7]
; AVX-NEXT: vmovaps 192(%r8), %ymm2
-; AVX-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload
-; AVX-NEXT: # ymm5 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
-; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload
-; AVX-NEXT: # ymm5 = mem[0,1,2,3,4,5],ymm2[6,7]
-; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1],ymm12[2,3,4,5,6,7]
-; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload
+; AVX-NEXT: # ymm3 = mem[0,1],ymm2[2,3],mem[4,5,6,7]
+; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload
+; AVX-NEXT: # ymm3 = mem[0,1,2,3,4,5],ymm2[6,7]
+; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1],ymm12[2,3,4,5,6,7]
+; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3]
; AVX-NEXT: vmovapd 240(%rdx), %xmm1
; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3]
-; AVX-NEXT: vmovapd 240(%rsi), %xmm2
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1]
+; AVX-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1]
; AVX-NEXT: vbroadcastsd 248(%rcx), %ymm2
-; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3]
-; AVX-NEXT: vmovapd 224(%r8), %ymm5
-; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm5[0],ymm11[1,2,3]
-; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm6[0],ymm5[1],ymm6[2,3]
-; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3]
+; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm1[0,1],ymm2[2,3]
+; AVX-NEXT: vmovapd 224(%r8), %ymm15
+; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm15[0],ymm11[1,2,3]
+; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm7[0],ymm15[1],ymm7[2,3]
+; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3]
; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm5[3]
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1,2],ymm15[3]
; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload
-; AVX-NEXT: # xmm7 = xmm0[0],mem[0]
-; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm5 # 16-byte Folded Reload
-; AVX-NEXT: # xmm5 = xmm4[0],mem[0]
-; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm2 # 16-byte Folded Reload
-; AVX-NEXT: # xmm2 = xmm3[0],mem[0]
-; AVX-NEXT: vmovaps 128(%rdi), %xmm1
-; AVX-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm1[0],mem[0]
+; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload
+; AVX-NEXT: # xmm5 = xmm0[0],mem[0]
+; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
+; AVX-NEXT: # xmm6 = xmm6[0],mem[0]
+; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm2 # 16-byte Folded Reload
+; AVX-NEXT: # xmm2 = xmm4[0],mem[0]
+; AVX-NEXT: vmovaps 128(%rdi), %xmm4
+; AVX-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm4[0],mem[0]
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT: vunpcklpd {{.*#+}} xmm4 = xmm0[0],mem[0]
-; AVX-NEXT: vmovaps 64(%rdi), %xmm14
-; AVX-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0]
+; AVX-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm0[0],mem[0]
+; AVX-NEXT: vmovaps 64(%rdi), %xmm12
+; AVX-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0]
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm0[0],mem[0]
-; AVX-NEXT: vmovaps 32(%rdi), %xmm12
-; AVX-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0]
+; AVX-NEXT: vmovaps 32(%rdi), %xmm8
+; AVX-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0]
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0]
-; AVX-NEXT: vmovaps 96(%rdi), %xmm15
-; AVX-NEXT: vunpcklpd {{.*#+}} xmm15 = xmm15[0],mem[0]
-; AVX-NEXT: vmovaps 160(%rdi), %xmm13
+; AVX-NEXT: vmovaps 96(%rdi), %xmm13
; AVX-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0]
+; AVX-NEXT: vmovaps 160(%rdi), %xmm10
+; AVX-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0]
; AVX-NEXT: vmovaps 224(%rdi), %xmm9
; AVX-NEXT: vunpcklpd {{.*#+}} xmm9 = xmm9[0],mem[0]
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX-NEXT: vmovaps 192(%rdi), %xmm11
+; AVX-NEXT: vmovaps 192(%rdi), %xmm14
+; AVX-NEXT: vunpcklpd {{.*#+}} xmm14 = xmm14[0],mem[0]
+; AVX-NEXT: vmovaps (%rdi), %xmm11
; AVX-NEXT: vunpcklpd {{.*#+}} xmm11 = xmm11[0],mem[0]
-; AVX-NEXT: vmovaps (%rdi), %xmm10
-; AVX-NEXT: vunpcklpd {{.*#+}} xmm10 = xmm10[0],mem[0]
-; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX-NEXT: vunpcklpd {{.*#+}} xmm8 = xmm8[0],mem[0]
-; AVX-NEXT: vmovaps %xmm8, 16(%r9)
-; AVX-NEXT: vmovaps %xmm10, (%r9)
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0]
+; AVX-NEXT: vmovaps %xmm7, 16(%r9)
+; AVX-NEXT: vmovaps %xmm11, (%r9)
; AVX-NEXT: vmovaps %xmm2, 976(%r9)
-; AVX-NEXT: vmovaps %xmm11, 960(%r9)
+; AVX-NEXT: vmovaps %xmm14, 960(%r9)
; AVX-NEXT: vmovaps %xmm0, 1136(%r9)
; AVX-NEXT: vmovaps %xmm9, 1120(%r9)
-; AVX-NEXT: vmovaps %xmm5, 816(%r9)
-; AVX-NEXT: vmovaps %xmm13, 800(%r9)
-; AVX-NEXT: vmovaps %xmm7, 496(%r9)
-; AVX-NEXT: vmovaps %xmm15, 480(%r9)
+; AVX-NEXT: vmovaps %xmm6, 816(%r9)
+; AVX-NEXT: vmovaps %xmm10, 800(%r9)
+; AVX-NEXT: vmovaps %xmm5, 496(%r9)
+; AVX-NEXT: vmovaps %xmm13, 480(%r9)
; AVX-NEXT: vmovaps %xmm1, 176(%r9)
-; AVX-NEXT: vmovaps %xmm12, 160(%r9)
+; AVX-NEXT: vmovaps %xmm8, 160(%r9)
; AVX-NEXT: vmovaps %xmm3, 336(%r9)
-; AVX-NEXT: vmovaps %xmm14, 320(%r9)
-; AVX-NEXT: vmovaps %xmm4, 656(%r9)
-; AVX-NEXT: vmovaps %xmm6, 640(%r9)
+; AVX-NEXT: vmovaps %xmm12, 320(%r9)
+; AVX-NEXT: vmovaps %xmm15, 656(%r9)
+; AVX-NEXT: vmovaps %xmm4, 640(%r9)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm0, 1216(%r9)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -7394,16 +7387,16 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-LABEL: store_i64_stride5_vf64:
; AVX: # %bb.0:
; AVX-NEXT: subq $2264, %rsp # imm = 0x8D8
-; AVX-NEXT: vmovaps 192(%rdi), %ymm14
-; AVX-NEXT: vmovaps 160(%rdi), %ymm4
-; AVX-NEXT: vmovaps 96(%rdi), %ymm5
+; AVX-NEXT: vmovaps 192(%rdi), %ymm4
+; AVX-NEXT: vmovaps 160(%rdi), %ymm5
+; AVX-NEXT: vmovaps 96(%rdi), %ymm11
; AVX-NEXT: vmovaps 64(%rcx), %ymm1
; AVX-NEXT: vmovaps 128(%rcx), %ymm0
; AVX-NEXT: vmovaps (%rcx), %ymm2
; AVX-NEXT: vpermilps {{.*#+}} xmm3 = mem[2,3,2,3]
-; AVX-NEXT: vmovaps 16(%rdx), %xmm6
-; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm6[1],ymm2[3],ymm6[3]
+; AVX-NEXT: vmovaps 16(%rdx), %xmm7
+; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm7[1],ymm2[3],ymm7[3]
; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7]
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vpermilps {{.*#+}} xmm2 = mem[2,3,2,3]
@@ -7412,10 +7405,10 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm3[1],ymm1[3],ymm3[3]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7]
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],mem[0],ymm5[2],mem[2]
-; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm11[0],mem[0],ymm11[2],mem[2]
+; AVX-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 96(%rcx), %xmm2
-; AVX-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill
+; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -7425,16 +7418,16 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],mem[0],ymm4[2],mem[2]
-; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm5[0],mem[0],ymm5[2],mem[2]
+; AVX-NEXT: vmovaps %ymm5, %ymm3
+; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 160(%rcx), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm14[0],mem[0],ymm14[2],mem[2]
-; AVX-NEXT: vmovaps %ymm14, %ymm2
-; AVX-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],mem[0],ymm4[2],mem[2]
+; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 192(%rcx), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
@@ -7454,16 +7447,17 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 288(%rdi), %ymm14
-; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm14[0],mem[0],ymm14[2],mem[2]
-; AVX-NEXT: vmovapd 288(%rcx), %xmm1
-; AVX-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
-; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 320(%rdi), %ymm10
+; AVX-NEXT: vmovaps 288(%rdi), %ymm10
; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm10[0],mem[0],ymm10[2],mem[2]
; AVX-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovaps 288(%rcx), %xmm1
+; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovaps 320(%rdi), %ymm2
+; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm2[0],mem[0],ymm2[2],mem[2]
+; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 320(%rcx), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
@@ -7498,8 +7492,8 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 480(%rdi), %ymm13
-; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm13[0],mem[0],ymm13[2],mem[2]
+; AVX-NEXT: vmovapd 480(%rdi), %ymm15
+; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm15[0],mem[0],ymm15[2],mem[2]
; AVX-NEXT: vmovapd 480(%rcx), %xmm1
; AVX-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
@@ -7517,32 +7511,32 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm8[0],mem[0],ymm8[2],mem[2]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 32(%rdi), %ymm6
+; AVX-NEXT: vmovapd 32(%rdi), %ymm7
; AVX-NEXT: vbroadcastsd 40(%rsi), %ymm0
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3]
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3]
; AVX-NEXT: vmovaps 32(%rdx), %xmm1
-; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3]
-; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],mem[0],ymm6[2],mem[2]
+; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm7[0],mem[0],ymm7[2],mem[2]
; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 64(%rdi), %ymm7
+; AVX-NEXT: vmovaps 64(%rdi), %ymm6
; AVX-NEXT: vbroadcastsd 72(%rsi), %ymm0
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovaps 64(%rdx), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3]
-; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm7[0],mem[0],ymm7[2],mem[2]
+; AVX-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm6[0],mem[0],ymm6[2],mem[2]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vbroadcastsd 104(%rsi), %ymm0
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovaps 96(%rdx), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
@@ -7561,14 +7555,14 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vbroadcastsd 168(%rsi), %ymm0
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovaps 160(%rdx), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vbroadcastsd 200(%rsi), %ymm0
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovaps 192(%rdx), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
@@ -7599,14 +7593,14 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vbroadcastsd 296(%rsi), %ymm0
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovaps 288(%rdx), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3]
-; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
+; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vbroadcastsd 328(%rsi), %ymm0
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovaps 320(%rdx), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
@@ -7626,76 +7620,74 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vbroadcastsd 392(%rsi), %ymm0
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
-; AVX-NEXT: vmovaps 384(%rdx), %xmm15
-; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm1
+; AVX-NEXT: vmovaps 384(%rdx), %xmm14
+; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm1
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovapd 416(%rdi), %ymm0
-; AVX-NEXT: vbroadcastsd 424(%rsi), %ymm1
-; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3]
+; AVX-NEXT: vmovapd 416(%rdi), %ymm1
+; AVX-NEXT: vbroadcastsd 424(%rsi), %ymm0
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; AVX-NEXT: vmovaps 416(%rdx), %xmm9
; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9
-; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm9[3]
-; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3]
-; AVX-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm0[0],mem[0],ymm0[2],mem[2]
-; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm9[2,3]
-; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovaps 448(%rdi), %ymm1
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm9[3]
+; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = mem[2,3,2,3]
+; AVX-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm1[0],mem[0],ymm1[2],mem[2]
+; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm9[2,3]
+; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovaps 448(%rdi), %ymm0
; AVX-NEXT: vbroadcastsd 456(%rsi), %ymm9
-; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm9[4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm9[4,5,6,7]
; AVX-NEXT: vmovaps 448(%rdx), %xmm10
; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10
-; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7]
-; AVX-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vpermilps {{.*#+}} xmm9 = mem[2,3,2,3]
-; AVX-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm1[0],mem[0],ymm1[2],mem[2]
-; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm9[0,1,2,3],ymm10[4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm9[0,1,2,3,4,5],ymm10[6,7]
+; AVX-NEXT: vpermilps {{.*#+}} xmm10 = mem[2,3,2,3]
+; AVX-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm0[0],mem[0],ymm0[2],mem[2]
+; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm10[0,1,2,3],ymm11[4,5,6,7]
; AVX-NEXT: vbroadcastsd 488(%rsi), %ymm10
-; AVX-NEXT: vblendpd {{.*#+}} ymm12 = ymm13[0,1],ymm10[2,3]
-; AVX-NEXT: vmovapd %ymm13, %ymm9
+; AVX-NEXT: vblendpd {{.*#+}} ymm13 = ymm15[0,1],ymm10[2,3]
+; AVX-NEXT: vmovapd %ymm15, %ymm9
; AVX-NEXT: vmovaps 480(%rdx), %xmm10
-; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm13
-; AVX-NEXT: vblendpd {{.*#+}} ymm12 = ymm12[0,1,2],ymm13[3]
-; AVX-NEXT: vmovupd %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
-; AVX-NEXT: vunpcklpd {{.*#+}} xmm12 = xmm12[0],mem[0]
-; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7]
-; AVX-NEXT: vmovaps (%r8), %ymm12
-; AVX-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm13 # 32-byte Folded Reload
-; AVX-NEXT: # ymm13 = mem[0,1,2,3,4,5],ymm12[6,7]
-; AVX-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm13 # 32-byte Folded Reload
-; AVX-NEXT: # ymm13 = ymm12[0,1],mem[2,3,4,5,6,7]
-; AVX-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm13 # 32-byte Folded Reload
-; AVX-NEXT: # ymm13 = mem[0,1],ymm12[2,3],mem[4,5,6,7]
-; AVX-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm12[4,5],ymm8[6,7]
+; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm15
+; AVX-NEXT: vblendpd {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3]
+; AVX-NEXT: vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
+; AVX-NEXT: vunpcklpd {{.*#+}} xmm13 = xmm13[0],mem[0]
+; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
+; AVX-NEXT: vmovaps (%r8), %ymm13
+; AVX-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm15 # 32-byte Folded Reload
+; AVX-NEXT: # ymm15 = mem[0,1,2,3,4,5],ymm13[6,7]
+; AVX-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm15 # 32-byte Folded Reload
+; AVX-NEXT: # ymm15 = ymm13[0,1],mem[2,3,4,5,6,7]
+; AVX-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm15 # 32-byte Folded Reload
+; AVX-NEXT: # ymm15 = mem[0,1],ymm13[2,3],mem[4,5,6,7]
+; AVX-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm13[4,5],ymm8[6,7]
; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm6 = mem[2,3],ymm6[2,3]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm7 = mem[2,3],ymm7[2,3]
; AVX-NEXT: vmovapd 48(%rdx), %xmm8
-; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm8[0],ymm6[0],ymm8[2],ymm6[3]
-; AVX-NEXT: vmovapd 48(%rsi), %xmm12
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm12[1],xmm8[1]
-; AVX-NEXT: vbroadcastsd 56(%rcx), %ymm12
-; AVX-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3]
-; AVX-NEXT: vmovapd 32(%r8), %ymm12
-; AVX-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm13 # 32-byte Folded Reload
-; AVX-NEXT: # ymm13 = ymm12[0],mem[1,2,3]
-; AVX-NEXT: vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm13 # 32-byte Folded Reload
-; AVX-NEXT: # ymm13 = mem[0],ymm12[1],mem[2,3]
-; AVX-NEXT: vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm12[2],ymm6[3]
-; AVX-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm8[0,1,2],ymm12[3]
-; AVX-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX-NEXT: vunpcklpd {{.*#+}} xmm6 = xmm6[0],mem[0]
-; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7]
+; AVX-NEXT: vshufpd {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[2],ymm7[3]
+; AVX-NEXT: vmovlpd {{.*#+}} xmm8 = mem[0],xmm8[1]
+; AVX-NEXT: vbroadcastsd 56(%rcx), %ymm13
+; AVX-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],ymm13[2,3]
+; AVX-NEXT: vmovapd 32(%r8), %ymm13
+; AVX-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm15 # 32-byte Folded Reload
+; AVX-NEXT: # ymm15 = ymm13[0],mem[1,2,3]
+; AVX-NEXT: vmovupd %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm15 # 32-byte Folded Reload
+; AVX-NEXT: # ymm15 = mem[0],ymm13[1],mem[2,3]
+; AVX-NEXT: vmovupd %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm13[2],ymm7[3]
+; AVX-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm8[0,1,2],ymm13[3]
+; AVX-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
+; AVX-NEXT: vunpcklpd {{.*#+}} xmm7 = xmm7[0],mem[0]
+; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
; AVX-NEXT: vmovaps 64(%r8), %ymm7
; AVX-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm8 # 32-byte Folded Reload
; AVX-NEXT: # ymm8 = mem[0,1,2,3,4,5],ymm7[6,7]
@@ -7712,17 +7704,16 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vperm2f128 {{.*#+}} ymm6 = mem[2,3],ymm6[2,3]
; AVX-NEXT: vmovapd 112(%rdx), %xmm7
; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[2],ymm6[3]
-; AVX-NEXT: vmovapd 112(%rsi), %xmm8
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm8[1],xmm7[1]
+; AVX-NEXT: vmovlpd {{.*#+}} xmm7 = mem[0],xmm7[1]
; AVX-NEXT: vbroadcastsd 120(%rcx), %ymm8
; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3]
; AVX-NEXT: vmovapd 96(%r8), %ymm8
-; AVX-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm12 # 32-byte Folded Reload
-; AVX-NEXT: # ymm12 = mem[0],ymm8[1],mem[2,3]
-; AVX-NEXT: vmovupd %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm12 # 32-byte Folded Reload
-; AVX-NEXT: # ymm12 = ymm8[0],mem[1,2,3]
-; AVX-NEXT: vmovupd %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm13 # 32-byte Folded Reload
+; AVX-NEXT: # ymm13 = mem[0],ymm8[1],mem[2,3]
+; AVX-NEXT: vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendpd $14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm13 # 32-byte Folded Reload
+; AVX-NEXT: # ymm13 = ymm8[0],mem[1,2,3]
+; AVX-NEXT: vmovupd %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm8[2],ymm6[3]
; AVX-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm7[0,1,2],ymm8[3]
@@ -7746,8 +7737,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vperm2f128 {{.*#+}} ymm5 = mem[2,3],ymm5[2,3]
; AVX-NEXT: vmovapd 176(%rdx), %xmm6
; AVX-NEXT: vshufpd {{.*#+}} ymm5 = ymm6[0],ymm5[0],ymm6[2],ymm5[3]
-; AVX-NEXT: vmovapd 176(%rsi), %xmm7
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm7[1],xmm6[1]
+; AVX-NEXT: vmovlpd {{.*#+}} xmm6 = mem[0],xmm6[1]
; AVX-NEXT: vbroadcastsd 184(%rcx), %ymm7
; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3]
; AVX-NEXT: vmovapd 160(%r8), %ymm7
@@ -7780,8 +7770,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = mem[2,3],ymm3[2,3]
; AVX-NEXT: vmovapd 240(%rdx), %xmm5
; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm5[0],ymm3[0],ymm5[2],ymm3[3]
-; AVX-NEXT: vmovapd 240(%rsi), %xmm6
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm6[1],xmm5[1]
+; AVX-NEXT: vmovlpd {{.*#+}} xmm5 = mem[0],xmm5[1]
; AVX-NEXT: vbroadcastsd 248(%rcx), %ymm6
; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3]
; AVX-NEXT: vmovapd 224(%r8), %ymm6
@@ -7810,11 +7799,11 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7]
; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = mem[2,3],ymm14[2,3]
+; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = mem[2,3],ymm3[2,3]
; AVX-NEXT: vmovapd 304(%rdx), %xmm4
; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[2],ymm3[3]
-; AVX-NEXT: vmovapd 304(%rsi), %xmm5
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm5[1],xmm4[1]
+; AVX-NEXT: vmovlpd {{.*#+}} xmm4 = mem[0],xmm4[1]
; AVX-NEXT: vbroadcastsd 312(%rcx), %ymm5
; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3]
; AVX-NEXT: vmovapd 288(%r8), %ymm5
@@ -7847,8 +7836,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3],ymm2[2,3]
; AVX-NEXT: vmovapd 368(%rdx), %xmm3
; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[2],ymm2[3]
-; AVX-NEXT: vmovapd 368(%rsi), %xmm4
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1]
+; AVX-NEXT: vmovlpd {{.*#+}} xmm3 = mem[0],xmm3[1]
; AVX-NEXT: vbroadcastsd 376(%rcx), %ymm4
; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3]
; AVX-NEXT: vmovapd 352(%r8), %ymm4
@@ -7878,11 +7866,10 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7]
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3]
+; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3],ymm1[2,3]
; AVX-NEXT: vmovapd 432(%rdx), %xmm2
-; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[2],ymm0[3]
-; AVX-NEXT: vmovapd 432(%rsi), %xmm3
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm3[1],xmm2[1]
+; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[3]
+; AVX-NEXT: vmovlpd {{.*#+}} xmm2 = mem[0],xmm2[1]
; AVX-NEXT: vbroadcastsd 440(%rcx), %ymm3
; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3]
; AVX-NEXT: vmovapd 416(%r8), %ymm3
@@ -7892,19 +7879,18 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vblendpd $13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload
; AVX-NEXT: # ymm4 = mem[0],ymm3[1],mem[2,3]
; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3]
-; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1,2],ymm3[3]
-; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3]
+; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1,2],ymm3[3]
+; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0]
+; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovaps 448(%r8), %ymm1
; AVX-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
; AVX-NEXT: # ymm2 = mem[0,1,2,3,4,5],ymm1[6,7]
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vblendps $252, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
-; AVX-NEXT: # ymm2 = ymm1[0,1],mem[2,3,4,5,6,7]
+; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1],ymm12[2,3,4,5,6,7]
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1],ymm1[2,3],ymm11[4,5,6,7]
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -7913,8 +7899,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm9[2,3]
; AVX-NEXT: vmovapd 496(%rdx), %xmm1
; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[2],ymm0[3]
-; AVX-NEXT: vmovapd 496(%rsi), %xmm2
-; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1]
+; AVX-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1]
; AVX-NEXT: vbroadcastsd 504(%rcx), %ymm2
; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3]
; AVX-NEXT: vmovapd 480(%r8), %ymm2
@@ -7929,9 +7914,9 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm2[3]
; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
-; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
@@ -7951,8 +7936,8 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm10[0],mem[0]
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm7 # 16-byte Folded Reload
-; AVX-NEXT: # xmm7 = xmm15[0],mem[0]
+; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm7 # 16-byte Folded Reload
+; AVX-NEXT: # xmm7 = xmm14[0],mem[0]
; AVX-NEXT: vmovaps 256(%rdi), %xmm0
; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -7974,9 +7959,9 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vmovaps 32(%rdi), %xmm0
; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovaps 96(%rdi), %xmm0
; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0]
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -8041,11 +8026,11 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX-NEXT: vmovaps %xmm0, 816(%r9)
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovaps %xmm0, 800(%r9)
-; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovaps %xmm0, 496(%r9)
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovaps %xmm0, 480(%r9)
-; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovaps %xmm0, 176(%r9)
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovaps %xmm0, 160(%r9)
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
index d25f8cf6b0bca..e2df674111de9 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
@@ -1139,7 +1139,7 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX-NEXT: vpor %xmm6, %xmm7, %xmm6
; AVX-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255]
; AVX-NEXT: vpblendvb %xmm7, %xmm5, %xmm6, %xmm5
-; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1],zero,xmm5[3,4,5,6],zero,xmm5[8,9,10,11],zero,xmm5[13,14,15]
+; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm5
; AVX-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm0[6],zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,xmm0[8],zero,zero,zero
; AVX-NEXT: vpor %xmm6, %xmm5, %xmm5
; AVX-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15]
@@ -4839,162 +4839,161 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
;
; AVX512-FCP-LABEL: store_i8_stride5_vf64:
; AVX512-FCP: # %bb.0:
-; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm8
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128,128]
-; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm0
-; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm3
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [12,13,128,15,12,13,14,128,12,13,14,15,128,u,u,u,16,128,18,19,16,17,128,19,16,17,18,128,16,17,18,19]
-; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm3, %ymm2
-; AVX512-FCP-NEXT: vporq %ymm0, %ymm2, %ymm20
-; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm1
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[8],zero,xmm1[u,7],zero,xmm1[9],zero,xmm1[u],zero,xmm1[u,10],zero,xmm1[12],zero,xmm1[u,11]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm28
-; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm1
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm1[8,u],zero,xmm1[7],zero,xmm1[9,u,11,u],zero,xmm1[10],zero,xmm1[12,u],zero
-; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm29
-; AVX512-FCP-NEXT: vporq %xmm0, %xmm2, %xmm21
+; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm11
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128,128]
+; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm11, %ymm2
+; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [12,13,128,15,12,13,14,128,12,13,14,15,128,u,u,u,16,128,18,19,16,17,128,19,16,17,18,128,16,17,18,19]
+; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm3
+; AVX512-FCP-NEXT: vporq %ymm2, %ymm3, %ymm21
+; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm0
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[8],zero,xmm0[u,7],zero,xmm0[9],zero,xmm0[u],zero,xmm0[u,10],zero,xmm0[12],zero,xmm0[u,11]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm29
+; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm0
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm0[8,u],zero,xmm0[7],zero,xmm0[9,u,11,u],zero,xmm0[10],zero,xmm0[12,u],zero
+; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm30
+; AVX512-FCP-NEXT: vporq %xmm2, %xmm3, %xmm22
; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %ymm2
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128]
-; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm6
-; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm5
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,12,13,128,u,u,u,14,128,u,u,14,15,128,u,u,u,16,128,u,u,16,17,128,u,u,u,18,128,u,u]
-; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm7
-; AVX512-FCP-NEXT: vporq %ymm6, %ymm7, %ymm22
-; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %xmm1
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm1[6],zero,xmm1[8,u],zero,xmm1[7],zero,xmm1[9],zero,xmm1[11,u],zero,xmm1[10],zero,xmm1[12]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm30
-; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm1
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[6],zero,xmm1[8],zero,xmm1[u,7],zero,xmm1[9],zero,xmm1[11],zero,xmm1[u,10],zero,xmm1[12],zero
-; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm31
-; AVX512-FCP-NEXT: vporq %xmm6, %xmm7, %xmm23
-; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm6
-; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm6, %ymm7
-; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm1
-; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm4
-; AVX512-FCP-NEXT: vporq %ymm7, %ymm4, %ymm24
-; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128]
-; AVX512-FCP-NEXT: # ymm14 = mem[0,1,0,1]
-; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm6, %ymm4
-; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25]
-; AVX512-FCP-NEXT: # ymm12 = mem[0,1,0,1]
-; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm1, %ymm9
-; AVX512-FCP-NEXT: vporq %ymm4, %ymm9, %ymm25
-; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm9
-; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm4
-; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm10
-; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm10, %ymm13
-; AVX512-FCP-NEXT: vporq %ymm4, %ymm13, %ymm26
-; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128]
-; AVX512-FCP-NEXT: # ymm11 = mem[0,1,0,1]
-; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm7
-; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [19,128,21,0,128,20,128,22,128,24,0,128,23,128,25,0,19,128,21,0,128,20,128,22,128,24,0,128,23,128,25,0]
-; AVX512-FCP-NEXT: # ymm4 = mem[0,1,0,1]
-; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm13
-; AVX512-FCP-NEXT: vporq %ymm7, %ymm13, %ymm27
-; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128]
-; AVX512-FCP-NEXT: # ymm13 = mem[0,1,0,1]
-; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm3, %ymm0
-; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30,27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30]
-; AVX512-FCP-NEXT: # ymm7 = mem[0,1,0,1]
-; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm8, %ymm15
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128]
+; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm4
+; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm6
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,12,13,128,u,u,u,14,128,u,u,14,15,128,u,u,u,16,128,u,u,16,17,128,u,u,u,18,128,u,u]
+; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm6, %ymm7
+; AVX512-FCP-NEXT: vporq %ymm4, %ymm7, %ymm23
+; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %xmm14
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,xmm14[6],zero,xmm14[8,u],zero,xmm14[7],zero,xmm14[9],zero,xmm14[11,u],zero,xmm14[10],zero,xmm14[12]
+; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm13
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128]
+; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm13, %xmm7
+; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm31
+; AVX512-FCP-NEXT: vporq %xmm4, %xmm7, %xmm24
+; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm4
+; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm4, %ymm3
+; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm8
+; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm8, %ymm5
+; AVX512-FCP-NEXT: vporq %ymm3, %ymm5, %ymm25
+; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128]
+; AVX512-FCP-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm7
+; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25]
+; AVX512-FCP-NEXT: # ymm3 = mem[0,1,0,1]
+; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm8, %ymm10
+; AVX512-FCP-NEXT: vporq %ymm7, %ymm10, %ymm26
+; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm7
+; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm7, %ymm10
+; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm12
+; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm12, %ymm9
+; AVX512-FCP-NEXT: vporq %ymm10, %ymm9, %ymm27
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm12[21],zero,zero,ymm12[20],zero,ymm12[22],zero,ymm12[24],zero,zero,ymm12[23],zero,ymm12[25],zero,zero
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm7[21,u],zero,ymm7[20],zero,ymm7[22],zero,ymm7[24,u],zero,ymm7[23],zero,ymm7[25,u]
+; AVX512-FCP-NEXT: vporq %ymm9, %ymm10, %ymm28
+; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128]
+; AVX512-FCP-NEXT: # ymm10 = mem[0,1,0,1]
+; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm6, %ymm0
+; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [25,128,27,0,128,26,128,28,128,30,0,128,29,128,31,0,25,128,27,0,128,26,128,28,128,30,0,128,29,128,31,0]
+; AVX512-FCP-NEXT: # ymm9 = mem[0,1,0,1]
+; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm15
; AVX512-FCP-NEXT: vporq %ymm0, %ymm15, %ymm16
-; AVX512-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm0
-; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm8, %ymm3
-; AVX512-FCP-NEXT: vporq %ymm0, %ymm3, %ymm17
-; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128]
-; AVX512-FCP-NEXT: # ymm0 = mem[0,1,0,1]
-; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm3
-; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [25,128,27,0,128,26,128,28,128,30,0,128,29,128,31,0,25,128,27,0,128,26,128,28,128,30,0,128,29,128,31,0]
-; AVX512-FCP-NEXT: # ymm4 = mem[0,1,0,1]
-; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm8
-; AVX512-FCP-NEXT: vporq %ymm3, %ymm8, %ymm19
-; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2
-; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm5, %ymm3
-; AVX512-FCP-NEXT: vporq %ymm2, %ymm3, %ymm18
-; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm0
-; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm1
+; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm0
+; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm6, %ymm2
+; AVX512-FCP-NEXT: vporq %ymm0, %ymm2, %ymm17
+; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128]
+; AVX512-FCP-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm0
+; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30,27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30]
+; AVX512-FCP-NEXT: # ymm3 = mem[0,1,0,1]
+; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm11, %ymm5
+; AVX512-FCP-NEXT: vporq %ymm0, %ymm5, %ymm19
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm1[5],zero,zero,ymm1[4],zero,ymm1[6],zero,ymm1[8],zero,zero,ymm1[7],zero,ymm1[9],zero,zero,zero,ymm1[21],zero,zero,ymm1[20],zero,ymm1[22],zero,ymm1[24],zero,zero,ymm1[23],zero,ymm1[25],zero,zero
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm11[3],zero,ymm11[5,u],zero,ymm11[4],zero,ymm11[6],zero,ymm11[8,u],zero,ymm11[7],zero,ymm11[9,u,19],zero,ymm11[21,u],zero,ymm11[20],zero,ymm11[22],zero,ymm11[24,u],zero,ymm11[23],zero,ymm11[25,u]
+; AVX512-FCP-NEXT: vporq %ymm0, %ymm1, %ymm18
+; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm0
+; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm4, %ymm1
+; AVX512-FCP-NEXT: vporq %ymm0, %ymm1, %ymm20
+; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm12, %ymm0
+; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm7, %ymm1
; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0
-; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm10, %ymm1
-; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm9, %ymm3
; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm2
-; AVX512-FCP-NEXT: vpor %ymm1, %ymm3, %ymm1
-; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm5
-; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm3
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[8],zero,xmm2[u,7],zero,xmm2[9],zero,xmm2[u],zero,xmm2[u,10],zero,xmm2[12],zero,xmm2[u,11]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm5[8,u],zero,xmm5[7],zero,xmm5[9,u,11,u],zero,xmm5[10],zero,xmm5[12,u],zero
-; AVX512-FCP-NEXT: vpor %xmm4, %xmm6, %xmm6
-; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm4
-; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm7
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm3[6],zero,xmm3[8,u],zero,xmm3[7],zero,xmm3[9],zero,xmm3[11,u],zero,xmm3[10],zero,xmm3[12]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[6],zero,xmm4[8],zero,xmm4[u,7],zero,xmm4[9],zero,xmm4[11],zero,xmm4[u,10],zero,xmm4[12],zero
-; AVX512-FCP-NEXT: vpor %xmm8, %xmm9, %xmm8
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128]
-; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm7, %ymm10
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [1,1,2,2,2,2,2,2]
-; AVX512-FCP-NEXT: vpermd %ymm7, %ymm11, %ymm11
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
-; AVX512-FCP-NEXT: vpandn %ymm11, %ymm12, %ymm11
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10
-; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm11
-; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm11, %ymm9
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [4,0,5,5,5,5,0,6,6,6,6,0,7,7,7,7]
-; AVX512-FCP-NEXT: vpermd %ymm11, %ymm13, %ymm11
-; AVX512-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm11, %ymm11
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm9, %zmm9
-; AVX512-FCP-NEXT: vpermd %zmm7, %zmm13, %zmm7
-; AVX512-FCP-NEXT: vmovdqa64 %xmm30, %xmm11
-; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm13
-; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm11[0],xmm13[0],xmm11[1],xmm13[1],xmm11[2],xmm13[2],xmm11[3],xmm13[3],xmm11[4],xmm13[4],xmm11[5],xmm13[5],xmm11[6],xmm13[6],xmm11[7],xmm13[7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
-; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm11, %xmm11
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [2,2,3,3,8,8,9,9]
-; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm14, %zmm0
-; AVX512-FCP-NEXT: vmovdqa64 %xmm28, %xmm11
-; AVX512-FCP-NEXT: vmovdqa64 %xmm29, %xmm15
-; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm11[0],xmm15[0],xmm11[1],xmm15[1],xmm11[2],xmm15[2],xmm11[3],xmm15[3],xmm11[4],xmm15[4],xmm11[5],xmm15[5],xmm11[6],xmm15[6],xmm11[7],xmm15[7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
-; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm11, %xmm11
-; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm14, %zmm1
-; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
-; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm2, %xmm2
-; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; AVX512-FCP-NEXT: vpshufb %xmm13, %xmm3, %xmm3
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,1,1,8,8,9,9]
-; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm4, %zmm2
-; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm4, %zmm3
+; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm3
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[8],zero,xmm2[u,7],zero,xmm2[9],zero,xmm2[u],zero,xmm2[u,10],zero,xmm2[12],zero,xmm2[u,11]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,xmm3[8,u],zero,xmm3[7],zero,xmm3[9,u,11,u],zero,xmm3[10],zero,xmm3[12,u],zero
+; AVX512-FCP-NEXT: vpor %xmm1, %xmm4, %xmm4
+; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm5
+; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm6
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm5[6],zero,xmm5[8,u],zero,xmm5[7],zero,xmm5[9],zero,xmm5[11,u],zero,xmm5[10],zero,xmm5[12]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm7
+; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm7
+; AVX512-FCP-NEXT: vpor %xmm1, %xmm7, %xmm7
+; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm8
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[12],zero,zero,zero,zero,ymm8[13],zero,zero,zero,zero,ymm8[14],zero,zero,zero,zero,ymm8[15],zero,zero,zero,zero,ymm8[16],zero,zero,zero,zero,ymm8[17],zero,zero,zero,zero,ymm8[18],zero
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [4,0,5,5,5,5,0,6]
+; AVX512-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm9
+; AVX512-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm9
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm1
+; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
+; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm9, %xmm9
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [2,2,3,3,8,8,9,9]
+; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm11, %zmm20
+; AVX512-FCP-NEXT: vmovdqa64 %xmm29, %xmm9
+; AVX512-FCP-NEXT: vmovdqa64 %xmm30, %xmm12
+; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3],xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
+; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm9, %xmm9
+; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm11, %zmm0
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [6,6,6,0,7,7,7,7]
+; AVX512-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm8
+; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; AVX512-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm2
+; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
+; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm3
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,1,1,8,8,9,9]
+; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm5, %zmm2
+; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm5, %zmm3
; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm2))
; AVX512-FCP-NEXT: vmovdqa64 (%r8), %zmm2
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0]
; AVX512-FCP-NEXT: vpermd %zmm2, %zmm4, %zmm4
; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm3))
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm21[0,0,1,1]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm20, %zmm3, %zmm3
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm23[0,0,1,1]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm22, %zmm5, %zmm5
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm22[0,0,1,1]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm21, %zmm3, %zmm3
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm24[0,0,1,1]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm23, %zmm5, %zmm5
; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0]
; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm3 ^ (zmm6 & (zmm5 ^ zmm3))
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 | (zmm5 & zmm12)
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm25[2,2,3,3]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm24, %zmm3
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm27[2,2,3,3]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm26, %zmm5
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm3 ^ (zmm6 & (zmm5 ^ zmm3))
-; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [2,2,3,3,10,10,11,11]
-; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm3, %zmm17
-; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm3, %zmm18
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm18 = zmm17 ^ (zmm3 & (zmm18 ^ zmm17))
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm3 & (zmm1 ^ zmm0))
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 | (zmm5 & mem)
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm18))
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [6,6,6,0,7,7,7,7,0,8,8,8,8,0,9,9]
-; AVX512-FCP-NEXT: vpermd %zmm2, %zmm0, %zmm0
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [9,9,10,10,10,10,10,10]
+; AVX512-FCP-NEXT: vpermd %zmm2, %zmm3, %zmm3
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [11,11,11,11,0,12,12,12]
+; AVX512-FCP-NEXT: vpermd %zmm2, %zmm7, %zmm7
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm3, %zmm3
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm5))
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm26[2,2,3,3]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm25, %zmm5
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm28[2,2,3,3]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm27, %zmm7
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm5 ^ (zmm6 & (zmm7 ^ zmm5))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm7 & mem)
+; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [2,2,3,3,10,10,11,11]
+; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm5, %zmm17
+; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm5, %zmm18
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm18 = zmm18 ^ (zmm5 & (zmm18 ^ zmm17))
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [12,14,13,13,13,13,12,14]
+; AVX512-FCP-NEXT: vpermd %zmm2, %zmm6, %zmm6
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [14,14,14,14,15,15,15,15]
+; AVX512-FCP-NEXT: vpermd %zmm2, %zmm7, %zmm7
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm18))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm20 ^ (zmm5 & (zmm0 ^ zmm20))
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [8,8,8,8,8,8,9,9]
+; AVX512-FCP-NEXT: vpermd %zmm2, %zmm5, %zmm2
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm8, %zmm2
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm0))
; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%r9)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 128(%r9)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 256(%r9)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 64(%r9)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 192(%r9)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 128(%r9)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 256(%r9)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 64(%r9)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 192(%r9)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
@@ -5175,162 +5174,161 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
;
; AVX512DQ-FCP-LABEL: store_i8_stride5_vf64:
; AVX512DQ-FCP: # %bb.0:
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm8
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128,128]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm3
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [12,13,128,15,12,13,14,128,12,13,14,15,128,u,u,u,16,128,18,19,16,17,128,19,16,17,18,128,16,17,18,19]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm3, %ymm2
-; AVX512DQ-FCP-NEXT: vporq %ymm0, %ymm2, %ymm20
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm1
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[8],zero,xmm1[u,7],zero,xmm1[9],zero,xmm1[u],zero,xmm1[u,10],zero,xmm1[12],zero,xmm1[u,11]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm28
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm1
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm1[8,u],zero,xmm1[7],zero,xmm1[9,u,11,u],zero,xmm1[10],zero,xmm1[12,u],zero
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm29
-; AVX512DQ-FCP-NEXT: vporq %xmm0, %xmm2, %xmm21
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm11
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128,128]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm11, %ymm2
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [12,13,128,15,12,13,14,128,12,13,14,15,128,u,u,u,16,128,18,19,16,17,128,19,16,17,18,128,16,17,18,19]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm3
+; AVX512DQ-FCP-NEXT: vporq %ymm2, %ymm3, %ymm21
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm0
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[8],zero,xmm0[u,7],zero,xmm0[9],zero,xmm0[u],zero,xmm0[u,10],zero,xmm0[12],zero,xmm0[u,11]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm29
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm0
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm0[8,u],zero,xmm0[7],zero,xmm0[9,u,11,u],zero,xmm0[10],zero,xmm0[12,u],zero
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm30
+; AVX512DQ-FCP-NEXT: vporq %xmm2, %xmm3, %xmm22
; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm6
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm5
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,12,13,128,u,u,u,14,128,u,u,14,15,128,u,u,u,16,128,u,u,16,17,128,u,u,u,18,128,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm7
-; AVX512DQ-FCP-NEXT: vporq %ymm6, %ymm7, %ymm22
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %xmm1
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm1[6],zero,xmm1[8,u],zero,xmm1[7],zero,xmm1[9],zero,xmm1[11,u],zero,xmm1[10],zero,xmm1[12]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm30
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm1
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[6],zero,xmm1[8],zero,xmm1[u,7],zero,xmm1[9],zero,xmm1[11],zero,xmm1[u,10],zero,xmm1[12],zero
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm31
-; AVX512DQ-FCP-NEXT: vporq %xmm6, %xmm7, %xmm23
-; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm6
-; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm6, %ymm7
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm1
-; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm4
-; AVX512DQ-FCP-NEXT: vporq %ymm7, %ymm4, %ymm24
-; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128]
-; AVX512DQ-FCP-NEXT: # ymm14 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm6, %ymm4
-; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25]
-; AVX512DQ-FCP-NEXT: # ymm12 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm1, %ymm9
-; AVX512DQ-FCP-NEXT: vporq %ymm4, %ymm9, %ymm25
-; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm9
-; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm4
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm10
-; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm10, %ymm13
-; AVX512DQ-FCP-NEXT: vporq %ymm4, %ymm13, %ymm26
-; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128]
-; AVX512DQ-FCP-NEXT: # ymm11 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm7
-; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [19,128,21,0,128,20,128,22,128,24,0,128,23,128,25,0,19,128,21,0,128,20,128,22,128,24,0,128,23,128,25,0]
-; AVX512DQ-FCP-NEXT: # ymm4 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm13
-; AVX512DQ-FCP-NEXT: vporq %ymm7, %ymm13, %ymm27
-; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128]
-; AVX512DQ-FCP-NEXT: # ymm13 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm3, %ymm0
-; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30,27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30]
-; AVX512DQ-FCP-NEXT: # ymm7 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm8, %ymm15
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm2, %ymm4
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm6
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,12,13,128,u,u,u,14,128,u,u,14,15,128,u,u,u,16,128,u,u,16,17,128,u,u,u,18,128,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm6, %ymm7
+; AVX512DQ-FCP-NEXT: vporq %ymm4, %ymm7, %ymm23
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %xmm14
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,xmm14[6],zero,xmm14[8,u],zero,xmm14[7],zero,xmm14[9],zero,xmm14[11,u],zero,xmm14[10],zero,xmm14[12]
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm13
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm13, %xmm7
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm31
+; AVX512DQ-FCP-NEXT: vporq %xmm4, %xmm7, %xmm24
+; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm4
+; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm4, %ymm3
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm8
+; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm8, %ymm5
+; AVX512DQ-FCP-NEXT: vporq %ymm3, %ymm5, %ymm25
+; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128]
+; AVX512DQ-FCP-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm7
+; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25]
+; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm8, %ymm10
+; AVX512DQ-FCP-NEXT: vporq %ymm7, %ymm10, %ymm26
+; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm7
+; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm7, %ymm10
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm12
+; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm12, %ymm9
+; AVX512DQ-FCP-NEXT: vporq %ymm10, %ymm9, %ymm27
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm12[21],zero,zero,ymm12[20],zero,ymm12[22],zero,ymm12[24],zero,zero,ymm12[23],zero,ymm12[25],zero,zero
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm7[21,u],zero,ymm7[20],zero,ymm7[22],zero,ymm7[24,u],zero,ymm7[23],zero,ymm7[25,u]
+; AVX512DQ-FCP-NEXT: vporq %ymm9, %ymm10, %ymm28
+; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128]
+; AVX512DQ-FCP-NEXT: # ymm10 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm6, %ymm0
+; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [25,128,27,0,128,26,128,28,128,30,0,128,29,128,31,0,25,128,27,0,128,26,128,28,128,30,0,128,29,128,31,0]
+; AVX512DQ-FCP-NEXT: # ymm9 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm15
; AVX512DQ-FCP-NEXT: vporq %ymm0, %ymm15, %ymm16
-; AVX512DQ-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm8, %ymm3
-; AVX512DQ-FCP-NEXT: vporq %ymm0, %ymm3, %ymm17
-; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128]
-; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm3
-; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [25,128,27,0,128,26,128,28,128,30,0,128,29,128,31,0,25,128,27,0,128,26,128,28,128,30,0,128,29,128,31,0]
-; AVX512DQ-FCP-NEXT: # ymm4 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm8
-; AVX512DQ-FCP-NEXT: vporq %ymm3, %ymm8, %ymm19
-; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm5, %ymm3
-; AVX512DQ-FCP-NEXT: vporq %ymm2, %ymm3, %ymm18
-; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm1
+; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm6, %ymm2
+; AVX512DQ-FCP-NEXT: vporq %ymm0, %ymm2, %ymm17
+; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128]
+; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm0
+; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30,27,0,128,26,128,28,0,128,0,128,29,128,31,0,128,30]
+; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm11, %ymm5
+; AVX512DQ-FCP-NEXT: vporq %ymm0, %ymm5, %ymm19
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm1[5],zero,zero,ymm1[4],zero,ymm1[6],zero,ymm1[8],zero,zero,ymm1[7],zero,ymm1[9],zero,zero,zero,ymm1[21],zero,zero,ymm1[20],zero,ymm1[22],zero,ymm1[24],zero,zero,ymm1[23],zero,ymm1[25],zero,zero
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm11[3],zero,ymm11[5,u],zero,ymm11[4],zero,ymm11[6],zero,ymm11[8,u],zero,ymm11[7],zero,ymm11[9,u,19],zero,ymm11[21,u],zero,ymm11[20],zero,ymm11[22],zero,ymm11[24,u],zero,ymm11[23],zero,ymm11[25,u]
+; AVX512DQ-FCP-NEXT: vporq %ymm0, %ymm1, %ymm18
+; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm4, %ymm1
+; AVX512DQ-FCP-NEXT: vporq %ymm0, %ymm1, %ymm20
+; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm12, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm7, %ymm1
; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm10, %ymm1
-; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm9, %ymm3
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm2
-; AVX512DQ-FCP-NEXT: vpor %ymm1, %ymm3, %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm5
-; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm3
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[8],zero,xmm2[u,7],zero,xmm2[9],zero,xmm2[u],zero,xmm2[u,10],zero,xmm2[12],zero,xmm2[u,11]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm5[8,u],zero,xmm5[7],zero,xmm5[9,u,11,u],zero,xmm5[10],zero,xmm5[12,u],zero
-; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm6, %xmm6
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm4
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm7
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm3[6],zero,xmm3[8,u],zero,xmm3[7],zero,xmm3[9],zero,xmm3[11,u],zero,xmm3[10],zero,xmm3[12]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[6],zero,xmm4[8],zero,xmm4[u,7],zero,xmm4[9],zero,xmm4[11],zero,xmm4[u,10],zero,xmm4[12],zero
-; AVX512DQ-FCP-NEXT: vpor %xmm8, %xmm9, %xmm8
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm7, %ymm10
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [1,1,2,2,2,2,2,2]
-; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm11, %ymm11
-; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm12 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
-; AVX512DQ-FCP-NEXT: vpandn %ymm11, %ymm12, %ymm11
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm11, %zmm10
-; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm11
-; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm11, %ymm9
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm13 = [4,0,5,5,5,5,0,6,6,6,6,0,7,7,7,7]
-; AVX512DQ-FCP-NEXT: vpermd %ymm11, %ymm13, %ymm11
-; AVX512DQ-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm11, %ymm11
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm9, %zmm9
-; AVX512DQ-FCP-NEXT: vpermd %zmm7, %zmm13, %zmm7
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm30, %xmm11
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm13
-; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm11[0],xmm13[0],xmm11[1],xmm13[1],xmm11[2],xmm13[2],xmm11[3],xmm13[3],xmm11[4],xmm13[4],xmm11[5],xmm13[5],xmm11[6],xmm13[6],xmm11[7],xmm13[7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm13 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm11, %xmm11
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm14 = [2,2,3,3,8,8,9,9]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm14, %zmm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm28, %xmm11
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm29, %xmm15
-; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm11[0],xmm15[0],xmm11[1],xmm15[1],xmm11[2],xmm15[2],xmm11[3],xmm15[3],xmm11[4],xmm15[4],xmm11[5],xmm15[5],xmm11[6],xmm15[6],xmm11[7],xmm15[7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm11, %xmm11
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm14, %zmm1
-; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm13, %xmm3, %xmm3
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,0,1,1,8,8,9,9]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm4, %zmm2
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm4, %zmm3
+; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm3
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[8],zero,xmm2[u,7],zero,xmm2[9],zero,xmm2[u],zero,xmm2[u,10],zero,xmm2[12],zero,xmm2[u,11]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,xmm3[8,u],zero,xmm3[7],zero,xmm3[9,u,11,u],zero,xmm3[10],zero,xmm3[12,u],zero
+; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm4, %xmm4
+; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm5
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm6
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm5[6],zero,xmm5[8,u],zero,xmm5[7],zero,xmm5[9],zero,xmm5[11,u],zero,xmm5[10],zero,xmm5[12]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm7
+; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm7
+; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm7, %xmm7
+; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm8
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[12],zero,zero,zero,zero,ymm8[13],zero,zero,zero,zero,ymm8[14],zero,zero,zero,zero,ymm8[15],zero,zero,zero,zero,ymm8[16],zero,zero,zero,zero,ymm8[17],zero,zero,zero,zero,ymm8[18],zero
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [4,0,5,5,5,5,0,6]
+; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm9
+; AVX512DQ-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm9
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm1
+; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm9, %xmm9
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm11 = [2,2,3,3,8,8,9,9]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm11, %zmm20
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm29, %xmm9
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm30, %xmm12
+; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3],xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm9, %xmm9
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm11, %zmm0
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [6,6,6,0,7,7,7,7]
+; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm8
+; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm12, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm3, %xmm3
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [0,0,1,1,8,8,9,9]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm5, %zmm2
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm5, %zmm3
; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm2))
; AVX512DQ-FCP-NEXT: vmovdqa64 (%r8), %zmm2
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0]
; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm4, %zmm4
; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm3))
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm21[0,0,1,1]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm20, %zmm3, %zmm3
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm23[0,0,1,1]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm22, %zmm5, %zmm5
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm22[0,0,1,1]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm21, %zmm3, %zmm3
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm24[0,0,1,1]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm23, %zmm5, %zmm5
; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm6 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0]
; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm3 ^ (zmm6 & (zmm5 ^ zmm3))
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 | (zmm5 & zmm12)
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm25[2,2,3,3]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm24, %zmm3
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm27[2,2,3,3]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm26, %zmm5
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm3 ^ (zmm6 & (zmm5 ^ zmm3))
-; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm3 = [2,2,3,3,10,10,11,11]
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm3, %zmm17
-; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm3, %zmm18
-; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm18 = zmm17 ^ (zmm3 & (zmm18 ^ zmm17))
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm3 & (zmm1 ^ zmm0))
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 | (zmm5 & mem)
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm18))
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [6,6,6,0,7,7,7,7,0,8,8,8,8,0,9,9]
-; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm0, %zmm0
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [9,9,10,10,10,10,10,10]
+; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm3, %zmm3
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [11,11,11,11,0,12,12,12]
+; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm7, %zmm7
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm3, %zmm3
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm5))
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm26[2,2,3,3]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm25, %zmm5
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm28[2,2,3,3]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm27, %zmm7
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm5 ^ (zmm6 & (zmm7 ^ zmm5))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm7 & mem)
+; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} zmm5 = [2,2,3,3,10,10,11,11]
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm5, %zmm17
+; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm5, %zmm18
+; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm18 = zmm18 ^ (zmm5 & (zmm18 ^ zmm17))
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [12,14,13,13,13,13,12,14]
+; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm6, %zmm6
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [14,14,14,14,15,15,15,15]
+; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm7, %zmm7
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm18))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm20 ^ (zmm5 & (zmm0 ^ zmm20))
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [8,8,8,8,8,8,9,9]
+; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm5, %zmm2
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm8, %zmm2
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm0))
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%r9)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 128(%r9)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 256(%r9)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 64(%r9)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 192(%r9)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 128(%r9)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 256(%r9)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 64(%r9)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 192(%r9)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
@@ -5423,7 +5421,8 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm23 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128]
; AVX512BW-NEXT: # ymm23 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %ymm23, %ymm28, %ymm29
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm24 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm24 = [128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128]
+; AVX512BW-NEXT: # ymm24 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %ymm24, %ymm27, %ymm30
; AVX512BW-NEXT: vporq %ymm29, %ymm30, %ymm29
; AVX512BW-NEXT: vpshufb %ymm18, %ymm28, %ymm18
@@ -5432,7 +5431,8 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-NEXT: vmovdqu8 %ymm27, %ymm18 {%k5}
; AVX512BW-NEXT: vinserti64x4 $1, %ymm18, %zmm29, %zmm18
; AVX512BW-NEXT: vpermq {{.*#+}} zmm18 = zmm18[2,2,3,3,6,6,7,7]
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm27 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25]
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25]
+; AVX512BW-NEXT: # ymm27 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpshufb %ymm27, %ymm25, %ymm28
; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm29 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128]
; AVX512BW-NEXT: # ymm29 = mem[0,1,2,3,0,1,2,3]
@@ -5504,12 +5504,12 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-FCP-LABEL: store_i8_stride5_vf64:
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm0
-; AVX512BW-FCP-NEXT: vmovdqa 32(%rdx), %ymm8
+; AVX512BW-FCP-NEXT: vmovdqa64 32(%rdx), %ymm21
; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,12,13,128,128,128,128,14,128,128,128,14,15,128,128,128,128,16,128,128,128,16,17,128,128,128,128,18,128,128,128]
-; AVX512BW-FCP-NEXT: vpshufb %ymm6, %ymm8, %ymm1
-; AVX512BW-FCP-NEXT: vmovdqa64 32(%rcx), %ymm21
+; AVX512BW-FCP-NEXT: vpshufb %ymm6, %ymm21, %ymm1
+; AVX512BW-FCP-NEXT: vmovdqa 32(%rcx), %ymm8
; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128]
-; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm21, %ymm2
+; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm8, %ymm2
; AVX512BW-FCP-NEXT: vpor %ymm1, %ymm2, %ymm1
; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %xmm11
; AVX512BW-FCP-NEXT: vmovdqa 32(%rcx), %xmm2
@@ -5517,77 +5517,82 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-FCP-NEXT: vpshufb %xmm14, %xmm2, %xmm3
; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm12
; AVX512BW-FCP-NEXT: vmovdqa 32(%rdx), %xmm4
-; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm16 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128]
-; AVX512BW-FCP-NEXT: vpshufb %xmm16, %xmm4, %xmm5
+; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128]
+; AVX512BW-FCP-NEXT: vpshufb %xmm15, %xmm4, %xmm5
; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm5, %xmm3
; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1]
; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1
; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm13
; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm3
-; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm19 = [8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11]
-; AVX512BW-FCP-NEXT: vpshufb %xmm19, %xmm3, %xmm9
-; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %xmm17
+; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm17 = [8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11]
+; AVX512BW-FCP-NEXT: vpshufb %xmm17, %xmm3, %xmm9
+; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %xmm16
; AVX512BW-FCP-NEXT: vmovdqa 32(%rsi), %xmm5
-; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm20 = [128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128]
-; AVX512BW-FCP-NEXT: vpshufb %xmm20, %xmm5, %xmm10
+; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm18 = [128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128]
+; AVX512BW-FCP-NEXT: vpshufb %xmm18, %xmm5, %xmm10
; AVX512BW-FCP-NEXT: vpor %xmm9, %xmm10, %xmm9
-; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm9[0,0,1,1]
-; AVX512BW-FCP-NEXT: vmovdqa64 32(%rdi), %ymm18
+; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm19 = ymm9[0,0,1,1]
+; AVX512BW-FCP-NEXT: vmovdqa64 32(%rdi), %ymm20
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [3,3,3,0,4,4,4,4]
-; AVX512BW-FCP-NEXT: vpermd %ymm18, %ymm9, %ymm22
+; AVX512BW-FCP-NEXT: vpermd %ymm20, %ymm9, %ymm22
; AVX512BW-FCP-NEXT: vmovdqa64 32(%rsi), %ymm23
; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm10 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14]
; AVX512BW-FCP-NEXT: movl $138547332, %eax # imm = 0x8421084
; AVX512BW-FCP-NEXT: kmovd %eax, %k1
; AVX512BW-FCP-NEXT: vpshufb %ymm10, %ymm23, %ymm22 {%k1}
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm22, %zmm15, %zmm15
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm22, %zmm19, %zmm19
; AVX512BW-FCP-NEXT: movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318
; AVX512BW-FCP-NEXT: kmovq %rax, %k2
-; AVX512BW-FCP-NEXT: vmovdqu8 %zmm15, %zmm1 {%k2}
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [9,9,10,10,10,10,10,10,11,11,11,11,0,12,12,12]
-; AVX512BW-FCP-NEXT: vpermd %zmm0, %zmm15, %zmm15
+; AVX512BW-FCP-NEXT: vmovdqu8 %zmm19, %zmm1 {%k2}
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [9,9,10,10,10,10,10,10]
+; AVX512BW-FCP-NEXT: vpermd %zmm0, %zmm19, %zmm19
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm22 = [11,11,11,11,0,12,12,12]
+; AVX512BW-FCP-NEXT: vpermd %zmm0, %zmm22, %zmm22
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm22, %zmm19, %zmm19
; AVX512BW-FCP-NEXT: movabsq $4760450083537948804, %rax # imm = 0x4210842108421084
; AVX512BW-FCP-NEXT: kmovq %rax, %k3
-; AVX512BW-FCP-NEXT: vmovdqu8 %zmm15, %zmm1 {%k3}
+; AVX512BW-FCP-NEXT: vmovdqu8 %zmm19, %zmm1 {%k3}
; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30]
-; AVX512BW-FCP-NEXT: vpshufb %zmm15, %zmm22, %zmm22
-; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm18[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128]
-; AVX512BW-FCP-NEXT: vpshufb %zmm18, %zmm23, %zmm23
+; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30]
+; AVX512BW-FCP-NEXT: vpshufb %zmm19, %zmm22, %zmm22
+; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm20[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128]
+; AVX512BW-FCP-NEXT: vpshufb %zmm20, %zmm23, %zmm23
; AVX512BW-FCP-NEXT: vporq %zmm22, %zmm23, %zmm22
; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm22 = zmm22[2,2,3,3,6,6,7,7]
-; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128]
-; AVX512BW-FCP-NEXT: vpshufb %zmm23, %zmm8, %zmm8
; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],mem[4,5,6,7]
-; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128]
-; AVX512BW-FCP-NEXT: vpshufb %zmm24, %zmm21, %zmm21
-; AVX512BW-FCP-NEXT: vporq %zmm8, %zmm21, %zmm8
+; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128]
+; AVX512BW-FCP-NEXT: vpshufb %zmm23, %zmm21, %zmm21
+; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128]
+; AVX512BW-FCP-NEXT: vpshufb %zmm24, %zmm8, %zmm8
+; AVX512BW-FCP-NEXT: vporq %zmm21, %zmm8, %zmm8
; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm8 = zmm8[2,2,3,3,6,6,7,7]
; AVX512BW-FCP-NEXT: movabsq $1785168781326730801, %rax # imm = 0x18C6318C6318C631
; AVX512BW-FCP-NEXT: kmovq %rax, %k3
; AVX512BW-FCP-NEXT: vmovdqu8 %zmm22, %zmm8 {%k3}
-; AVX512BW-FCP-NEXT: vmovdqa64 32(%r8), %ymm21
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = [4,6,5,5,5,5,4,6,6,6,6,6,7,7,7,7]
-; AVX512BW-FCP-NEXT: vpermd %zmm21, %zmm22, %zmm21
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm21 = [12,14,13,13,13,13,12,14]
+; AVX512BW-FCP-NEXT: vpermd %zmm0, %zmm21, %zmm21
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm22 = [14,14,14,14,15,15,15,15]
+; AVX512BW-FCP-NEXT: vpermd %zmm0, %zmm22, %zmm22
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm22, %zmm21, %zmm21
; AVX512BW-FCP-NEXT: movabsq $-8925843906633654008, %rax # imm = 0x8421084210842108
; AVX512BW-FCP-NEXT: kmovq %rax, %k4
; AVX512BW-FCP-NEXT: vmovdqu8 %zmm21, %zmm8 {%k4}
; AVX512BW-FCP-NEXT: vpshufb %xmm14, %xmm11, %xmm14
-; AVX512BW-FCP-NEXT: vpshufb %xmm16, %xmm12, %xmm16
-; AVX512BW-FCP-NEXT: vporq %xmm14, %xmm16, %xmm14
+; AVX512BW-FCP-NEXT: vpshufb %xmm15, %xmm12, %xmm15
+; AVX512BW-FCP-NEXT: vpor %xmm14, %xmm15, %xmm14
; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
; AVX512BW-FCP-NEXT: vpshufb %xmm12, %xmm11, %xmm11
; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm14, %zmm11, %zmm11
; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm14 = zmm11[0,0,1,1,4,4,5,5]
-; AVX512BW-FCP-NEXT: vpshufb %xmm19, %xmm13, %xmm11
-; AVX512BW-FCP-NEXT: vpshufb %xmm20, %xmm17, %xmm16
-; AVX512BW-FCP-NEXT: vporq %xmm11, %xmm16, %xmm11
-; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm13[0],xmm17[0],xmm13[1],xmm17[1],xmm13[2],xmm17[2],xmm13[3],xmm17[3],xmm13[4],xmm17[4],xmm13[5],xmm17[5],xmm13[6],xmm17[6],xmm13[7],xmm17[7]
-; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm16 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
-; AVX512BW-FCP-NEXT: vpshufb %xmm16, %xmm13, %xmm13
+; AVX512BW-FCP-NEXT: vpshufb %xmm17, %xmm13, %xmm11
+; AVX512BW-FCP-NEXT: vpshufb %xmm18, %xmm16, %xmm15
+; AVX512BW-FCP-NEXT: vpor %xmm11, %xmm15, %xmm11
+; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm13[0],xmm16[0],xmm13[1],xmm16[1],xmm13[2],xmm16[2],xmm13[3],xmm16[3],xmm13[4],xmm16[4],xmm13[5],xmm16[5],xmm13[6],xmm16[6],xmm13[7],xmm16[7]
+; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
+; AVX512BW-FCP-NEXT: vpshufb %xmm15, %xmm13, %xmm13
; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm11, %zmm13, %zmm11
; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm11 = zmm11[0,0,1,1,4,4,5,5]
; AVX512BW-FCP-NEXT: movabsq $-4165393823095705204, %rax # imm = 0xC6318C6318C6318C
@@ -5604,19 +5609,19 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm14, %ymm7
; AVX512BW-FCP-NEXT: vpor %ymm6, %ymm7, %ymm6
; AVX512BW-FCP-NEXT: vpshufb %ymm23, %ymm13, %ymm7
-; AVX512BW-FCP-NEXT: vpshufb %ymm24, %ymm14, %ymm17
-; AVX512BW-FCP-NEXT: vporq %ymm7, %ymm17, %ymm7
+; AVX512BW-FCP-NEXT: vpshufb %ymm24, %ymm14, %ymm16
+; AVX512BW-FCP-NEXT: vporq %ymm7, %ymm16, %ymm7
; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3]
; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6
; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %ymm7
-; AVX512BW-FCP-NEXT: vpshufb %ymm15, %ymm7, %ymm15
+; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm7, %ymm16
; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %ymm17
-; AVX512BW-FCP-NEXT: vpshufb %ymm18, %ymm17, %ymm18
-; AVX512BW-FCP-NEXT: vporq %ymm15, %ymm18, %ymm15
-; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3]
+; AVX512BW-FCP-NEXT: vpshufb %ymm20, %ymm17, %ymm18
+; AVX512BW-FCP-NEXT: vporq %ymm16, %ymm18, %ymm16
+; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm16 = ymm16[2,2,3,3]
; AVX512BW-FCP-NEXT: vpermd %ymm17, %ymm9, %ymm9
; AVX512BW-FCP-NEXT: vpshufb %ymm10, %ymm7, %ymm9 {%k1}
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm9, %zmm9
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm9, %zmm9
; AVX512BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm9 {%k2}
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [3,3,3,3,0,4,4,4,4,6,5,5,5,5,4,6]
; AVX512BW-FCP-NEXT: vpermd %zmm0, %zmm6, %zmm6
@@ -5631,14 +5636,17 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [2,2,3,3,8,8,9,9]
; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm4
; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
-; AVX512BW-FCP-NEXT: vpshufb %xmm16, %xmm2, %xmm2
+; AVX512BW-FCP-NEXT: vpshufb %xmm15, %xmm2, %xmm2
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm7[26],zero,ymm7[28],zero,zero,zero,zero,ymm7[29],zero,ymm7[31],zero,zero,ymm7[30]
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm17[26],zero,ymm17[28],zero,zero,ymm17[27],zero,ymm17[29],zero,ymm17[31],zero,zero,ymm17[30],zero
; AVX512BW-FCP-NEXT: vpor %ymm3, %ymm5, %ymm3
; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm3
; AVX512BW-FCP-NEXT: vmovdqu8 %zmm4, %zmm3 {%k3}
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [6,6,6,6,7,7,7,7,8,8,8,8,8,8,9,9]
-; AVX512BW-FCP-NEXT: vpermd %zmm0, %zmm2, %zmm0
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [6,6,6,6,7,7,7,7]
+; AVX512BW-FCP-NEXT: vpermd (%r8), %ymm2, %ymm2
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [8,8,8,8,8,8,9,9]
+; AVX512BW-FCP-NEXT: vpermd %zmm0, %zmm4, %zmm0
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512BW-FCP-NEXT: movabsq $2380225041768974402, %rax # imm = 0x2108421084210842
; AVX512BW-FCP-NEXT: kmovq %rax, %k1
; AVX512BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm3 {%k1}
@@ -5739,7 +5747,8 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm23 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128]
; AVX512DQ-BW-NEXT: # ymm23 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vpshufb %ymm23, %ymm28, %ymm29
-; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm24 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128]
+; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm24 = [128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128]
+; AVX512DQ-BW-NEXT: # ymm24 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vpshufb %ymm24, %ymm27, %ymm30
; AVX512DQ-BW-NEXT: vporq %ymm29, %ymm30, %ymm29
; AVX512DQ-BW-NEXT: vpshufb %ymm18, %ymm28, %ymm18
@@ -5748,7 +5757,8 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-BW-NEXT: vmovdqu8 %ymm27, %ymm18 {%k5}
; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm18, %zmm29, %zmm18
; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm18 = zmm18[2,2,3,3,6,6,7,7]
-; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm27 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25]
+; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25]
+; AVX512DQ-BW-NEXT: # ymm27 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vpshufb %ymm27, %ymm25, %ymm28
; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm29 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128]
; AVX512DQ-BW-NEXT: # ymm29 = mem[0,1,2,3,0,1,2,3]
@@ -5820,12 +5830,12 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-BW-FCP-LABEL: store_i8_stride5_vf64:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdx), %ymm8
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rdx), %ymm21
; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,12,13,128,128,128,128,14,128,128,128,14,15,128,128,128,128,16,128,128,128,16,17,128,128,128,128,18,128,128,128]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm6, %ymm8, %ymm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rcx), %ymm21
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm6, %ymm21, %ymm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rcx), %ymm8
; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm21, %ymm2
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm8, %ymm2
; AVX512DQ-BW-FCP-NEXT: vpor %ymm1, %ymm2, %ymm1
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %xmm11
; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rcx), %xmm2
@@ -5833,77 +5843,82 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm14, %xmm2, %xmm3
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm12
; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdx), %xmm4
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm16 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm16, %xmm4, %xmm5
+; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm15, %xmm4, %xmm5
; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm5, %xmm3
; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1]
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm13
; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm3
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm19 = [8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm19, %xmm3, %xmm9
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %xmm17
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm17 = [8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm17, %xmm3, %xmm9
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %xmm16
; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rsi), %xmm5
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm20 = [128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm20, %xmm5, %xmm10
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm18 = [128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm18, %xmm5, %xmm10
; AVX512DQ-BW-FCP-NEXT: vpor %xmm9, %xmm10, %xmm9
-; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm9[0,0,1,1]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rdi), %ymm18
+; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm19 = ymm9[0,0,1,1]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rdi), %ymm20
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [3,3,3,0,4,4,4,4]
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm18, %ymm9, %ymm22
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm20, %ymm9, %ymm22
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rsi), %ymm23
; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm10 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14]
; AVX512DQ-BW-FCP-NEXT: movl $138547332, %eax # imm = 0x8421084
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1
; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm10, %ymm23, %ymm22 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm22, %zmm15, %zmm15
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm22, %zmm19, %zmm19
; AVX512DQ-BW-FCP-NEXT: movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318
; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k2
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm15, %zmm1 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm15 = [9,9,10,10,10,10,10,10,11,11,11,11,0,12,12,12]
-; AVX512DQ-BW-FCP-NEXT: vpermd %zmm0, %zmm15, %zmm15
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm19, %zmm1 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [9,9,10,10,10,10,10,10]
+; AVX512DQ-BW-FCP-NEXT: vpermd %zmm0, %zmm19, %zmm19
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm22 = [11,11,11,11,0,12,12,12]
+; AVX512DQ-BW-FCP-NEXT: vpermd %zmm0, %zmm22, %zmm22
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm22, %zmm19, %zmm19
; AVX512DQ-BW-FCP-NEXT: movabsq $4760450083537948804, %rax # imm = 0x4210842108421084
; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k3
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm15, %zmm1 {%k3}
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm19, %zmm1 {%k3}
; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm15, %zmm22, %zmm22
-; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm18[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm18, %zmm23, %zmm23
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm19, %zmm22, %zmm22
+; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm20[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm20, %zmm23, %zmm23
; AVX512DQ-BW-FCP-NEXT: vporq %zmm22, %zmm23, %zmm22
; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm22 = zmm22[2,2,3,3,6,6,7,7]
-; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm23, %zmm8, %zmm8
; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],mem[4,5,6,7]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm24, %zmm21, %zmm21
-; AVX512DQ-BW-FCP-NEXT: vporq %zmm8, %zmm21, %zmm8
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm23 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm23, %zmm21, %zmm21
+; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm24, %zmm8, %zmm8
+; AVX512DQ-BW-FCP-NEXT: vporq %zmm21, %zmm8, %zmm8
; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm8 = zmm8[2,2,3,3,6,6,7,7]
; AVX512DQ-BW-FCP-NEXT: movabsq $1785168781326730801, %rax # imm = 0x18C6318C6318C631
; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k3
; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm22, %zmm8 {%k3}
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%r8), %ymm21
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = [4,6,5,5,5,5,4,6,6,6,6,6,7,7,7,7]
-; AVX512DQ-BW-FCP-NEXT: vpermd %zmm21, %zmm22, %zmm21
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm21 = [12,14,13,13,13,13,12,14]
+; AVX512DQ-BW-FCP-NEXT: vpermd %zmm0, %zmm21, %zmm21
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm22 = [14,14,14,14,15,15,15,15]
+; AVX512DQ-BW-FCP-NEXT: vpermd %zmm0, %zmm22, %zmm22
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm22, %zmm21, %zmm21
; AVX512DQ-BW-FCP-NEXT: movabsq $-8925843906633654008, %rax # imm = 0x8421084210842108
; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k4
; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm21, %zmm8 {%k4}
; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm14, %xmm11, %xmm14
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm16, %xmm12, %xmm16
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm14, %xmm16, %xmm14
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm15, %xmm12, %xmm15
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm14, %xmm15, %xmm14
; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm12, %xmm11, %xmm11
; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm14, %zmm11, %zmm11
; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm14 = zmm11[0,0,1,1,4,4,5,5]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm19, %xmm13, %xmm11
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm20, %xmm17, %xmm16
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm11, %xmm16, %xmm11
-; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm13[0],xmm17[0],xmm13[1],xmm17[1],xmm13[2],xmm17[2],xmm13[3],xmm17[3],xmm13[4],xmm17[4],xmm13[5],xmm17[5],xmm13[6],xmm17[6],xmm13[7],xmm17[7]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm16 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm16, %xmm13, %xmm13
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm17, %xmm13, %xmm11
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm18, %xmm16, %xmm15
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm11, %xmm15, %xmm11
+; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm13[0],xmm16[0],xmm13[1],xmm16[1],xmm13[2],xmm16[2],xmm13[3],xmm16[3],xmm13[4],xmm16[4],xmm13[5],xmm16[5],xmm13[6],xmm16[6],xmm13[7],xmm16[7]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm15, %xmm13, %xmm13
; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm11, %zmm13, %zmm11
; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm11 = zmm11[0,0,1,1,4,4,5,5]
; AVX512DQ-BW-FCP-NEXT: movabsq $-4165393823095705204, %rax # imm = 0xC6318C6318C6318C
@@ -5920,19 +5935,19 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm14, %ymm7
; AVX512DQ-BW-FCP-NEXT: vpor %ymm6, %ymm7, %ymm6
; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm23, %ymm13, %ymm7
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm24, %ymm14, %ymm17
-; AVX512DQ-BW-FCP-NEXT: vporq %ymm7, %ymm17, %ymm7
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm24, %ymm14, %ymm16
+; AVX512DQ-BW-FCP-NEXT: vporq %ymm7, %ymm16, %ymm7
; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3]
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %ymm7
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm15, %ymm7, %ymm15
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm7, %ymm16
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %ymm17
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm18, %ymm17, %ymm18
-; AVX512DQ-BW-FCP-NEXT: vporq %ymm15, %ymm18, %ymm15
-; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm20, %ymm17, %ymm18
+; AVX512DQ-BW-FCP-NEXT: vporq %ymm16, %ymm18, %ymm16
+; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm16 = ymm16[2,2,3,3]
; AVX512DQ-BW-FCP-NEXT: vpermd %ymm17, %ymm9, %ymm9
; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm10, %ymm7, %ymm9 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm9, %zmm9
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm9, %zmm9
; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm9 {%k2}
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [3,3,3,3,0,4,4,4,4,6,5,5,5,5,4,6]
; AVX512DQ-BW-FCP-NEXT: vpermd %zmm0, %zmm6, %zmm6
@@ -5947,14 +5962,17 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [2,2,3,3,8,8,9,9]
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm4
; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm16, %xmm2, %xmm2
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm15, %xmm2, %xmm2
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm7[26],zero,ymm7[28],zero,zero,zero,zero,ymm7[29],zero,ymm7[31],zero,zero,ymm7[30]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm17[26],zero,ymm17[28],zero,zero,ymm17[27],zero,ymm17[29],zero,ymm17[31],zero,zero,ymm17[30],zero
; AVX512DQ-BW-FCP-NEXT: vpor %ymm3, %ymm5, %ymm3
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm3
; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm4, %zmm3 {%k3}
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [6,6,6,6,7,7,7,7,8,8,8,8,8,8,9,9]
-; AVX512DQ-BW-FCP-NEXT: vpermd %zmm0, %zmm2, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [6,6,6,6,7,7,7,7]
+; AVX512DQ-BW-FCP-NEXT: vpermd (%r8), %ymm2, %ymm2
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [8,8,8,8,8,8,9,9]
+; AVX512DQ-BW-FCP-NEXT: vpermd %zmm0, %zmm4, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512DQ-BW-FCP-NEXT: movabsq $2380225041768974402, %rax # imm = 0x2108421084210842
; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1
; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm3 {%k1}
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll
index 6205be83f5123..a08bb851c603d 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll
@@ -2014,13 +2014,13 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa 16(%rsi), %xmm0
; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[3,3,3,3]
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[3,3,3,3]
; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX-NEXT: vmovaps {{.*#+}} ymm10 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
-; AVX-NEXT: vandnps %ymm1, %ymm10, %ymm2
+; AVX-NEXT: vmovaps {{.*#+}} ymm9 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
+; AVX-NEXT: vandnps %ymm1, %ymm9, %ymm2
; AVX-NEXT: vmovdqa 16(%rcx), %xmm1
; AVX-NEXT: vmovdqa 16(%rdx), %xmm3
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
@@ -2030,47 +2030,47 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[1,0,2,2,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1]
; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
-; AVX-NEXT: vandps %ymm3, %ymm10, %ymm3
+; AVX-NEXT: vandps %ymm3, %ymm9, %ymm3
; AVX-NEXT: vorps %ymm2, %ymm3, %ymm2
; AVX-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX-NEXT: vmovdqa 16(%r8), %xmm12
-; AVX-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,xmm12[8,u],zero,zero,zero,zero,xmm12[9,u],zero,zero,zero,zero
+; AVX-NEXT: vmovdqa 16(%r8), %xmm11
+; AVX-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,xmm11[8,u],zero,zero,zero,zero,xmm11[9,u],zero,zero,zero,zero
; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4],xmm4[5],xmm3[6,7]
; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3,4],zero,xmm3[6,7,8,9,10],zero,xmm3[12,13,14,15]
; AVX-NEXT: vmovdqa 16(%r9), %xmm13
; AVX-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,zero,xmm13[8],zero,zero,zero,zero,zero,xmm13[9],zero,zero,zero,zero
; AVX-NEXT: vpor %xmm4, %xmm3, %xmm3
; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm12[5,u],zero,zero,zero,zero,xmm12[6,u],zero,zero,zero,zero,xmm12[7,u]
+; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm11[5,u],zero,zero,zero,zero,xmm11[6,u],zero,zero,zero,zero,xmm11[7,u]
; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6],xmm3[7]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,128,4,5,6,7,8,128,10,11,12,13,14,128]
-; AVX-NEXT: vpshufb %xmm9, %xmm2, %xmm2
+; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm13[5],zero,zero,zero,zero,zero,xmm13[6],zero,zero,zero,zero,zero,xmm13[7]
; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2
; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,2]
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; AVX-NEXT: vandps %ymm0, %ymm10, %ymm0
+; AVX-NEXT: vandps %ymm0, %ymm9, %ymm0
; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[3,3,3,3,4,5,6,7]
; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,4,4]
; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,3]
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX-NEXT: vandnps %ymm1, %ymm10, %ymm1
+; AVX-NEXT: vandnps %ymm1, %ymm9, %ymm1
; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm12[10,u],zero,zero,zero,zero,xmm12[11,u],zero,zero,zero,zero,xmm12[12,u],zero,zero
+; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm11[10,u],zero,zero,zero,zero,xmm11[11,u],zero,zero,zero,zero,xmm11[12,u],zero,zero
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
-; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[2,3,4,5,6],zero,xmm1[8,9,10,11,12],zero,xmm1[14,15]
+; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm13[10],zero,zero,zero,zero,zero,xmm13[11],zero,zero,zero,zero,zero,xmm13[12],zero,zero
; AVX-NEXT: vpor %xmm3, %xmm1, %xmm1
; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,13,u,128,128,128,128,14,u,128,128,128,128,15,u]
-; AVX-NEXT: vpshufb %xmm2, %xmm12, %xmm1
+; AVX-NEXT: vpshufb %xmm2, %xmm11, %xmm1
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
-; AVX-NEXT: vpshufb %xmm9, %xmm0, %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm13[13],zero,zero,zero,zero,zero,xmm13[14],zero,zero,zero,zero,zero,xmm13[15]
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2],zero,xmm0[4,5,6,7,8],zero,xmm0[10,11,12,13,14],zero
+; AVX-NEXT: vmovdqa {{.*#+}} xmm10 = [128,128,128,13,128,128,128,128,128,14,128,128,128,128,128,15]
+; AVX-NEXT: vpshufb %xmm10, %xmm13, %xmm1
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovdqa (%rsi), %xmm8
@@ -2087,51 +2087,53 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX-NEXT: vpshufhw {{.*#+}} xmm15 = xmm3[0,1,2,3,5,6,7,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[2,2,2,3]
; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm1, %ymm1
-; AVX-NEXT: vandps %ymm0, %ymm10, %ymm0
-; AVX-NEXT: vandnps %ymm1, %ymm10, %ymm1
+; AVX-NEXT: vandps %ymm0, %ymm9, %ymm0
+; AVX-NEXT: vandnps %ymm1, %ymm9, %ymm1
; AVX-NEXT: vorps %ymm1, %ymm0, %ymm15
; AVX-NEXT: vmovdqa (%r8), %xmm1
; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm0
; AVX-NEXT: vextractf128 $1, %ymm15, %xmm2
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3],xmm0[4],xmm2[5,6],xmm0[7]
-; AVX-NEXT: vpshufb %xmm9, %xmm0, %xmm2
+; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,2],zero,xmm0[4,5,6,7,8],zero,xmm0[10,11,12,13,14],zero
; AVX-NEXT: vmovdqa (%r9), %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm0[13],zero,zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,zero,xmm0[15]
+; AVX-NEXT: vpshufb %xmm10, %xmm0, %xmm9
; AVX-NEXT: vpor %xmm2, %xmm9, %xmm2
; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[10,u],zero,zero,zero,zero,xmm1[11,u],zero,zero,zero,zero,xmm1[12,u],zero,zero
; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm15[1,2],xmm2[3],xmm15[4,5],xmm2[6],xmm15[7]
-; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0],zero,xmm2[2,3,4,5,6],zero,xmm2[8,9,10,11,12],zero,xmm2[14,15]
+; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX-NEXT: vpshufb {{.*#+}} xmm9 = zero,xmm0[10],zero,zero,zero,zero,zero,xmm0[11],zero,zero,zero,zero,zero,xmm0[12],zero,zero
; AVX-NEXT: vpor %xmm2, %xmm9, %xmm2
; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[0,0,1,1]
-; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[1,1,2,2]
+; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[0,0,1,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm12[1,1,2,2]
; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm2, %ymm9
; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[1,0,2,2,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm11 = xmm14[3,3,3,3,4,5,6,7]
-; AVX-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,4,4]
-; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm2, %ymm11
+; AVX-NEXT: vpshuflw {{.*#+}} xmm10 = xmm14[3,3,3,3,4,5,6,7]
+; AVX-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,4,4]
+; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm2, %ymm10
; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
; AVX-NEXT: vandps %ymm2, %ymm9, %ymm9
-; AVX-NEXT: vandnps %ymm11, %ymm2, %ymm11
-; AVX-NEXT: vorps %ymm11, %ymm9, %ymm15
-; AVX-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,zero,xmm12[0,u],zero,zero,zero,zero,xmm12[1,u],zero,zero,zero,zero
-; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm15[0,1],xmm11[2],xmm15[3,4],xmm11[5],xmm15[6,7]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,4,128,6,7,8,9,10,128,12,13,14,15]
-; AVX-NEXT: vpshufb %xmm14, %xmm11, %xmm11
-; AVX-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,zero,zero,xmm13[0],zero,zero,zero,zero,zero,xmm13[1],zero,zero,zero,zero
-; AVX-NEXT: vpor %xmm10, %xmm11, %xmm11
-; AVX-NEXT: vextractf128 $1, %ymm15, %xmm10
-; AVX-NEXT: vmovdqa {{.*#+}} xmm15 = [2,u,128,128,128,128,3,u,128,128,128,128,4,u,128,128]
-; AVX-NEXT: vpshufb %xmm15, %xmm12, %xmm12
-; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0],xmm10[1,2],xmm12[3],xmm10[4,5],xmm12[6],xmm10[7]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm9 = [0,128,2,3,4,5,6,128,8,9,10,11,12,128,14,15]
-; AVX-NEXT: vpshufb %xmm9, %xmm10, %xmm10
-; AVX-NEXT: vmovdqa {{.*#+}} xmm12 = [128,2,128,128,128,128,128,3,128,128,128,128,128,4,128,128]
-; AVX-NEXT: vpshufb %xmm12, %xmm13, %xmm13
-; AVX-NEXT: vpor %xmm13, %xmm10, %xmm10
+; AVX-NEXT: vandnps %ymm10, %ymm2, %ymm10
+; AVX-NEXT: vorps %ymm10, %ymm9, %ymm9
+; AVX-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,zero,xmm11[0,u],zero,zero,zero,zero,xmm11[1,u],zero,zero,zero,zero
+; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm9[0,1],xmm12[2],xmm9[3,4],xmm12[5],xmm9[6,7]
+; AVX-NEXT: vmovdqa {{.*#+}} xmm14 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255]
+; AVX-NEXT: vpand %xmm14, %xmm12, %xmm12
+; AVX-NEXT: vmovdqa {{.*#+}} xmm15 = [128,128,128,128,128,0,128,128,128,128,128,1,128,128,128,128]
+; AVX-NEXT: vpshufb %xmm15, %xmm13, %xmm10
+; AVX-NEXT: vpor %xmm10, %xmm12, %xmm10
+; AVX-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vextractf128 $1, %ymm9, %xmm9
+; AVX-NEXT: vmovdqa {{.*#+}} xmm12 = [2,u,128,128,128,128,3,u,128,128,128,128,4,u,128,128]
+; AVX-NEXT: vpshufb %xmm12, %xmm11, %xmm11
+; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0],xmm9[1,2],xmm11[3],xmm9[4,5],xmm11[6],xmm9[7]
+; AVX-NEXT: vmovdqa {{.*#+}} xmm11 = [0,128,2,3,4,5,6,128,8,9,10,11,12,128,14,15]
+; AVX-NEXT: vpshufb %xmm11, %xmm9, %xmm9
+; AVX-NEXT: vmovdqa {{.*#+}} xmm10 = [128,2,128,128,128,128,128,3,128,128,128,128,128,4,128,128]
+; AVX-NEXT: vpshufb %xmm10, %xmm13, %xmm13
+; AVX-NEXT: vpor %xmm13, %xmm9, %xmm9
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7]
; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[0,0,1,1]
; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm6[1,1,2,2]
@@ -2145,16 +2147,16 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX-NEXT: vandps %ymm2, %ymm8, %ymm8
; AVX-NEXT: vandnps %ymm7, %ymm2, %ymm2
; AVX-NEXT: vorps %ymm2, %ymm8, %ymm7
-; AVX-NEXT: vpshufb %xmm15, %xmm1, %xmm2
+; AVX-NEXT: vpshufb %xmm12, %xmm1, %xmm2
; AVX-NEXT: vextractf128 $1, %ymm7, %xmm8
; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm8[1,2],xmm2[3],xmm8[4,5],xmm2[6],xmm8[7]
-; AVX-NEXT: vpshufb %xmm9, %xmm2, %xmm2
-; AVX-NEXT: vpshufb %xmm12, %xmm0, %xmm8
+; AVX-NEXT: vpshufb %xmm11, %xmm2, %xmm2
+; AVX-NEXT: vpshufb %xmm10, %xmm0, %xmm8
; AVX-NEXT: vpor %xmm2, %xmm8, %xmm2
; AVX-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,xmm1[0,u],zero,zero,zero,zero,xmm1[1,u],zero,zero,zero,zero
; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3,4],xmm8[5],xmm7[6,7]
-; AVX-NEXT: vpshufb %xmm14, %xmm7, %xmm7
-; AVX-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero
+; AVX-NEXT: vpand %xmm7, %xmm14, %xmm7
+; AVX-NEXT: vpshufb %xmm15, %xmm0, %xmm8
; AVX-NEXT: vpor %xmm7, %xmm8, %xmm7
; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,3,3,3]
; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,0,1,1]
@@ -2170,13 +2172,13 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX-NEXT: vorps %ymm4, %ymm3, %ymm3
; AVX-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm1[5,u],zero,zero,zero,zero,xmm1[6,u],zero,zero,zero,zero,xmm1[7,u]
; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0],xmm4[1],xmm3[2,3],xmm4[4],xmm3[5,6],xmm4[7]
-; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2],zero,xmm4[4,5,6,7,8],zero,xmm4[10,11,12,13,14],zero
+; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[7]
; AVX-NEXT: vpor %xmm5, %xmm4, %xmm4
; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[8,u],zero,zero,zero,zero,xmm1[9,u],zero,zero,zero,zero
; AVX-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7]
-; AVX-NEXT: vpshufb %xmm14, %xmm1, %xmm1
+; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4],zero,xmm1[6,7,8,9,10],zero,xmm1[12,13,14,15]
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[8],zero,zero,zero,zero,zero,xmm0[9],zero,zero,zero,zero
; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
@@ -2184,8 +2186,9 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX-NEXT: vmovdqa %xmm4, 32(%rax)
; AVX-NEXT: vmovdqa %xmm7, (%rax)
; AVX-NEXT: vmovdqa %xmm2, 16(%rax)
-; AVX-NEXT: vmovdqa %xmm10, 112(%rax)
-; AVX-NEXT: vmovdqa %xmm11, 96(%rax)
+; AVX-NEXT: vmovdqa %xmm9, 112(%rax)
+; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX-NEXT: vmovaps %xmm0, 96(%rax)
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovaps %xmm0, 64(%rax)
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
@@ -3956,7 +3959,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
;
; AVX-LABEL: store_i8_stride6_vf64:
; AVX: # %bb.0:
-; AVX-NEXT: subq $200, %rsp
+; AVX-NEXT: subq $184, %rsp
; AVX-NEXT: vmovdqa 48(%rsi), %xmm1
; AVX-NEXT: vmovdqa 48(%rdi), %xmm2
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
@@ -3971,174 +3974,172 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
; AVX-NEXT: vpshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,5,6,7,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,2,2,3]
-; AVX-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[1,0,2,2,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,1]
-; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5
-; AVX-NEXT: vandps %ymm5, %ymm10, %ymm5
-; AVX-NEXT: vorps %ymm2, %ymm5, %ymm6
-; AVX-NEXT: vextractf128 $1, %ymm6, %xmm5
+; AVX-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[1,0,2,2,4,5,6,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
+; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4
+; AVX-NEXT: vandps %ymm4, %ymm10, %ymm4
+; AVX-NEXT: vorps %ymm2, %ymm4, %ymm6
+; AVX-NEXT: vextractf128 $1, %ymm6, %xmm4
; AVX-NEXT: vmovdqa 48(%r8), %xmm2
; AVX-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,xmm2[8,u],zero,zero,zero,zero,xmm2[9,u],zero,zero,zero,zero
-; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2],xmm5[3,4],xmm7[5],xmm5[6,7]
-; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm5[0,1,2,3,4],zero,xmm5[6,7,8,9,10],zero,xmm5[12,13,14,15]
-; AVX-NEXT: vmovdqa 48(%r9), %xmm5
-; AVX-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,zero,xmm5[8],zero,zero,zero,zero,zero,xmm5[9],zero,zero,zero,zero
+; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2],xmm4[3,4],xmm7[5],xmm4[6,7]
+; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[0,1,2,3,4],zero,xmm4[6,7,8,9,10],zero,xmm4[12,13,14,15]
+; AVX-NEXT: vmovdqa 48(%r9), %xmm4
+; AVX-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,zero,zero,zero,xmm4[8],zero,zero,zero,zero,zero,xmm4[9],zero,zero,zero,zero
; AVX-NEXT: vpor %xmm7, %xmm8, %xmm7
; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm2[5,u],zero,zero,zero,zero,xmm2[6,u],zero,zero,zero,zero,xmm2[7,u]
; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm7[1],xmm6[2,3],xmm7[4],xmm6[5,6],xmm7[7]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,128,4,5,6,7,8,128,10,11,12,13,14,128]
-; AVX-NEXT: vpshufb %xmm8, %xmm6, %xmm6
-; AVX-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm5[5],zero,zero,zero,zero,zero,xmm5[6],zero,zero,zero,zero,zero,xmm5[7]
+; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6
+; AVX-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm4[5],zero,zero,zero,zero,zero,xmm4[6],zero,zero,zero,zero,zero,xmm4[7]
; AVX-NEXT: vpor %xmm7, %xmm6, %xmm6
; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[1,1,2,2]
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1
; AVX-NEXT: vandps %ymm1, %ymm10, %ymm1
-; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[3,3,3,3,4,5,6,7]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[3,3,3,3,4,5,6,7]
; AVX-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4]
-; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,3]
-; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4
-; AVX-NEXT: vandnps %ymm4, %ymm10, %ymm4
-; AVX-NEXT: vorps %ymm4, %ymm1, %ymm1
-; AVX-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX-NEXT: vmovdqa {{.*#+}} xmm15 = [128,128,13,u,128,128,128,128,14,u,128,128,128,128,15,u]
-; AVX-NEXT: vpshufb %xmm15, %xmm2, %xmm6
-; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1],xmm4[2,3],xmm6[4],xmm4[5,6],xmm6[7]
-; AVX-NEXT: vpshufb %xmm8, %xmm4, %xmm4
-; AVX-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm5[13],zero,zero,zero,zero,zero,xmm5[14],zero,zero,zero,zero,zero,xmm5[15]
-; AVX-NEXT: vpor %xmm6, %xmm4, %xmm4
-; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[10,u],zero,zero,zero,zero,xmm2[11,u],zero,zero,zero,zero,xmm2[12,u],zero,zero
-; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1,2],xmm4[3],xmm1[4,5],xmm4[6],xmm1[7]
-; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[2,3,4,5,6],zero,xmm1[8,9,10,11,12],zero,xmm1[14,15]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm13 = [128,10,128,128,128,128,128,11,128,128,128,128,128,12,128,128]
-; AVX-NEXT: vpshufb %xmm13, %xmm5, %xmm4
-; AVX-NEXT: vpor %xmm4, %xmm1, %xmm1
+; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,6,7,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,2,2,3]
+; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5
+; AVX-NEXT: vandnps %ymm5, %ymm10, %ymm5
+; AVX-NEXT: vorps %ymm5, %ymm1, %ymm1
+; AVX-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm2[13,u],zero,zero,zero,zero,xmm2[14,u],zero,zero,zero,zero,xmm2[15,u]
+; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3],xmm6[4],xmm5[5,6],xmm6[7]
+; AVX-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,128,4,5,6,7,8,128,10,11,12,13,14,128]
+; AVX-NEXT: vpshufb %xmm15, %xmm5, %xmm5
+; AVX-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm4[13],zero,zero,zero,zero,zero,xmm4[14],zero,zero,zero,zero,zero,xmm4[15]
+; AVX-NEXT: vpor %xmm6, %xmm5, %xmm5
+; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[10,u],zero,zero,zero,zero,xmm2[11,u],zero,zero,zero,zero,xmm2[12,u],zero,zero
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1,2],xmm5[3],xmm1[4,5],xmm5[6],xmm1[7]
+; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX-NEXT: vmovdqa {{.*#+}} xmm14 = [128,10,128,128,128,128,128,11,128,128,128,128,128,12,128,128]
+; AVX-NEXT: vpshufb %xmm14, %xmm4, %xmm5
+; AVX-NEXT: vpor %xmm5, %xmm1, %xmm1
; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovdqa 32(%rsi), %xmm6
; AVX-NEXT: vmovdqa 32(%rdi), %xmm11
-; AVX-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm11[8],xmm6[8],xmm11[9],xmm6[9],xmm11[10],xmm6[10],xmm11[11],xmm6[11],xmm11[12],xmm6[12],xmm11[13],xmm6[13],xmm11[14],xmm6[14],xmm11[15],xmm6[15]
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[1,1,2,2]
-; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[3,3,3,3]
-; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
+; AVX-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm11[8],xmm6[8],xmm11[9],xmm6[9],xmm11[10],xmm6[10],xmm11[11],xmm6[11],xmm11[12],xmm6[12],xmm11[13],xmm6[13],xmm11[14],xmm6[14],xmm11[15],xmm6[15]
+; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[1,1,2,2]
+; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[3,3,3,3]
+; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5
; AVX-NEXT: vmovdqa 32(%rcx), %xmm12
-; AVX-NEXT: vmovdqa 32(%rdx), %xmm14
-; AVX-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm8 = xmm4[3,3,3,3,4,5,6,7]
-; AVX-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4]
-; AVX-NEXT: vpshufhw {{.*#+}} xmm9 = xmm4[0,1,2,3,5,6,7,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,2,2,3]
-; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8
-; AVX-NEXT: vandps %ymm1, %ymm10, %ymm1
-; AVX-NEXT: vandnps %ymm8, %ymm10, %ymm8
-; AVX-NEXT: vorps %ymm1, %ymm8, %ymm9
-; AVX-NEXT: vmovdqa 32(%r8), %xmm1
-; AVX-NEXT: vextractf128 $1, %ymm9, %xmm8
-; AVX-NEXT: vpshufb %xmm15, %xmm1, %xmm10
-; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm10[1],xmm8[2,3],xmm10[4],xmm8[5,6],xmm10[7]
-; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[0,1,2],zero,xmm8[4,5,6,7,8],zero,xmm8[10,11,12,13,14],zero
-; AVX-NEXT: vmovdqa 32(%r9), %xmm8
-; AVX-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm8[13],zero,zero,zero,zero,zero,xmm8[14],zero,zero,zero,zero,zero,xmm8[15]
+; AVX-NEXT: vmovdqa 32(%rdx), %xmm13
+; AVX-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm9[3,3,3,3,4,5,6,7]
+; AVX-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,4,4]
+; AVX-NEXT: vpshufhw {{.*#+}} xmm8 = xmm9[0,1,2,3,5,6,7,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[2,2,2,3]
+; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7
+; AVX-NEXT: vandps %ymm5, %ymm10, %ymm5
+; AVX-NEXT: vandnps %ymm7, %ymm10, %ymm7
+; AVX-NEXT: vorps %ymm7, %ymm5, %ymm8
+; AVX-NEXT: vmovdqa 32(%r8), %xmm5
+; AVX-NEXT: vextractf128 $1, %ymm8, %xmm7
+; AVX-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,xmm5[13,u],zero,zero,zero,zero,xmm5[14,u],zero,zero,zero,zero,xmm5[15,u]
+; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm10[1],xmm7[2,3],xmm10[4],xmm7[5,6],xmm10[7]
+; AVX-NEXT: vpshufb %xmm15, %xmm7, %xmm10
+; AVX-NEXT: vmovdqa 32(%r9), %xmm7
+; AVX-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm7[13],zero,zero,zero,zero,zero,xmm7[14],zero,zero,zero,zero,zero,xmm7[15]
; AVX-NEXT: vpor %xmm15, %xmm10, %xmm10
; AVX-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm1[10,u],zero,zero,zero,zero,xmm1[11,u],zero,zero,zero,zero,xmm1[12,u],zero,zero
-; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1,2],xmm10[3],xmm9[4,5],xmm10[6],xmm9[7]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm15 = [0,128,2,3,4,5,6,128,8,9,10,11,12,128,14,15]
-; AVX-NEXT: vpshufb %xmm15, %xmm9, %xmm9
-; AVX-NEXT: vpshufb %xmm13, %xmm8, %xmm10
-; AVX-NEXT: vpor %xmm10, %xmm9, %xmm9
-; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[0,0,1,1]
+; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm5[10,u],zero,zero,zero,zero,xmm5[11,u],zero,zero,zero,zero,xmm5[12,u],zero,zero
+; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0],xmm8[1,2],xmm10[3],xmm8[4,5],xmm10[6],xmm8[7]
+; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm8, %xmm8
+; AVX-NEXT: vpshufb %xmm14, %xmm7, %xmm10
+; AVX-NEXT: vpor %xmm10, %xmm8, %xmm8
+; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,0,1,1]
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,2]
-; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm9, %ymm0
-; AVX-NEXT: vpshuflw {{.*#+}} xmm9 = xmm3[1,0,2,2,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,0,1]
+; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm8, %ymm0
+; AVX-NEXT: vpshuflw {{.*#+}} xmm8 = xmm3[1,0,2,2,4,5,6,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,1]
; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,3,3,3,4,5,6,7]
; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,4,4]
-; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm3
-; AVX-NEXT: vmovaps {{.*#+}} ymm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
-; AVX-NEXT: vandps %ymm0, %ymm9, %ymm0
-; AVX-NEXT: vandnps %ymm3, %ymm9, %ymm3
+; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm8, %ymm3
+; AVX-NEXT: vmovaps {{.*#+}} ymm8 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
+; AVX-NEXT: vandps %ymm0, %ymm8, %ymm0
+; AVX-NEXT: vandnps %ymm3, %ymm8, %ymm3
; AVX-NEXT: vorps %ymm3, %ymm0, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm2[2,u],zero,zero,zero,zero,xmm2[3,u],zero,zero,zero,zero,xmm2[4,u],zero,zero
-; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0],xmm3[1,2],xmm9[3],xmm3[4,5],xmm9[6],xmm3[7]
-; AVX-NEXT: vpshufb %xmm15, %xmm3, %xmm3
-; AVX-NEXT: vmovdqa {{.*#+}} xmm13 = [128,2,128,128,128,128,128,3,128,128,128,128,128,4,128,128]
-; AVX-NEXT: vpshufb %xmm13, %xmm5, %xmm9
-; AVX-NEXT: vpor %xmm3, %xmm9, %xmm3
+; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm2[2,u],zero,zero,zero,zero,xmm2[3,u],zero,zero,zero,zero,xmm2[4,u],zero,zero
+; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0],xmm3[1,2],xmm8[3],xmm3[4,5],xmm8[6],xmm3[7]
+; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0],zero,xmm3[2,3,4,5,6],zero,xmm3[8,9,10,11,12],zero,xmm3[14,15]
+; AVX-NEXT: vmovdqa {{.*#+}} xmm15 = [128,2,128,128,128,128,128,3,128,128,128,128,128,4,128,128]
+; AVX-NEXT: vpshufb %xmm15, %xmm4, %xmm8
+; AVX-NEXT: vpor %xmm3, %xmm8, %xmm3
; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovdqa {{.*#+}} xmm10 = [128,128,128,128,0,u,128,128,128,128,1,u,128,128,128,128]
-; AVX-NEXT: vpshufb %xmm10, %xmm2, %xmm2
+; AVX-NEXT: vmovdqa {{.*#+}} xmm14 = [128,128,128,128,0,u,128,128,128,128,1,u,128,128,128,128]
+; AVX-NEXT: vpshufb %xmm14, %xmm2, %xmm2
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3,4],xmm2[5],xmm0[6,7]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,128,6,7,8,9,10,128,12,13,14,15]
-; AVX-NEXT: vpshufb %xmm9, %xmm0, %xmm0
-; AVX-NEXT: vmovdqa {{.*#+}} xmm15 = [128,128,128,128,128,0,128,128,128,128,128,1,128,128,128,128]
-; AVX-NEXT: vpshufb %xmm15, %xmm5, %xmm2
+; AVX-NEXT: vmovdqa {{.*#+}} xmm10 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255]
+; AVX-NEXT: vpand %xmm0, %xmm10, %xmm0
+; AVX-NEXT: vmovdqa {{.*#+}} xmm8 = [128,128,128,128,128,0,128,128,128,128,128,1,128,128,128,128]
+; AVX-NEXT: vpshufb %xmm8, %xmm4, %xmm2
; AVX-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3],xmm11[4],xmm6[4],xmm11[5],xmm6[5],xmm11[6],xmm6[6],xmm11[7],xmm6[7]
; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,2,2]
; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3],xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[1,0,2,2,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,1]
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[1,0,2,2,4,5,6,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,1]
; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm3[3,3,3,3,4,5,6,7]
; AVX-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,4,4]
-; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
+; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
; AVX-NEXT: vmovaps {{.*#+}} ymm6 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
; AVX-NEXT: vandps %ymm6, %ymm2, %ymm2
-; AVX-NEXT: vandnps %ymm5, %ymm6, %ymm5
-; AVX-NEXT: vorps %ymm5, %ymm2, %ymm2
-; AVX-NEXT: vextractf128 $1, %ymm2, %xmm5
-; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[2,u],zero,zero,zero,zero,xmm1[3,u],zero,zero,zero,zero,xmm1[4,u],zero,zero
-; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0],xmm5[1,2],xmm6[3],xmm5[4,5],xmm6[6],xmm5[7]
-; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0],zero,xmm5[2,3,4,5,6],zero,xmm5[8,9,10,11,12],zero,xmm5[14,15]
-; AVX-NEXT: vpshufb %xmm13, %xmm8, %xmm6
-; AVX-NEXT: vpor %xmm6, %xmm5, %xmm5
-; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpshufb %xmm10, %xmm1, %xmm5
-; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3,4],xmm5[5],xmm2[6,7]
-; AVX-NEXT: vpshufb %xmm9, %xmm2, %xmm2
-; AVX-NEXT: vpshufb %xmm15, %xmm8, %xmm5
-; AVX-NEXT: vpor %xmm5, %xmm2, %xmm2
+; AVX-NEXT: vandnps %ymm4, %ymm6, %ymm4
+; AVX-NEXT: vorps %ymm4, %ymm2, %ymm2
+; AVX-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[2,u],zero,zero,zero,zero,xmm5[3,u],zero,zero,zero,zero,xmm5[4,u],zero,zero
+; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0],xmm4[1,2],xmm6[3],xmm4[4,5],xmm6[6],xmm4[7]
+; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0],zero,xmm4[2,3,4,5,6],zero,xmm4[8,9,10,11,12],zero,xmm4[14,15]
+; AVX-NEXT: vpshufb %xmm15, %xmm7, %xmm6
+; AVX-NEXT: vpor %xmm6, %xmm4, %xmm4
+; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpshufb %xmm14, %xmm5, %xmm4
+; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2],xmm2[3,4],xmm4[5],xmm2[6,7]
+; AVX-NEXT: vpand %xmm2, %xmm10, %xmm2
+; AVX-NEXT: vpshufb %xmm8, %xmm7, %xmm4
+; AVX-NEXT: vpor %xmm4, %xmm2, %xmm2
; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[0,0,1,1]
-; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,5,6,7,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,2,2,3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[1,0,2,2,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1]
-; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX-NEXT: vmovaps {{.*#+}} ymm15 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
-; AVX-NEXT: vandnps %ymm0, %ymm15, %ymm0
-; AVX-NEXT: vandps %ymm2, %ymm15, %ymm2
-; AVX-NEXT: vorps %ymm0, %ymm2, %ymm0
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,128,8,u,128,128,128,128,9,u,128,128,128,128]
-; AVX-NEXT: vpshufb %xmm5, %xmm1, %xmm3
-; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6,7]
-; AVX-NEXT: vpshufb %xmm9, %xmm2, %xmm2
-; AVX-NEXT: vmovdqa %xmm9, %xmm14
-; AVX-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,128,128,8,128,128,128,128,128,9,128,128,128,128]
-; AVX-NEXT: vpshufb %xmm6, %xmm8, %xmm3
-; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovdqa {{.*#+}} xmm7 = [128,128,5,u,128,128,128,128,6,u,128,128,128,128,7,u]
-; AVX-NEXT: vpshufb %xmm7, %xmm1, %xmm1
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,5,6,7,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,3]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[1,0,2,2,4,5,6,7]
+; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
+; AVX-NEXT: vandnps %ymm0, %ymm2, %ymm0
+; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
+; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,128,8,u,128,128,128,128,9,u,128,128,128,128]
+; AVX-NEXT: vpshufb %xmm6, %xmm5, %xmm2
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7]
+; AVX-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,128,6,7,8,9,10,128,12,13,14,15]
+; AVX-NEXT: vpshufb %xmm8, %xmm1, %xmm1
+; AVX-NEXT: vmovdqa {{.*#+}} xmm9 = [128,128,128,128,128,8,128,128,128,128,128,9,128,128,128,128]
+; AVX-NEXT: vpshufb %xmm9, %xmm7, %xmm2
+; AVX-NEXT: vpor %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill
+; AVX-NEXT: vmovdqa {{.*#+}} xmm12 = [128,128,5,u,128,128,128,128,6,u,128,128,128,128,7,u]
+; AVX-NEXT: vpshufb %xmm12, %xmm5, %xmm1
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,128,4,5,6,7,8,128,10,11,12,13,14,128]
-; AVX-NEXT: vpshufb %xmm13, %xmm0, %xmm0
-; AVX-NEXT: vmovdqa {{.*#+}} xmm9 = [128,128,128,5,128,128,128,128,128,6,128,128,128,128,128,7]
-; AVX-NEXT: vpshufb %xmm9, %xmm8, %xmm1
+; AVX-NEXT: vmovdqa {{.*#+}} xmm13 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0]
+; AVX-NEXT: vpand %xmm0, %xmm13, %xmm0
+; AVX-NEXT: vmovdqa {{.*#+}} xmm14 = [128,128,128,5,128,128,128,128,128,6,128,128,128,128,128,7]
+; AVX-NEXT: vpshufb %xmm14, %xmm7, %xmm1
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovdqa 16(%rsi), %xmm0
; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
@@ -4155,22 +4156,23 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[1,0,2,2,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1]
; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX-NEXT: vandnps %ymm1, %ymm15, %ymm1
-; AVX-NEXT: vandps %ymm2, %ymm15, %ymm2
+; AVX-NEXT: vmovaps {{.*#+}} ymm7 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
+; AVX-NEXT: vandnps %ymm1, %ymm7, %ymm1
+; AVX-NEXT: vandps %ymm7, %ymm2, %ymm2
; AVX-NEXT: vorps %ymm1, %ymm2, %ymm1
; AVX-NEXT: vmovdqa 16(%r8), %xmm3
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX-NEXT: vpshufb %xmm6, %xmm3, %xmm5
; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3,4],xmm5[5],xmm2[6,7]
-; AVX-NEXT: vpshufb %xmm14, %xmm2, %xmm5
+; AVX-NEXT: vpshufb %xmm8, %xmm2, %xmm5
; AVX-NEXT: vmovdqa 16(%r9), %xmm2
-; AVX-NEXT: vpshufb %xmm6, %xmm2, %xmm6
+; AVX-NEXT: vpshufb %xmm9, %xmm2, %xmm6
; AVX-NEXT: vpor %xmm6, %xmm5, %xmm5
; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpshufb %xmm7, %xmm3, %xmm5
+; AVX-NEXT: vpshufb %xmm12, %xmm3, %xmm5
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3],xmm5[4],xmm1[5,6],xmm5[7]
-; AVX-NEXT: vpshufb %xmm13, %xmm1, %xmm1
-; AVX-NEXT: vpshufb %xmm9, %xmm2, %xmm5
+; AVX-NEXT: vpand %xmm1, %xmm13, %xmm1
+; AVX-NEXT: vpshufb %xmm14, %xmm2, %xmm5
; AVX-NEXT: vpor %xmm5, %xmm1, %xmm1
; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,2]
@@ -4181,58 +4183,55 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,3]
; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1
-; AVX-NEXT: vandps %ymm0, %ymm15, %ymm0
-; AVX-NEXT: vandnps %ymm1, %ymm15, %ymm1
+; AVX-NEXT: vandps %ymm7, %ymm0, %ymm0
+; AVX-NEXT: vandnps %ymm1, %ymm7, %ymm1
; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vmovdqa {{.*#+}} xmm14 = [128,128,13,u,128,128,128,128,14,u,128,128,128,128,15,u]
-; AVX-NEXT: vpshufb %xmm14, %xmm3, %xmm4
+; AVX-NEXT: vmovdqa {{.*#+}} xmm13 = [128,128,13,u,128,128,128,128,14,u,128,128,128,128,15,u]
+; AVX-NEXT: vpshufb %xmm13, %xmm3, %xmm4
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3],xmm4[4],xmm1[5,6],xmm4[7]
-; AVX-NEXT: vpshufb %xmm13, %xmm1, %xmm1
-; AVX-NEXT: vmovdqa {{.*#+}} xmm15 = [128,128,128,13,128,128,128,128,128,14,128,128,128,128,128,15]
-; AVX-NEXT: vpshufb %xmm15, %xmm2, %xmm4
+; AVX-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,128,4,5,6,7,8,128,10,11,12,13,14,128]
+; AVX-NEXT: vpshufb %xmm15, %xmm1, %xmm1
+; AVX-NEXT: vmovdqa {{.*#+}} xmm14 = [128,128,128,13,128,128,128,128,128,14,128,128,128,128,128,15]
+; AVX-NEXT: vpshufb %xmm14, %xmm2, %xmm4
; AVX-NEXT: vpor %xmm4, %xmm1, %xmm1
; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovdqa {{.*#+}} xmm13 = [10,u,128,128,128,128,11,u,128,128,128,128,12,u,128,128]
-; AVX-NEXT: vpshufb %xmm13, %xmm3, %xmm1
+; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[10,u],zero,zero,zero,zero,xmm3[11,u],zero,zero,zero,zero,xmm3[12,u],zero,zero
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm13 = [0,128,2,3,4,5,6,128,8,9,10,11,12,128,14,15]
-; AVX-NEXT: vpshufb %xmm13, %xmm0, %xmm0
+; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm2[10],zero,zero,zero,zero,zero,xmm2[11],zero,zero,zero,zero,zero,xmm2[12],zero,zero
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovdqa (%rsi), %xmm8
; AVX-NEXT: vmovdqa (%rdi), %xmm7
-; AVX-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm7[8],xmm8[8],xmm7[9],xmm8[9],xmm7[10],xmm8[10],xmm7[11],xmm8[11],xmm7[12],xmm8[12],xmm7[13],xmm8[13],xmm7[14],xmm8[14],xmm7[15],xmm8[15]
-; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[1,1,2,2]
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm7[8],xmm8[8],xmm7[9],xmm8[9],xmm7[10],xmm8[10],xmm7[11],xmm8[11],xmm7[12],xmm8[12],xmm7[13],xmm8[13],xmm7[14],xmm8[14],xmm7[15],xmm8[15]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[1,1,2,2]
+; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[3,3,3,3]
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT: vmovdqa (%rcx), %xmm6
-; AVX-NEXT: vmovdqa (%rdx), %xmm5
-; AVX-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[3,3,3,3,4,5,6,7]
+; AVX-NEXT: vmovdqa (%rdx), %xmm4
+; AVX-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[3,3,3,3,4,5,6,7]
; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
-; AVX-NEXT: vpshufhw {{.*#+}} xmm12 = xmm9[0,1,2,3,5,6,7,7]
+; AVX-NEXT: vpshufhw {{.*#+}} xmm12 = xmm5[0,1,2,3,5,6,7,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,2,2,3]
; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm1, %ymm1
-; AVX-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
-; AVX-NEXT: vandps %ymm4, %ymm0, %ymm0
-; AVX-NEXT: vandnps %ymm1, %ymm4, %ymm1
+; AVX-NEXT: vmovaps {{.*#+}} ymm12 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
+; AVX-NEXT: vandps %ymm0, %ymm12, %ymm0
+; AVX-NEXT: vandnps %ymm1, %ymm12, %ymm1
; AVX-NEXT: vorps %ymm1, %ymm0, %ymm12
; AVX-NEXT: vmovdqa (%r8), %xmm1
-; AVX-NEXT: vpshufb %xmm14, %xmm1, %xmm0
-; AVX-NEXT: vextractf128 $1, %ymm12, %xmm14
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0],xmm0[1],xmm14[2,3],xmm0[4],xmm14[5,6],xmm0[7]
-; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm0[0,1,2],zero,xmm0[4,5,6,7,8],zero,xmm0[10,11,12,13,14],zero
+; AVX-NEXT: vpshufb %xmm13, %xmm1, %xmm0
+; AVX-NEXT: vextractf128 $1, %ymm12, %xmm13
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0],xmm0[1],xmm13[2,3],xmm0[4],xmm13[5,6],xmm0[7]
+; AVX-NEXT: vpshufb %xmm15, %xmm0, %xmm13
; AVX-NEXT: vmovdqa (%r9), %xmm0
-; AVX-NEXT: vpshufb %xmm15, %xmm0, %xmm15
-; AVX-NEXT: vpor %xmm15, %xmm14, %xmm4
-; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpshufb %xmm14, %xmm0, %xmm15
+; AVX-NEXT: vpor %xmm15, %xmm13, %xmm13
+; AVX-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm1[10,u],zero,zero,zero,zero,xmm1[11,u],zero,zero,zero,zero,xmm1[12,u],zero,zero
; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0],xmm12[1,2],xmm15[3],xmm12[4,5],xmm15[6],xmm12[7]
-; AVX-NEXT: vmovdqa %xmm13, %xmm4
-; AVX-NEXT: vpshufb %xmm13, %xmm12, %xmm12
+; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm12, %xmm12
; AVX-NEXT: vpshufb {{.*#+}} xmm15 = zero,xmm0[10],zero,zero,zero,zero,zero,xmm0[11],zero,zero,zero,zero,zero,xmm0[12],zero,zero
; AVX-NEXT: vpor %xmm15, %xmm12, %xmm12
; AVX-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -4244,24 +4243,24 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,3,3,3,4,5,6,7]
; AVX-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,4,4]
; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm15, %ymm10
-; AVX-NEXT: vmovaps {{.*#+}} ymm13 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
-; AVX-NEXT: vandps %ymm13, %ymm11, %ymm11
-; AVX-NEXT: vandnps %ymm10, %ymm13, %ymm10
+; AVX-NEXT: vmovaps {{.*#+}} ymm14 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535]
+; AVX-NEXT: vandps %ymm14, %ymm11, %ymm11
+; AVX-NEXT: vandnps %ymm10, %ymm14, %ymm10
; AVX-NEXT: vorps %ymm10, %ymm11, %ymm11
; AVX-NEXT: vextractf128 $1, %ymm11, %xmm10
; AVX-NEXT: vmovdqa {{.*#+}} xmm12 = [2,u,128,128,128,128,3,u,128,128,128,128,4,u,128,128]
; AVX-NEXT: vpshufb %xmm12, %xmm3, %xmm15
; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm15[0],xmm10[1,2],xmm15[3],xmm10[4,5],xmm15[6],xmm10[7]
-; AVX-NEXT: vpshufb %xmm4, %xmm10, %xmm10
-; AVX-NEXT: vmovdqa {{.*#+}} xmm14 = [128,2,128,128,128,128,128,3,128,128,128,128,128,4,128,128]
-; AVX-NEXT: vpshufb %xmm14, %xmm2, %xmm15
-; AVX-NEXT: vpor %xmm15, %xmm10, %xmm4
-; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0],zero,xmm10[2,3,4,5,6],zero,xmm10[8,9,10,11,12],zero,xmm10[14,15]
+; AVX-NEXT: vmovdqa {{.*#+}} xmm13 = [128,2,128,128,128,128,128,3,128,128,128,128,128,4,128,128]
+; AVX-NEXT: vpshufb %xmm13, %xmm2, %xmm15
+; AVX-NEXT: vpor %xmm15, %xmm10, %xmm10
+; AVX-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovdqa {{.*#+}} xmm15 = [128,128,128,128,0,u,128,128,128,128,1,u,128,128,128,128]
; AVX-NEXT: vpshufb %xmm15, %xmm3, %xmm3
; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0,1],xmm3[2],xmm11[3,4],xmm3[5],xmm11[6,7]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,4,128,6,7,8,9,10,128,12,13,14,15]
-; AVX-NEXT: vpshufb %xmm11, %xmm3, %xmm3
+; AVX-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255]
+; AVX-NEXT: vpand %xmm3, %xmm11, %xmm3
; AVX-NEXT: vmovdqa {{.*#+}} xmm10 = [128,128,128,128,128,0,128,128,128,128,128,1,128,128,128,128]
; AVX-NEXT: vpshufb %xmm10, %xmm2, %xmm2
; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
@@ -4269,33 +4268,32 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[0,0,1,1]
; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[1,1,2,2]
; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm3, %ymm3
-; AVX-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[1,0,2,2,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,0,1]
; AVX-NEXT: vpshuflw {{.*#+}} xmm8 = xmm4[3,3,3,3,4,5,6,7]
; AVX-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,4,4]
; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm6, %ymm6
-; AVX-NEXT: vandps %ymm3, %ymm13, %ymm3
-; AVX-NEXT: vandnps %ymm6, %ymm13, %ymm6
+; AVX-NEXT: vandps %ymm3, %ymm14, %ymm3
+; AVX-NEXT: vandnps %ymm6, %ymm14, %ymm6
; AVX-NEXT: vorps %ymm6, %ymm3, %ymm6
; AVX-NEXT: vpshufb %xmm12, %xmm1, %xmm3
; AVX-NEXT: vextractf128 $1, %ymm6, %xmm8
; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm8[1,2],xmm3[3],xmm8[4,5],xmm3[6],xmm8[7]
; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0],zero,xmm3[2,3,4,5,6],zero,xmm3[8,9,10,11,12],zero,xmm3[14,15]
-; AVX-NEXT: vpshufb %xmm14, %xmm0, %xmm8
+; AVX-NEXT: vpshufb %xmm13, %xmm0, %xmm8
; AVX-NEXT: vpor %xmm3, %xmm8, %xmm3
; AVX-NEXT: vpshufb %xmm15, %xmm1, %xmm8
; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2],xmm6[3,4],xmm8[5],xmm6[6,7]
-; AVX-NEXT: vpshufb %xmm11, %xmm6, %xmm6
+; AVX-NEXT: vpand %xmm6, %xmm11, %xmm6
; AVX-NEXT: vpshufb %xmm10, %xmm0, %xmm8
; AVX-NEXT: vpor %xmm6, %xmm8, %xmm6
; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,3,3,3]
-; AVX-NEXT: vpermilps $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
-; AVX-NEXT: # xmm8 = mem[0,0,1,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[0,0,1,1]
; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7
; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,7,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,2,3]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[1,0,2,2,4,5,6,7]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,0,2,2,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,0,1]
; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4
; AVX-NEXT: vmovaps {{.*#+}} ymm8 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535]
@@ -4305,12 +4303,12 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,xmm1[8,u],zero,zero,zero,zero,xmm1[9,u],zero,zero,zero,zero
; AVX-NEXT: vextractf128 $1, %ymm4, %xmm7
; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1],xmm5[2],xmm7[3,4],xmm5[5],xmm7[6,7]
-; AVX-NEXT: vpshufb %xmm11, %xmm5, %xmm5
+; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,2,3,4],zero,xmm5[6,7,8,9,10],zero,xmm5[12,13,14,15]
; AVX-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,xmm0[8],zero,zero,zero,zero,zero,xmm0[9],zero,zero,zero,zero
; AVX-NEXT: vpor %xmm7, %xmm5, %xmm5
; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[5,u],zero,zero,zero,zero,xmm1[6,u],zero,zero,zero,zero,xmm1[7,u]
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm1[1],xmm4[2,3],xmm1[4],xmm4[5,6],xmm1[7]
-; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2],zero,xmm1[4,5,6,7,8],zero,xmm1[10,11,12,13,14],zero
+; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,xmm0[7]
; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
@@ -4333,9 +4331,9 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX-NEXT: vmovaps %xmm0, 128(%rax)
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovaps %xmm0, 144(%rax)
-; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
-; AVX-NEXT: vmovaps %xmm0, 224(%rax)
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX-NEXT: vmovaps %xmm0, 224(%rax)
+; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovaps %xmm0, 240(%rax)
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovaps %xmm0, 192(%rax)
@@ -4357,7 +4355,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX-NEXT: vmovaps %xmm0, 320(%rax)
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovaps %xmm0, 336(%rax)
-; AVX-NEXT: addq $200, %rsp
+; AVX-NEXT: addq $184, %rsp
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
index 02ec9fc66feab..cf35f47866f29 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll
@@ -2089,7 +2089,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 | (ymm7 & ~mem)
; AVX512-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (mem & (ymm6 ^ ymm5))
; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[6,14,u,u,u],zero,zero,xmm3[7,15,u,u,u]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm3[6,14],zero,zero,zero,zero,zero,xmm3[7,15],zero,zero,zero
; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3]
; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,6,14],zero,zero,xmm4[u,u,u,7,15],zero,zero,xmm4[u,u,u]
; AVX512-NEXT: vpor %xmm3, %xmm4, %xmm3
@@ -2156,7 +2156,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm8 & ~mem)
; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (mem & (ymm7 ^ ymm5))
; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,xmm4[6,14,u,u,u],zero,zero,xmm4[7,15,u,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,xmm4[6,14],zero,zero,zero,zero,zero,xmm4[7,15],zero,zero,zero
; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,6,14],zero,zero,xmm3[u,u,u,7,15],zero,zero,xmm3[u,u,u]
; AVX512-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3
@@ -2226,7 +2226,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 | (ymm7 & ~mem)
; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm6 = ymm6 ^ (mem & (ymm6 ^ ymm5))
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[6,14,u,u,u],zero,zero,xmm3[7,15,u,u,u]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm3[6,14],zero,zero,zero,zero,zero,xmm3[7,15],zero,zero,zero
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,6,14],zero,zero,xmm4[u,u,u,7,15],zero,zero,xmm4[u,u,u]
; AVX512DQ-NEXT: vpor %xmm3, %xmm4, %xmm3
@@ -2293,7 +2293,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 | (ymm8 & ~mem)
; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm7 = ymm7 ^ (mem & (ymm7 ^ ymm5))
; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u],zero,zero,xmm4[6,14,u,u,u],zero,zero,xmm4[7,15,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,xmm4[6,14],zero,zero,zero,zero,zero,xmm4[7,15],zero,zero,zero
; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,6,14],zero,zero,xmm3[u,u,u,7,15],zero,zero,xmm3[u,u,u]
; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm3, %xmm3
@@ -3331,266 +3331,263 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
;
; AVX-LABEL: store_i8_stride7_vf32:
; AVX: # %bb.0:
-; AVX-NEXT: subq $216, %rsp
+; AVX-NEXT: subq $184, %rsp
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT: vmovdqa 16(%rax), %xmm14
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm14[u,u,u],zero,zero,xmm14[9,u,u,u,u],zero,zero,xmm14[10,u,u,u]
-; AVX-NEXT: vmovdqa 16(%r9), %xmm2
-; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovdqa 16(%r9), %xmm10
; AVX-NEXT: vmovdqa 16(%r8), %xmm3
; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
+; AVX-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15]
; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,2,3],zero,xmm1[u,u,u,u,4,5],zero,xmm1[u,u,u]
; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vmovdqa {{.*#+}} xmm11 = [u,u,u,u,u,128,7,u,u,u,u,u,128,8,u,u]
-; AVX-NEXT: vpshufb %xmm11, %xmm2, %xmm1
-; AVX-NEXT: vmovdqa {{.*#+}} xmm12 = [u,u,u,u,u,7,128,u,u,u,u,u,8,128,u,u]
-; AVX-NEXT: vpshufb %xmm12, %xmm3, %xmm3
-; AVX-NEXT: vpor %xmm1, %xmm3, %xmm1
-; AVX-NEXT: vmovdqa {{.*#+}} xmm8 = [128,u,u,u,u,5,6,128,u,u,u,u,12,13,128,u]
-; AVX-NEXT: vpshufb %xmm8, %xmm1, %xmm1
-; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [6,u,u,u,u,128,128,7,u,u,u,u,128,128,8,u]
-; AVX-NEXT: vpshufb %xmm5, %xmm14, %xmm3
-; AVX-NEXT: vpor %xmm3, %xmm1, %xmm1
+; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,u,u,u,128,7,u,u,u,u,u,128,8,u,u]
+; AVX-NEXT: vpshufb %xmm4, %xmm10, %xmm1
+; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,u,u,u,7,128,u,u,u,u,u,8,128,u,u]
+; AVX-NEXT: vpshufb %xmm5, %xmm3, %xmm2
+; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1
+; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX-NEXT: vmovdqa {{.*#+}} xmm11 = [6,u,u,u,u,128,128,7,u,u,u,u,128,128,8,u]
+; AVX-NEXT: vpshufb %xmm11, %xmm14, %xmm2
+; AVX-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm7
-; AVX-NEXT: vmovdqa 16(%rcx), %xmm1
-; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,u,128,7,u,u,u,u,u,128,8,u,u,u,u]
-; AVX-NEXT: vpshufb %xmm4, %xmm1, %xmm0
-; AVX-NEXT: vmovdqa 16(%rdx), %xmm2
-; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,7,128,u,u,u,u,u,8,128,u,u,u,u]
-; AVX-NEXT: vpshufb %xmm9, %xmm2, %xmm3
-; AVX-NEXT: vpor %xmm0, %xmm3, %xmm0
-; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,2,3,u,u,u,u,u,4,5,u,u,u,u,u,6]
-; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm15
-; AVX-NEXT: vmovdqa 16(%rsi), %xmm10
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [u,128,7,u,u,u,u,u,128,8,u,u,u,u,u,128]
-; AVX-NEXT: vpshufb %xmm1, %xmm10, %xmm0
+; AVX-NEXT: vmovdqa 16(%rcx), %xmm13
+; AVX-NEXT: vmovdqa {{.*#+}} xmm15 = [u,u,u,128,7,u,u,u,u,u,128,8,u,u,u,u]
+; AVX-NEXT: vpshufb %xmm15, %xmm13, %xmm2
+; AVX-NEXT: vmovdqa 16(%rdx), %xmm12
+; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm12[u,u,u,7],zero,xmm12[u,u,u,u,u,8],zero,xmm12[u,u,u,u]
+; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm12[8],xmm13[8],xmm12[9],xmm13[9],xmm12[10],xmm13[10],xmm12[11],xmm13[11],xmm12[12],xmm13[12],xmm12[13],xmm13[13],xmm12[14],xmm13[14],xmm12[15],xmm13[15]
+; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,2,3,u,u,u,u,u,4,5,u,u,u,u,u,6]
+; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm8
+; AVX-NEXT: vmovdqa 16(%rsi), %xmm9
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm9[u],zero,xmm9[7,u,u,u,u,u],zero,xmm9[8,u,u,u,u,u],zero
; AVX-NEXT: vmovdqa 16(%rdi), %xmm6
-; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,7],zero,xmm6[u,u,u,u,u,8],zero,xmm6[u,u,u,u,u,9]
-; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0
-; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm10[8],xmm6[8],xmm10[9],xmm6[9],xmm10[10],xmm6[10],xmm10[11],xmm6[11],xmm10[12],xmm6[12],xmm10[13],xmm6[13],xmm10[14],xmm6[14],xmm10[15],xmm6[15]
-; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u]
-; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0]
-; AVX-NEXT: vandnps %ymm15, %ymm2, %ymm15
-; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
-; AVX-NEXT: vorps %ymm0, %ymm15, %ymm0
-; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255]
-; AVX-NEXT: vandnps %ymm7, %ymm2, %ymm7
-; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[u,7],zero,xmm6[u,u,u,u,u,8],zero,xmm6[u,u,u,u,u,9]
+; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm9[8],xmm6[8],xmm9[9],xmm6[9],xmm9[10],xmm6[10],xmm9[11],xmm6[11],xmm9[12],xmm6[12],xmm9[13],xmm6[13],xmm9[14],xmm6[14],xmm9[15],xmm6[15]
+; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u]
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0]
+; AVX-NEXT: vandnps %ymm8, %ymm1, %ymm8
+; AVX-NEXT: vandps %ymm1, %ymm0, %ymm0
+; AVX-NEXT: vorps %ymm0, %ymm8, %ymm0
+; AVX-NEXT: vmovaps {{.*#+}} ymm1 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255]
+; AVX-NEXT: vandnps %ymm7, %ymm1, %ymm7
+; AVX-NEXT: vandps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vorps %ymm7, %ymm0, %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovdqa (%r9), %xmm7
-; AVX-NEXT: vpshufb %xmm11, %xmm7, %xmm0
-; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovdqa (%r9), %xmm2
+; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm0
+; AVX-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovdqa (%r8), %xmm3
; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpshufb %xmm12, %xmm3, %xmm2
-; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0
-; AVX-NEXT: vpshufb %xmm8, %xmm0, %xmm0
-; AVX-NEXT: vmovdqa (%rax), %xmm8
-; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpshufb %xmm5, %xmm8, %xmm2
-; AVX-NEXT: vpor %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,4,u,u,u,u,128,128,5,u,u,u,u,128,128]
-; AVX-NEXT: vpshufb %xmm5, %xmm8, %xmm2
-; AVX-NEXT: vmovdqa %xmm5, %xmm8
-; AVX-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7]
-; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[8,9],zero,xmm3[u,u,u,u,10,11],zero,xmm3[u,u,u,u,12,13]
-; AVX-NEXT: vpor %xmm2, %xmm5, %xmm2
-; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm15
-; AVX-NEXT: vmovdqa (%rcx), %xmm3
-; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm0
-; AVX-NEXT: vmovdqa (%rdx), %xmm13
-; AVX-NEXT: vpshufb %xmm9, %xmm13, %xmm2
-; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0
-; AVX-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3],xmm13[4],xmm3[4],xmm13[5],xmm3[5],xmm13[6],xmm3[6],xmm13[7],xmm3[7]
+; AVX-NEXT: vpshufb %xmm5, %xmm3, %xmm1
+; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[u,u,u,u,5,6],zero,xmm0[u,u,u,u,12,13],zero,xmm0[u]
+; AVX-NEXT: vmovdqa (%rax), %xmm4
+; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpshufb %xmm11, %xmm4, %xmm1
+; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,u,u,u,u,u,10,11,u,u,u,u,u,12,13]
+; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX-NEXT: vmovdqa {{.*#+}} xmm7 = [128,128,4,u,u,u,u,128,128,5,u,u,u,u,128,128]
+; AVX-NEXT: vpshufb %xmm7, %xmm4, %xmm5
+; AVX-NEXT: vpor %xmm5, %xmm1, %xmm1
+; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm11
+; AVX-NEXT: vmovdqa (%rcx), %xmm2
; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u]
-; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX-NEXT: vpshufb %xmm15, %xmm2, %xmm0
+; AVX-NEXT: vmovdqa (%rdx), %xmm3
+; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[u,u,u,7],zero,xmm3[u,u,u,u,u,8],zero,xmm3[u,u,u,u]
+; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u]
+; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
; AVX-NEXT: vmovdqa (%rsi), %xmm5
-; AVX-NEXT: vpshufb %xmm1, %xmm5, %xmm2
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm5[u],zero,xmm5[7,u,u,u,u,u],zero,xmm5[8,u,u,u,u,u],zero
; AVX-NEXT: vmovdqa (%rdi), %xmm3
-; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[u,7],zero,xmm3[u,u,u,u,u,8],zero,xmm3[u,u,u,u,u,9]
-; AVX-NEXT: vpor %xmm2, %xmm11, %xmm2
-; AVX-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
-; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovdqa {{.*#+}} xmm12 = [u,u,u,10,11,u,u,u,u,u,12,13,u,u,u,u]
-; AVX-NEXT: vpshufb %xmm12, %xmm9, %xmm11
-; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm11, %ymm11
+; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm3[u,7],zero,xmm3[u,u,u,u,u,8],zero,xmm3[u,u,u,u,u,9]
+; AVX-NEXT: vpor %xmm0, %xmm8, %xmm0
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
+; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,10,11,u,u,u,u,u,12,13,u,u,u,u]
+; AVX-NEXT: vpshufb %xmm8, %xmm4, %xmm15
+; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm15, %ymm15
; AVX-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255]
-; AVX-NEXT: vandnps %ymm0, %ymm4, %ymm0
-; AVX-NEXT: vandps %ymm4, %ymm11, %ymm11
-; AVX-NEXT: vorps %ymm0, %ymm11, %ymm0
-; AVX-NEXT: vmovaps {{.*#+}} ymm11 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255]
-; AVX-NEXT: vandnps %ymm15, %ymm11, %ymm15
-; AVX-NEXT: vandps %ymm0, %ymm11, %ymm0
-; AVX-NEXT: vorps %ymm0, %ymm15, %ymm0
+; AVX-NEXT: vandnps %ymm1, %ymm4, %ymm1
+; AVX-NEXT: vandps %ymm4, %ymm15, %ymm15
+; AVX-NEXT: vorps %ymm1, %ymm15, %ymm1
+; AVX-NEXT: vmovaps {{.*#+}} ymm15 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255]
+; AVX-NEXT: vandnps %ymm11, %ymm15, %ymm11
+; AVX-NEXT: vandps %ymm1, %ymm15, %ymm1
+; AVX-NEXT: vorps %ymm1, %ymm11, %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpshufb %xmm8, %xmm14, %xmm0
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX-NEXT: vpshufb %xmm7, %xmm14, %xmm1
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3],xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
-; AVX-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill
-; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm1[8,9],zero,xmm1[u,u,u,u,10,11],zero,xmm1[u,u,u,u,12,13]
-; AVX-NEXT: vpor %xmm0, %xmm11, %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm14[u,u],zero,zero,xmm14[2,u,u,u,u],zero,zero,xmm14[3,u,u,u,u]
-; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm1[u,u,4,5],zero,xmm1[u,u,u,u,6,7],zero,xmm1[u,u,u,u]
-; AVX-NEXT: vpor %xmm11, %xmm15, %xmm11
-; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm11, %ymm15
-; AVX-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3],xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7]
-; AVX-NEXT: vpshufb %xmm12, %xmm11, %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u]
-; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm12, %ymm12
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm9[u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u]
-; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm9[4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9]
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
+; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[8,9],zero,xmm0[u,u,u,u,10,11],zero,xmm0[u,u,u,u,12,13]
+; AVX-NEXT: vpor %xmm1, %xmm11, %xmm1
+; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,4,5,u,u,u,u,u,6,7,u,u,u,u,u]
+; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm11, %xmm11
+; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u],zero,zero,xmm14[2,u,u,u,u],zero,zero,xmm14[3,u,u,u,u]
+; AVX-NEXT: vpor %xmm15, %xmm11, %xmm11
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm11, %ymm2
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7]
+; AVX-NEXT: vpshufb %xmm8, %xmm11, %xmm8
+; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u]
+; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm15, %ymm15
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3],xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm8[u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u]
+; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm8[4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9]
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm14, %ymm0
-; AVX-NEXT: vandnps %ymm12, %ymm4, %ymm12
+; AVX-NEXT: vandnps %ymm15, %ymm4, %ymm14
; AVX-NEXT: vandps %ymm4, %ymm0, %ymm0
-; AVX-NEXT: vorps %ymm0, %ymm12, %ymm0
+; AVX-NEXT: vorps %ymm0, %ymm14, %ymm0
; AVX-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0]
-; AVX-NEXT: vandnps %ymm15, %ymm4, %ymm12
+; AVX-NEXT: vandnps %ymm2, %ymm4, %ymm1
; AVX-NEXT: vandps %ymm4, %ymm0, %ymm0
-; AVX-NEXT: vorps %ymm0, %ymm12, %ymm0
+; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm10[8],xmm6[9],xmm10[9],xmm6[10],xmm10[10],xmm6[11],xmm10[11],xmm6[12],xmm10[12],xmm6[13],xmm10[13],xmm6[14],xmm10[14],xmm6[15],xmm10[15]
-; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,u,u,8,9,u,u,u,u,u,10,11,u,u,u]
-; AVX-NEXT: vpshufb %xmm10, %xmm0, %xmm0
-; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
+; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm9[8],xmm6[9],xmm9[9],xmm6[10],xmm9[10],xmm6[11],xmm9[11],xmm6[12],xmm9[12],xmm6[13],xmm9[13],xmm6[14],xmm9[14],xmm6[15],xmm9[15]
+; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u]
+; AVX-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,u,u,8,9,u,u,u,u,u,10,11,u,u,u]
+; AVX-NEXT: vpshufb %xmm6, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm4
+; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15]
+; AVX-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,u,12,13,u,u,u,u,u,14,15,u,u,u]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[u,u,u,u,12,13,u,u,u,u,u,14,15,u,u,u]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm15 = [6,u,u,u,u,u,9,8,u,u,u,u,u,11,10,u]
-; AVX-NEXT: vpshufb %xmm15, %xmm4, %xmm4
-; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm6
-; AVX-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255]
-; AVX-NEXT: vandnps %ymm0, %ymm4, %ymm0
-; AVX-NEXT: vandps %ymm4, %ymm6, %ymm6
-; AVX-NEXT: vorps %ymm0, %ymm6, %ymm0
-; AVX-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15]
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
-; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,6,7,u,u,u,u,u,8,9,u,u,u,u,u,10]
-; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u]
-; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm12, %ymm6
-; AVX-NEXT: vmovaps {{.*#+}} ymm12 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255]
-; AVX-NEXT: vandps %ymm0, %ymm12, %ymm0
-; AVX-NEXT: vandnps %ymm6, %ymm12, %ymm6
-; AVX-NEXT: vorps %ymm6, %ymm0, %ymm0
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm6
-; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[0],zero,xmm6[2,3,4,5,6,7],zero,xmm6[9,10,11,12,13,14],zero
+; AVX-NEXT: vpshufb %xmm9, %xmm1, %xmm12
+; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [6,u,u,u,u,u,9,8,u,u,u,u,u,11,10,u]
+; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12
+; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255]
+; AVX-NEXT: vandnps %ymm4, %ymm0, %ymm4
+; AVX-NEXT: vandps %ymm0, %ymm12, %ymm12
+; AVX-NEXT: vorps %ymm4, %ymm12, %ymm4
+; AVX-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm10[8],xmm7[8],xmm10[9],xmm7[9],xmm10[10],xmm7[10],xmm10[11],xmm7[11],xmm10[12],xmm7[12],xmm10[13],xmm7[13],xmm10[14],xmm7[14],xmm10[15],xmm7[15]
+; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u]
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,6,7,u,u,u,u,u,8,9,u,u,u,u,u,10]
+; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm12
+; AVX-NEXT: vmovaps {{.*#+}} ymm13 = [255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255]
+; AVX-NEXT: vandps %ymm4, %ymm13, %ymm4
+; AVX-NEXT: vandnps %ymm12, %ymm13, %ymm12
+; AVX-NEXT: vorps %ymm4, %ymm12, %ymm4
+; AVX-NEXT: vextractf128 $1, %ymm4, %xmm12
+; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0],zero,xmm12[2,3,4,5,6,7],zero,xmm12[9,10,11,12,13,14],zero
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; AVX-NEXT: vpshufb {{.*#+}} xmm12 = zero,xmm14[13],zero,zero,zero,zero,zero,zero,xmm14[14],zero,zero,zero,zero,zero,zero,xmm14[15]
-; AVX-NEXT: vpor %xmm6, %xmm12, %xmm1
-; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2],zero,xmm0[4,5,6,7,8,9],zero,xmm0[11,12,13,14,15]
-; AVX-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm14[11],zero,zero,zero,zero,zero,zero,xmm14[12],zero,zero,zero,zero,zero
-; AVX-NEXT: vpor %xmm6, %xmm0, %xmm0
-; AVX-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15]
-; AVX-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15]
-; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u]
-; AVX-NEXT: vpshufb %xmm10, %xmm12, %xmm2
-; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm13[8],xmm3[9],xmm13[9],xmm3[10],xmm13[10],xmm3[11],xmm13[11],xmm3[12],xmm13[12],xmm3[13],xmm13[13],xmm3[14],xmm13[14],xmm3[15],xmm13[15]
-; AVX-NEXT: vpshufb %xmm15, %xmm2, %xmm2
-; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm13[8],xmm3[8],xmm13[9],xmm3[9],xmm13[10],xmm3[10],xmm13[11],xmm3[11],xmm13[12],xmm3[12],xmm13[13],xmm3[13],xmm13[14],xmm3[14],xmm13[15],xmm3[15]
+; AVX-NEXT: vpshufb {{.*#+}} xmm13 = zero,xmm14[13],zero,zero,zero,zero,zero,zero,xmm14[14],zero,zero,zero,zero,zero,zero,xmm14[15]
+; AVX-NEXT: vpor %xmm13, %xmm12, %xmm2
+; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2],zero,xmm4[4,5,6,7,8,9],zero,xmm4[11,12,13,14,15]
+; AVX-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm14[11],zero,zero,zero,zero,zero,zero,xmm14[12],zero,zero,zero,zero,zero
+; AVX-NEXT: vpor %xmm4, %xmm12, %xmm4
+; AVX-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15]
+; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15]
+; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u]
+; AVX-NEXT: vpshufb %xmm6, %xmm13, %xmm3
+; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15]
+; AVX-NEXT: vpshufb %xmm1, %xmm3, %xmm1
+; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15]
; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[u,2,3,u,u,u,u,u,4,5,u,u,u,u,u,6]
-; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1
; AVX-NEXT: vmovaps {{.*#+}} ymm5 = [0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255]
-; AVX-NEXT: vandnps %ymm1, %ymm5, %ymm1
-; AVX-NEXT: vandps %ymm5, %ymm2, %ymm2
-; AVX-NEXT: vorps %ymm1, %ymm2, %ymm1
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm8[u,u,u],zero,zero,xmm8[9,u,u,u,u],zero,zero,xmm8[10,u,u,u]
+; AVX-NEXT: vandnps %ymm2, %ymm5, %ymm2
+; AVX-NEXT: vandps %ymm5, %ymm1, %ymm1
+; AVX-NEXT: vorps %ymm2, %ymm1, %ymm1
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
+; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm10[u,u,u],zero,zero,xmm10[9,u,u,u,u],zero,zero,xmm10[10,u,u,u]
+; AVX-NEXT: vmovdqa (%rsp), %xmm12 # 16-byte Reload
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
-; AVX-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm15[8],xmm10[8],xmm15[9],xmm10[9],xmm15[10],xmm10[10],xmm15[11],xmm10[11],xmm15[12],xmm10[12],xmm15[13],xmm10[13],xmm15[14],xmm10[14],xmm15[15],xmm10[15]
-; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[u,u,u,2,3],zero,xmm5[u,u,u,u,4,5],zero,xmm5[u,u,u]
+; AVX-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm15[8],xmm12[8],xmm15[9],xmm12[9],xmm15[10],xmm12[10],xmm15[11],xmm12[11],xmm15[12],xmm12[12],xmm15[13],xmm12[13],xmm15[14],xmm12[14],xmm15[15],xmm12[15]
+; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[u,u,u,2,3,u,u,u,u,u,4,5,u,u,u,u]
+; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6
; AVX-NEXT: vpor %xmm2, %xmm6, %xmm2
; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,6,7],zero,xmm5[u,u,u,u,8,9],zero,xmm5[u,u,u,u,10]
-; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[u],zero,zero,xmm8[11,u,u,u,u],zero,zero,xmm8[12,u,u,u,u],zero
+; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm10[u],zero,zero,xmm10[11,u,u,u,u],zero,zero,xmm10[12,u,u,u,u],zero
; AVX-NEXT: vpor %xmm6, %xmm5, %xmm5
; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2
; AVX-NEXT: vmovaps {{.*#+}} ymm5 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0]
; AVX-NEXT: vandps %ymm5, %ymm1, %ymm1
; AVX-NEXT: vandnps %ymm2, %ymm5, %ymm2
; AVX-NEXT: vorps %ymm2, %ymm1, %ymm6
-; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[u,u,u,u,12,13,u,u,u,u,u,14,15,u,u,u]
+; AVX-NEXT: vpshufb %xmm9, %xmm3, %xmm1
; AVX-NEXT: vpmovsxdq {{.*#+}} xmm2 = [16777216,197120]
-; AVX-NEXT: vpshufb %xmm2, %xmm9, %xmm3
+; AVX-NEXT: vpshufb %xmm2, %xmm8, %xmm3
; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm12[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u]
+; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm13[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u]
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,u,u,u,u,u,2,3,u,u,u,u,u,4,5]
; AVX-NEXT: vpshufb %xmm3, %xmm11, %xmm7
; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5
-; AVX-NEXT: vandnps %ymm1, %ymm4, %ymm1
-; AVX-NEXT: vandps %ymm4, %ymm5, %ymm4
-; AVX-NEXT: vorps %ymm1, %ymm4, %ymm4
+; AVX-NEXT: vandnps %ymm1, %ymm0, %ymm1
+; AVX-NEXT: vandps %ymm0, %ymm5, %ymm0
+; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,u,0,1,u,u,u,u,u,2,3,u,u,u]
-; AVX-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX-NEXT: vpshufb %xmm1, %xmm5, %xmm5
; AVX-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[4,5,6,7,8,9,10,11,12,13,14,15],xmm14[0,1,2,3]
-; AVX-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm10[8],xmm15[8],xmm10[9],xmm15[9],xmm10[10],xmm15[10],xmm10[11],xmm15[11],xmm10[12],xmm15[12],xmm10[13],xmm15[13],xmm10[14],xmm15[14],xmm10[15],xmm15[15]
-; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[10],zero,xmm7[u,u,u,u,13,12],zero,xmm7[u,u,u,u,15,14],zero
-; AVX-NEXT: vpshufb {{.*#+}} xmm9 = zero,xmm8[13,u,u,u,u],zero,zero,xmm8[14,u,u,u,u],zero,zero,xmm8[15]
-; AVX-NEXT: vpor %xmm7, %xmm9, %xmm7
-; AVX-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,u,0,1,12,u,u,u,u,7,8,13,u,u]
-; AVX-NEXT: vpshufb %xmm9, %xmm5, %xmm5
+; AVX-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm12[8],xmm15[8],xmm12[9],xmm15[9],xmm12[10],xmm15[10],xmm12[11],xmm15[11],xmm12[12],xmm15[12],xmm12[13],xmm15[13],xmm12[14],xmm15[14],xmm12[15],xmm15[15]
+; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u]
+; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7, %xmm7
+; AVX-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm10[13,u,u,u,u],zero,zero,xmm10[14,u,u,u,u],zero,zero,xmm10[15]
+; AVX-NEXT: vpor %xmm7, %xmm8, %xmm7
+; AVX-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,u,0,1,12,u,u,u,u,7,8,13,u,u]
+; AVX-NEXT: vpshufb %xmm8, %xmm5, %xmm5
; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm5
; AVX-NEXT: vmovaps {{.*#+}} ymm7 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255]
-; AVX-NEXT: vandps %ymm7, %ymm4, %ymm4
+; AVX-NEXT: vandps %ymm7, %ymm0, %ymm0
; AVX-NEXT: vandnps %ymm5, %ymm7, %ymm5
-; AVX-NEXT: vorps %ymm5, %ymm4, %ymm5
+; AVX-NEXT: vorps %ymm5, %ymm0, %ymm5
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9]
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm7[4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9]
; AVX-NEXT: vpshufb %xmm2, %xmm7, %xmm2
-; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u]
+; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm7[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u]
; AVX-NEXT: vpshufb %xmm3, %xmm7, %xmm3
-; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
-; AVX-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0]
-; AVX-NEXT: vandnps %ymm2, %ymm4, %ymm2
-; AVX-NEXT: vandps %ymm4, %ymm3, %ymm3
-; AVX-NEXT: vorps %ymm2, %ymm3, %ymm2
-; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm8[u,u],zero,zero,xmm8[2,u,u,u,u],zero,zero,xmm8[3,u,u,u,u]
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX-NEXT: vmovaps {{.*#+}} ymm3 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0]
+; AVX-NEXT: vandnps %ymm0, %ymm3, %ymm0
+; AVX-NEXT: vandps %ymm3, %ymm2, %ymm2
+; AVX-NEXT: vorps %ymm0, %ymm2, %ymm0
+; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm10[u,u],zero,zero,xmm10[2,u,u,u,u],zero,zero,xmm10[3,u,u,u,u]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
-; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm7[u,u,4,5],zero,xmm7[u,u,u,u,6,7],zero,xmm7[u,u,u,u]
-; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,4,5],zero,xmm7[u,u,u,u,6,7],zero,xmm7[u,u,u,u]
+; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX-NEXT: vpshufb %xmm1, %xmm7, %xmm1
-; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm8[0,1,2,3]
-; AVX-NEXT: vpshufb %xmm9, %xmm1, %xmm1
-; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; AVX-NEXT: vmovaps {{.*#+}} ymm3 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255]
-; AVX-NEXT: vandps %ymm3, %ymm2, %ymm2
-; AVX-NEXT: vandnps %ymm1, %ymm3, %ymm1
-; AVX-NEXT: vorps %ymm1, %ymm2, %ymm1
+; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm10[0,1,2,3]
+; AVX-NEXT: vpshufb %xmm8, %xmm1, %xmm1
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255]
+; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
+; AVX-NEXT: vandnps %ymm1, %ymm2, %ymm1
+; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: vmovaps %ymm1, (%rax)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm1, 128(%rax)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm1, 32(%rax)
+; AVX-NEXT: vmovaps %ymm0, (%rax)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm0, 128(%rax)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm0, 32(%rax)
; AVX-NEXT: vmovaps %ymm5, 96(%rax)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm1, 160(%rax)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm0, 160(%rax)
; AVX-NEXT: vmovaps %ymm6, 64(%rax)
-; AVX-NEXT: vmovdqa %xmm0, 192(%rax)
+; AVX-NEXT: vmovdqa %xmm4, 192(%rax)
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovaps %xmm0, 208(%rax)
-; AVX-NEXT: addq $216, %rsp
+; AVX-NEXT: addq $184, %rsp
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
@@ -6622,7 +6619,8 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15]
; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,6,7],zero,xmm0[u,u,u,u,8,9],zero,xmm0[u,u,u,u,10]
+; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,6,7,u,u,u,u,u,8,9,u,u,u,u,u,10]
+; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX-NEXT: vmovdqa 16(%rsi), %xmm7
@@ -6656,19 +6654,17 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,128,7,u,u,u,u,u,128,8,u,u]
; AVX-NEXT: vpshufb %xmm3, %xmm8, %xmm1
; AVX-NEXT: vmovdqa %xmm3, %xmm8
-; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,u,u,u,7,128,u,u,u,u,u,8,128,u,u]
-; AVX-NEXT: vpshufb %xmm4, %xmm10, %xmm3
-; AVX-NEXT: vmovdqa %xmm4, %xmm10
+; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,u,u,u,7,128,u,u,u,u,u,8,128,u,u]
+; AVX-NEXT: vpshufb %xmm5, %xmm10, %xmm3
; AVX-NEXT: vpor %xmm1, %xmm3, %xmm1
-; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [128,u,u,u,u,5,6,128,u,u,u,u,12,13,128,u]
-; AVX-NEXT: vpshufb %xmm5, %xmm1, %xmm1
+; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [6,u,u,u,u,128,128,7,u,u,u,u,128,128,8,u]
; AVX-NEXT: vpshufb %xmm4, %xmm6, %xmm3
+; AVX-NEXT: vmovdqa %xmm4, %xmm6
; AVX-NEXT: vpor %xmm3, %xmm1, %xmm1
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,128,7,u,u,u,u,u,128,8,u,u,u,u]
-; AVX-NEXT: vpshufb %xmm3, %xmm12, %xmm1
-; AVX-NEXT: vmovdqa %xmm3, %xmm12
+; AVX-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,u,128,7,u,u,u,u,u,128,8,u,u,u,u]
+; AVX-NEXT: vpshufb %xmm10, %xmm12, %xmm1
; AVX-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,7,128,u,u,u,u,u,8,128,u,u,u,u]
; AVX-NEXT: vpshufb %xmm9, %xmm13, %xmm3
; AVX-NEXT: vpor %xmm1, %xmm3, %xmm1
@@ -6694,32 +6690,34 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX-NEXT: vorps %ymm0, %ymm1, %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovdqa 32(%r8), %xmm3
-; AVX-NEXT: vmovdqa 32(%r9), %xmm11
-; AVX-NEXT: vpshufb %xmm8, %xmm11, %xmm0
-; AVX-NEXT: vpshufb %xmm10, %xmm3, %xmm1
+; AVX-NEXT: vmovdqa 32(%r8), %xmm11
+; AVX-NEXT: vmovdqa 32(%r9), %xmm4
+; AVX-NEXT: vpshufb %xmm8, %xmm4, %xmm0
+; AVX-NEXT: vpshufb %xmm5, %xmm11, %xmm1
; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm0
+; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[u,u,u,u,5,6],zero,xmm0[u,u,u,u,12,13],zero,xmm0[u]
; AVX-NEXT: vmovdqa 32(%rax), %xmm8
-; AVX-NEXT: vpshufb %xmm4, %xmm8, %xmm1
+; AVX-NEXT: vpshufb %xmm6, %xmm8, %xmm1
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm8[4,u,u,u,u],zero,zero,xmm8[5,u,u,u,u],zero,zero
-; AVX-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3],xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7]
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3],xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7]
+; AVX-NEXT: vmovdqa %xmm4, %xmm12
+; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovdqa %xmm3, %xmm10
-; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9],zero,xmm2[u,u,u,u,10,11],zero,xmm2[u,u,u,u,12,13]
-; AVX-NEXT: vpor %xmm1, %xmm2, %xmm1
+; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,u,u,u,u,u,10,11,u,u,u,u,u,12,13]
+; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm8[4,u,u,u,u],zero,zero,xmm8[5,u,u,u,u],zero,zero
+; AVX-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm4
; AVX-NEXT: vmovdqa 32(%rcx), %xmm0
; AVX-NEXT: vmovdqa 32(%rdx), %xmm2
-; AVX-NEXT: vpshufb %xmm12, %xmm0, %xmm1
+; AVX-NEXT: vpshufb %xmm10, %xmm0, %xmm1
; AVX-NEXT: vpshufb %xmm9, %xmm2, %xmm3
; AVX-NEXT: vpor %xmm1, %xmm3, %xmm1
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u]
+; AVX-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u]
+; AVX-NEXT: vpshufb %xmm9, %xmm3, %xmm3
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm5
; AVX-NEXT: vmovdqa 32(%rsi), %xmm1
; AVX-NEXT: vmovdqa 32(%rdi), %xmm3
@@ -6728,12 +6726,12 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX-NEXT: vpor %xmm6, %xmm7, %xmm6
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,10,11,u,u,u,u,u,12,13,u,u,u,u]
-; AVX-NEXT: vpshufb %xmm9, %xmm7, %xmm7
+; AVX-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,u,10,11,u,u,u,u,u,12,13,u,u,u,u]
+; AVX-NEXT: vpshufb %xmm10, %xmm7, %xmm7
; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6
-; AVX-NEXT: vmovaps {{.*#+}} ymm7 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255]
-; AVX-NEXT: vandnps %ymm5, %ymm7, %ymm5
-; AVX-NEXT: vandps %ymm7, %ymm6, %ymm6
+; AVX-NEXT: vmovaps {{.*#+}} ymm13 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255]
+; AVX-NEXT: vandnps %ymm5, %ymm13, %ymm5
+; AVX-NEXT: vandps %ymm6, %ymm13, %ymm6
; AVX-NEXT: vorps %ymm5, %ymm6, %ymm5
; AVX-NEXT: vmovaps {{.*#+}} ymm6 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255]
; AVX-NEXT: vandnps %ymm4, %ymm6, %ymm4
@@ -6742,10 +6740,11 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm8[u],zero,zero,xmm8[11,u,u,u,u],zero,zero,xmm8[12,u,u,u,u],zero
-; AVX-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm10[8],xmm11[8],xmm10[9],xmm11[9],xmm10[10],xmm11[10],xmm10[11],xmm11[11],xmm10[12],xmm11[12],xmm10[13],xmm11[13],xmm10[14],xmm11[14],xmm10[15],xmm11[15]
+; AVX-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15]
; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[u,6,7],zero,xmm5[u,u,u,u,8,9],zero,xmm5[u,u,u,u,10]
; AVX-NEXT: vpor %xmm4, %xmm6, %xmm4
-; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,2,3],zero,xmm5[u,u,u,u,4,5],zero,xmm5[u,u,u]
+; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,2,3,u,u,u,u,u,4,5,u,u,u,u]
+; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5, %xmm5
; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[u,u,u],zero,zero,xmm8[9,u,u,u,u],zero,zero,xmm8[10,u,u,u]
; AVX-NEXT: vpor %xmm6, %xmm5, %xmm5
; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4
@@ -6771,41 +6770,40 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0
; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovdqa 48(%rax), %xmm12
+; AVX-NEXT: vmovdqa 48(%rax), %xmm14
; AVX-NEXT: vmovdqa 48(%r8), %xmm2
; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovdqa 48(%r9), %xmm1
; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,4,u,u,u,u,128,128,5,u,u,u,u,128,128]
-; AVX-NEXT: vpshufb %xmm5, %xmm12, %xmm0
+; AVX-NEXT: vpshufb %xmm5, %xmm14, %xmm0
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovdqa {{.*#+}} xmm14 = [8,9,128,u,u,u,u,10,11,128,u,u,u,u,12,13]
-; AVX-NEXT: vpshufb %xmm14, %xmm2, %xmm1
+; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[8,9],zero,xmm2[u,u,u,u,10,11],zero,xmm2[u,u,u,u,12,13]
; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm12[u,u],zero,zero,xmm12[2,u,u,u,u],zero,zero,xmm12[3,u,u,u,u]
-; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,4,5],zero,xmm2[u,u,u,u,6,7],zero,xmm2[u,u,u,u]
-; AVX-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,4,5,u,u,u,u,u,6,7,u,u,u,u,u]
+; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm14[u,u],zero,zero,xmm14[2,u,u,u,u],zero,zero,xmm14[3,u,u,u,u]
+; AVX-NEXT: vpor %xmm3, %xmm1, %xmm1
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX-NEXT: vmovdqa 48(%rsi), %xmm1
; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovdqa 48(%rdi), %xmm10
-; AVX-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3],xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7]
+; AVX-NEXT: vmovdqa 48(%rdi), %xmm2
+; AVX-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpshufb %xmm9, %xmm2, %xmm1
+; AVX-NEXT: vpshufb %xmm10, %xmm2, %xmm1
; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u]
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
-; AVX-NEXT: vmovdqa 48(%rcx), %xmm2
-; AVX-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill
-; AVX-NEXT: vmovdqa 48(%rdx), %xmm13
-; AVX-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3],xmm13[4],xmm2[4],xmm13[5],xmm2[5],xmm13[6],xmm2[6],xmm13[7],xmm2[7]
+; AVX-NEXT: vmovdqa 48(%rcx), %xmm10
+; AVX-NEXT: vmovdqa 48(%rdx), %xmm12
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3],xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7]
; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u]
; AVX-NEXT: vpshufb %xmm9, %xmm4, %xmm3
; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9]
; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm11, %ymm3
-; AVX-NEXT: vandnps %ymm1, %ymm7, %ymm1
-; AVX-NEXT: vandps %ymm7, %ymm3, %ymm3
+; AVX-NEXT: vandnps %ymm1, %ymm13, %ymm1
+; AVX-NEXT: vandps %ymm3, %ymm13, %ymm3
; AVX-NEXT: vorps %ymm1, %ymm3, %ymm1
; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0]
; AVX-NEXT: vandnps %ymm0, %ymm2, %ymm0
@@ -6821,35 +6819,37 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX-NEXT: vmovdqa (%rax), %xmm7
; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[6,u,u,u,u],zero,zero,xmm7[7,u,u,u,u],zero,zero,xmm7[8,u]
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufb %xmm5, %xmm7, %xmm1
-; AVX-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
-; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
+; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpshufb %xmm14, %xmm4, %xmm15
+; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,u,u,u,u,u,10,11,u,u,u,u,u,12,13]
+; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX-NEXT: vpshufb %xmm5, %xmm7, %xmm15
; AVX-NEXT: vpor %xmm1, %xmm15, %xmm1
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm15
; AVX-NEXT: vmovdqa (%rcx), %xmm5
; AVX-NEXT: vmovdqa (%rdx), %xmm3
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm5[u,u,u],zero,xmm5[7,u,u,u,u,u],zero,xmm5[8,u,u,u,u]
-; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,7],zero,xmm3[u,u,u,u,u,8],zero,xmm3[u,u,u,u]
-; AVX-NEXT: vpor %xmm0, %xmm14, %xmm0
+; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm3[u,u,u,7],zero,xmm3[u,u,u,u,u,8],zero,xmm3[u,u,u,u]
+; AVX-NEXT: vpor %xmm0, %xmm13, %xmm0
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpshufb %xmm9, %xmm1, %xmm14
-; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm14, %ymm4
-; AVX-NEXT: vmovdqa (%rsi), %xmm14
+; AVX-NEXT: vpshufb %xmm9, %xmm1, %xmm13
+; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm13, %ymm4
+; AVX-NEXT: vmovdqa (%rsi), %xmm13
; AVX-NEXT: vmovdqa (%rdi), %xmm2
-; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm14[u],zero,xmm14[7,u,u,u,u,u],zero,xmm14[8,u,u,u,u,u],zero
+; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm13[u],zero,xmm13[7,u,u,u,u,u],zero,xmm13[8,u,u,u,u,u],zero
; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm2[u,7],zero,xmm2[u,u,u,u,u,8],zero,xmm2[u,u,u,u,u,9]
; AVX-NEXT: vpor %xmm1, %xmm11, %xmm1
-; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3],xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7]
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3],xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7]
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm0[u,u,u,10,11,u,u,u,u,u,12,13,u,u,u,u]
+; AVX-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,10,11,u,u,u,u,u,12,13,u,u,u,u]
+; AVX-NEXT: vpshufb %xmm9, %xmm0, %xmm11
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm11, %ymm1
-; AVX-NEXT: vmovaps {{.*#+}} ymm9 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255]
-; AVX-NEXT: vandnps %ymm4, %ymm9, %ymm4
-; AVX-NEXT: vandps %ymm1, %ymm9, %ymm1
+; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255]
+; AVX-NEXT: vandnps %ymm4, %ymm0, %ymm4
+; AVX-NEXT: vandps %ymm0, %ymm1, %ymm1
; AVX-NEXT: vorps %ymm4, %ymm1, %ymm1
; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255]
; AVX-NEXT: vandnps %ymm15, %ymm0, %ymm4
@@ -6861,25 +6861,26 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15]
; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[u,6,7],zero,xmm4[u,u,u,u,8,9],zero,xmm4[u,u,u,u,10]
; AVX-NEXT: vpor %xmm1, %xmm11, %xmm1
-; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,2,3],zero,xmm4[u,u,u,u,4,5],zero,xmm4[u,u,u]
+; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,2,3,u,u,u,u,u,4,5,u,u,u,u]
+; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm7[u,u,u],zero,zero,xmm7[9,u,u,u,u],zero,zero,xmm7[10,u,u,u]
; AVX-NEXT: vpor %xmm4, %xmm11, %xmm4
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1
-; AVX-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm14[8],xmm2[9],xmm14[9],xmm2[10],xmm14[10],xmm2[11],xmm14[11],xmm2[12],xmm14[12],xmm2[13],xmm14[13],xmm2[14],xmm14[14],xmm2[15],xmm14[15]
-; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm14[8],xmm2[8],xmm14[9],xmm2[9],xmm14[10],xmm2[10],xmm14[11],xmm2[11],xmm14[12],xmm2[12],xmm14[13],xmm2[13],xmm14[14],xmm2[14],xmm14[15],xmm2[15]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm6 = [2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u]
-; AVX-NEXT: vpshufb %xmm6, %xmm0, %xmm0
-; AVX-NEXT: vmovdqa {{.*#+}} xmm14 = [u,u,u,u,8,9,u,u,u,u,u,10,11,u,u,u]
-; AVX-NEXT: vpshufb %xmm14, %xmm4, %xmm4
+; AVX-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm2[8],xmm13[8],xmm2[9],xmm13[9],xmm2[10],xmm13[10],xmm2[11],xmm13[11],xmm2[12],xmm13[12],xmm2[13],xmm13[13],xmm2[14],xmm13[14],xmm2[15],xmm13[15]
+; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm2[8],xmm13[9],xmm2[9],xmm13[10],xmm2[10],xmm13[11],xmm2[11],xmm13[12],xmm2[12],xmm13[13],xmm2[13],xmm13[14],xmm2[14],xmm13[15],xmm2[15]
+; AVX-NEXT: vmovdqa {{.*#+}} xmm7 = [2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u]
+; AVX-NEXT: vpshufb %xmm7, %xmm0, %xmm0
+; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,u,u,8,9,u,u,u,u,u,10,11,u,u,u]
+; AVX-NEXT: vpshufb %xmm4, %xmm6, %xmm4
; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15]
; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15]
; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovdqa {{.*#+}} xmm7 = [6,u,u,u,u,u,9,8,u,u,u,u,u,11,10,u]
-; AVX-NEXT: vpshufb %xmm7, %xmm4, %xmm2
-; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [u,2,3,u,u,u,u,u,4,5,u,u,u,u,u,6]
-; AVX-NEXT: vpshufb %xmm5, %xmm3, %xmm3
+; AVX-NEXT: vmovdqa {{.*#+}} xmm8 = [6,u,u,u,u,u,9,8,u,u,u,u,u,11,10,u]
+; AVX-NEXT: vpshufb %xmm8, %xmm4, %xmm2
+; AVX-NEXT: vmovdqa {{.*#+}} xmm13 = [u,2,3,u,u,u,u,u,4,5,u,u,u,u,u,6]
+; AVX-NEXT: vpshufb %xmm13, %xmm3, %xmm3
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX-NEXT: vmovaps {{.*#+}} ymm3 = [0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255]
; AVX-NEXT: vandnps %ymm0, %ymm3, %ymm0
@@ -6891,239 +6892,242 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
-; AVX-NEXT: # xmm1 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
-; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
+; AVX-NEXT: # xmm2 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm3[4,u,u,u,u],zero,zero,xmm3[5,u,u,u,u],zero,zero
-; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[8,9],zero,xmm1[u,u,u,u,10,11],zero,xmm1[u,u,u,u,12,13]
-; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0
+; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[8,9],zero,xmm2[u,u,u,u,10,11],zero,xmm2[u,u,u,u,12,13]
+; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[u,u,4,5,u,u,u,u,u,6,7,u,u,u,u,u]
+; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[u,u],zero,zero,xmm3[2,u,u,u,u],zero,zero,xmm3[3,u,u,u,u]
-; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,4,5],zero,xmm1[u,u,u,u,6,7],zero,xmm1[u,u,u,u]
-; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; AVX-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
-; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u,u,u,10,11,u,u,u,u,u,12,13,u,u,u,u]
-; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u]
-; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
-; AVX-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
-; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9]
-; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm11
+; AVX-NEXT: vpor %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm2
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpshufb %xmm9, %xmm0, %xmm1
+; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u]
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm3
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload
+; AVX-NEXT: # xmm15 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm15[u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u]
+; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm15[4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9]
; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm11, %ymm4
-; AVX-NEXT: vandnps %ymm3, %ymm9, %ymm3
-; AVX-NEXT: vandps %ymm4, %ymm9, %ymm4
+; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255]
+; AVX-NEXT: vandnps %ymm3, %ymm0, %ymm3
+; AVX-NEXT: vandps %ymm0, %ymm4, %ymm4
; AVX-NEXT: vorps %ymm3, %ymm4, %ymm3
-; AVX-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0]
-; AVX-NEXT: vandnps %ymm0, %ymm1, %ymm0
-; AVX-NEXT: vandps %ymm1, %ymm3, %ymm3
-; AVX-NEXT: vorps %ymm0, %ymm3, %ymm0
+; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0]
+; AVX-NEXT: vandnps %ymm2, %ymm0, %ymm2
+; AVX-NEXT: vandps %ymm0, %ymm3, %ymm3
+; AVX-NEXT: vorps %ymm2, %ymm3, %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovdqa (%rsp), %xmm9 # 16-byte Reload
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm9[u,u,u],zero,xmm9[7,u,u,u,u,u],zero,xmm9[8,u,u,u,u]
-; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm13[u,u,u,7],zero,xmm13[u,u,u,u,u,8],zero,xmm13[u,u,u,u]
-; AVX-NEXT: vpor %xmm0, %xmm3, %xmm3
-; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm9[8],xmm13[9],xmm9[9],xmm13[10],xmm9[10],xmm13[11],xmm9[11],xmm13[12],xmm9[12],xmm13[13],xmm9[13],xmm13[14],xmm9[14],xmm13[15],xmm9[15]
-; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm4
+; AVX-NEXT: vmovdqa %xmm10, %xmm6
+; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm10[u,u,u],zero,xmm10[7,u,u,u,u,u],zero,xmm10[8,u,u,u,u]
+; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm12[u,u,u,7],zero,xmm12[u,u,u,u,u,8],zero,xmm12[u,u,u,u]
+; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm12[8],xmm10[8],xmm12[9],xmm10[9],xmm12[10],xmm10[10],xmm12[11],xmm10[11],xmm12[12],xmm10[12],xmm12[13],xmm10[13],xmm12[14],xmm10[14],xmm12[15],xmm10[15]
+; AVX-NEXT: vpshufb %xmm13, %xmm10, %xmm3
+; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm9[u],zero,xmm9[7,u,u,u,u,u],zero,xmm9[8,u,u,u,u,u],zero
+; AVX-NEXT: vmovdqa (%rsp), %xmm11 # 16-byte Reload
+; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm11[u,7],zero,xmm11[u,u,u,u,u,8],zero,xmm11[u,u,u,u,u,9]
+; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm9[8],xmm11[8],xmm9[9],xmm11[9],xmm9[10],xmm11[10],xmm9[11],xmm11[11],xmm9[12],xmm11[12],xmm9[13],xmm11[13],xmm9[14],xmm11[14],xmm9[15],xmm11[15]
+; AVX-NEXT: vpshufb %xmm7, %xmm4, %xmm4
; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm8[u],zero,xmm8[7,u,u,u,u,u],zero,xmm8[8,u,u,u,u,u],zero
-; AVX-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[u,7],zero,xmm10[u,u,u,u,u,8],zero,xmm10[u,u,u,u,u,9]
-; AVX-NEXT: vpor %xmm4, %xmm11, %xmm4
-; AVX-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm8[8],xmm10[8],xmm8[9],xmm10[9],xmm8[10],xmm10[10],xmm8[11],xmm10[11],xmm8[12],xmm10[12],xmm8[13],xmm10[13],xmm8[14],xmm10[14],xmm8[15],xmm10[15]
-; AVX-NEXT: vpshufb %xmm6, %xmm11, %xmm11
-; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm4, %ymm4
-; AVX-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0]
-; AVX-NEXT: vandnps %ymm3, %ymm1, %ymm3
-; AVX-NEXT: vandps %ymm1, %ymm4, %ymm4
-; AVX-NEXT: vorps %ymm3, %ymm4, %ymm3
-; AVX-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2],zero,xmm4[u,u,6,7,8,9],zero,xmm4[u,u,13,14,15]
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,xmm6[9,u,u],zero,zero,zero,zero,xmm6[10,u,u],zero,zero,zero
-; AVX-NEXT: vpor %xmm4, %xmm11, %xmm4
-; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2,3],zero,xmm4[u,6,7,8,9,10],zero,xmm4[u,13,14,15]
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
-; AVX-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,zero,xmm5[9,u],zero,zero,zero,zero,zero,xmm5[10,u],zero,zero,zero
-; AVX-NEXT: vpor %xmm4, %xmm11, %xmm4
-; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2,3,4],zero,xmm4[6,7,8,9,10,11],zero,xmm4[13,14,15]
-; AVX-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,zero,zero,xmm12[9],zero,zero,zero,zero,zero,zero,xmm12[10],zero,zero,zero
-; AVX-NEXT: vpor %xmm4, %xmm11, %xmm1
-; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,1,2,3,4],zero,xmm3[u,u,8,9,10,11],zero,xmm3[u,u,15]
-; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u],zero,zero,zero,zero,xmm6[7,u,u],zero,zero,zero,zero,xmm6[8,u,u],zero
+; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0]
+; AVX-NEXT: vandnps %ymm2, %ymm0, %ymm2
+; AVX-NEXT: vandps %ymm0, %ymm3, %ymm3
+; AVX-NEXT: vorps %ymm2, %ymm3, %ymm2
+; AVX-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,2],zero,xmm3[u,u,6,7,8,9],zero,xmm3[u,u,13,14,15]
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm1[9,u,u],zero,zero,zero,zero,xmm1[10,u,u],zero,zero,zero
; AVX-NEXT: vpor %xmm4, %xmm3, %xmm3
-; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,1,2,3,4,5],zero,xmm3[u,8,9,10,11,12],zero,xmm3[u,15]
-; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u],zero,zero,zero,zero,zero,xmm5[7,u],zero,zero,zero,zero,zero,xmm5[8,u],zero
+; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3],zero,xmm3[u,6,7,8,9,10],zero,xmm3[u,13,14,15]
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
+; AVX-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,xmm5[9,u],zero,zero,zero,zero,zero,xmm5[10,u],zero,zero,zero
; AVX-NEXT: vpor %xmm4, %xmm3, %xmm3
-; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm3[1,2,3,4,5,6],zero,xmm3[8,9,10,11,12,13],zero,xmm3[15]
-; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm12[6],zero,zero,zero,zero,zero,zero,xmm12[7],zero,zero,zero,zero,zero,zero,xmm12[8],zero
-; AVX-NEXT: vpor %xmm4, %xmm3, %xmm1
-; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15]
-; AVX-NEXT: vpshufb %xmm14, %xmm3, %xmm4
-; AVX-NEXT: vpmovsxdq {{.*#+}} xmm10 = [218890240,986624]
-; AVX-NEXT: vpshufb %xmm10, %xmm3, %xmm3
-; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
-; AVX-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm9[8],xmm13[8],xmm9[9],xmm13[9],xmm9[10],xmm13[10],xmm9[11],xmm13[11],xmm9[12],xmm13[12],xmm9[13],xmm13[13],xmm9[14],xmm13[14],xmm9[15],xmm13[15]
-; AVX-NEXT: vpshufb %xmm7, %xmm4, %xmm4
-; AVX-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,u,12,13,u,u,u,u,u,14,15,u,u,u]
-; AVX-NEXT: vpshufb %xmm9, %xmm0, %xmm0
-; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0
-; AVX-NEXT: vmovaps {{.*#+}} ymm8 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255]
-; AVX-NEXT: vandnps %ymm3, %ymm8, %ymm3
-; AVX-NEXT: vandps %ymm0, %ymm8, %ymm0
-; AVX-NEXT: vorps %ymm3, %ymm0, %ymm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0],zero,xmm0[u,u,4,5,6,7],zero,xmm0[u,u,11,12,13,14],zero
-; AVX-NEXT: vpshufb {{.*#+}} xmm4 = zero,xmm6[11,u,u],zero,zero,zero,zero,xmm6[12,u,u],zero,zero,zero,zero,xmm6[13]
+; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3,4],zero,xmm3[6,7,8,9,10,11],zero,xmm3[13,14,15]
+; AVX-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,zero,xmm14[9],zero,zero,zero,zero,zero,zero,xmm14[10],zero,zero,zero
+; AVX-NEXT: vpor %xmm4, %xmm3, %xmm0
+; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,1,2,3,4],zero,xmm2[u,u,8,9,10,11],zero,xmm2[u,u,15]
+; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[u],zero,zero,zero,zero,xmm1[7,u,u],zero,zero,zero,zero,xmm1[8,u,u],zero
+; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,1,2,3,4,5],zero,xmm2[u,8,9,10,11,12],zero,xmm2[u,15]
+; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[u],zero,zero,zero,zero,zero,xmm5[7,u],zero,zero,zero,zero,zero,xmm5[8,u],zero
+; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm14[6],zero,zero,zero,zero,zero,zero,xmm14[7],zero,zero,zero,zero,zero,zero,xmm14[8],zero
+; AVX-NEXT: vpor %xmm3, %xmm2, %xmm0
+; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15]
+; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[u,u,u,u,8,9,u,u,u,u,u,10,11,u,u,u]
+; AVX-NEXT: vpmovsxdq {{.*#+}} xmm0 = [218890240,986624]
+; AVX-NEXT: vpshufb %xmm0, %xmm2, %xmm2
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm6[8],xmm12[8],xmm6[9],xmm12[9],xmm6[10],xmm12[10],xmm6[11],xmm12[11],xmm6[12],xmm12[12],xmm6[13],xmm12[13],xmm6[14],xmm12[14],xmm6[15],xmm12[15]
+; AVX-NEXT: vpshufb %xmm8, %xmm3, %xmm3
+; AVX-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,12,13,u,u,u,u,u,14,15,u,u,u]
+; AVX-NEXT: vpshufb %xmm7, %xmm10, %xmm4
+; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX-NEXT: vmovaps {{.*#+}} ymm6 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255]
+; AVX-NEXT: vandnps %ymm2, %ymm6, %ymm2
+; AVX-NEXT: vandps %ymm6, %ymm3, %ymm3
+; AVX-NEXT: vorps %ymm2, %ymm3, %ymm2
+; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[0],zero,xmm2[u,u,4,5,6,7],zero,xmm2[u,u,11,12,13,14],zero
+; AVX-NEXT: vpshufb {{.*#+}} xmm4 = zero,xmm1[11,u,u],zero,zero,zero,zero,xmm1[12,u,u],zero,zero,zero,zero,xmm1[13]
; AVX-NEXT: vpor %xmm4, %xmm3, %xmm3
; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1],zero,xmm3[u,4,5,6,7,8],zero,xmm3[u,11,12,13,14,15]
; AVX-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm5[11,u],zero,zero,zero,zero,zero,xmm5[12,u],zero,zero,zero,zero,zero
; AVX-NEXT: vpor %xmm4, %xmm3, %xmm3
-; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,2],zero,xmm3[4,5,6,7,8,9],zero,xmm3[11,12,13,14,15]
-; AVX-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm12[11],zero,zero,zero,zero,zero,zero,xmm12[12],zero,zero,zero,zero,zero
-; AVX-NEXT: vpor %xmm4, %xmm3, %xmm1
-; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm6[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[u,4,5,6,7,0],zero,xmm0[u,11,12,13,14,1],zero,xmm0[u]
+; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
+; AVX-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm14[11],zero,zero,zero,zero,zero,zero,xmm14[12],zero,zero,zero,zero,zero
+; AVX-NEXT: vpor %xmm4, %xmm3, %xmm3
+; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2
+; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm1[14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
+; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[u,4,5,6,7,0],zero,xmm2[u,11,12,13,14,1],zero,xmm2[u]
; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[13,u],zero,zero,zero,zero,zero,xmm5[14,u],zero,zero,zero,zero,zero,xmm5[15,u]
-; AVX-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2,3,4,5,6,7],zero,xmm0[9,10,11,12,13,14],zero
-; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm12[13],zero,zero,zero,zero,zero,zero,xmm12[14],zero,zero,zero,zero,zero,zero,xmm12[15]
-; AVX-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vpmovsxdq {{.*#+}} xmm15 = [16777216,197120]
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT: vpshufb %xmm15, %xmm0, %xmm0
+; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0],zero,xmm2[2,3,4,5,6,7],zero,xmm2[9,10,11,12,13,14],zero
+; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm14[13],zero,zero,zero,zero,zero,zero,xmm14[14],zero,zero,zero,zero,zero,zero,xmm14[15]
+; AVX-NEXT: vpor %xmm3, %xmm2, %xmm1
+; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vpmovsxdq {{.*#+}} xmm10 = [16777216,197120]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT: vpshufb %xmm9, %xmm1, %xmm3
-; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX-NEXT: vpshufb %xmm10, %xmm1, %xmm2
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
+; AVX-NEXT: vpshufb %xmm7, %xmm1, %xmm3
+; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,u,u,u,u,u,2,3,u,u,u,u,u,4,5]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm4
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT: vpshufb %xmm10, %xmm1, %xmm7
+; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm7
; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm7, %ymm4
-; AVX-NEXT: vandnps %ymm0, %ymm8, %ymm0
-; AVX-NEXT: vandps %ymm4, %ymm8, %ymm4
-; AVX-NEXT: vorps %ymm0, %ymm4, %ymm0
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload
-; AVX-NEXT: # xmm4 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15]
-; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[10],zero,xmm4[u,u,u,u,13,12],zero,xmm4[u,u,u,u,15,14],zero
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
-; AVX-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm14[13,u,u,u,u],zero,zero,xmm14[14,u,u,u,u],zero,zero,xmm14[15]
-; AVX-NEXT: vpor %xmm7, %xmm4, %xmm7
-; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,u,u,0,1,u,u,u,u,u,2,3,u,u,u]
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX-NEXT: vpshufb %xmm4, %xmm1, %xmm10
-; AVX-NEXT: vpalignr {{.*#+}} xmm5 = xmm10[4,5,6,7,8,9,10,11,12,13,14,15],xmm12[0,1,2,3]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm11 = [u,u,u,u,0,1,12,u,u,u,u,7,8,13,u,u]
-; AVX-NEXT: vpshufb %xmm11, %xmm5, %xmm5
-; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm5
-; AVX-NEXT: vmovaps {{.*#+}} ymm7 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255]
-; AVX-NEXT: vandps %ymm7, %ymm0, %ymm0
-; AVX-NEXT: vandnps %ymm5, %ymm7, %ymm5
-; AVX-NEXT: vorps %ymm5, %ymm0, %ymm0
+; AVX-NEXT: vandnps %ymm2, %ymm6, %ymm2
+; AVX-NEXT: vandps %ymm6, %ymm4, %ymm4
+; AVX-NEXT: vorps %ymm2, %ymm4, %ymm2
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload
+; AVX-NEXT: # xmm7 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15]
+; AVX-NEXT: vpshufb {{.*#+}} xmm9 = xmm7[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u]
+; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm9, %xmm9
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; AVX-NEXT: vpshufb {{.*#+}} xmm11 = zero,xmm7[13,u,u,u,u],zero,zero,xmm7[14,u,u,u,u],zero,zero,xmm7[15]
+; AVX-NEXT: vpor %xmm11, %xmm9, %xmm9
+; AVX-NEXT: vmovdqa {{.*#+}} xmm11 = [u,u,u,u,0,1,u,u,u,u,u,2,3,u,u,u]
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX-NEXT: vpshufb %xmm11, %xmm0, %xmm12
+; AVX-NEXT: vpalignr {{.*#+}} xmm5 = xmm12[4,5,6,7,8,9,10,11,12,13,14,15],xmm14[0,1,2,3]
+; AVX-NEXT: vmovdqa {{.*#+}} xmm13 = [u,u,u,u,0,1,12,u,u,u,u,7,8,13,u,u]
+; AVX-NEXT: vpshufb %xmm13, %xmm5, %xmm5
+; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm9, %ymm5
+; AVX-NEXT: vmovaps {{.*#+}} ymm12 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255]
+; AVX-NEXT: vandps %ymm2, %ymm12, %ymm2
+; AVX-NEXT: vandnps %ymm5, %ymm12, %ymm5
+; AVX-NEXT: vorps %ymm5, %ymm2, %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX-NEXT: vpshufb %xmm2, %xmm8, %xmm5
-; AVX-NEXT: vpshufb %xmm15, %xmm8, %xmm10
-; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm10, %ymm5
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9]
+; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm5
+; AVX-NEXT: vpshufb %xmm10, %xmm0, %xmm9
+; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm9, %ymm5
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u]
-; AVX-NEXT: vpshufb %xmm6, %xmm8, %xmm10
-; AVX-NEXT: vpshufb %xmm3, %xmm8, %xmm12
-; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm12, %ymm10
-; AVX-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0]
-; AVX-NEXT: vandnps %ymm5, %ymm12, %ymm5
-; AVX-NEXT: vandps %ymm12, %ymm10, %ymm10
-; AVX-NEXT: vorps %ymm5, %ymm10, %ymm5
+; AVX-NEXT: vpshufb %xmm6, %xmm0, %xmm9
+; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm14
+; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm14, %ymm9
+; AVX-NEXT: vmovaps {{.*#+}} ymm14 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0]
+; AVX-NEXT: vandnps %ymm5, %ymm14, %ymm5
+; AVX-NEXT: vandps %ymm14, %ymm9, %ymm9
+; AVX-NEXT: vorps %ymm5, %ymm9, %ymm5
; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,128,128,2,u,u,u,u,128,128,3,u,u,u,u]
-; AVX-NEXT: vpshufb %xmm0, %xmm14, %xmm10
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; AVX-NEXT: vpshufb %xmm0, %xmm7, %xmm9
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,4,5,128,u,u,u,u,6,7,128,u,u,u,u]
-; AVX-NEXT: vpshufb %xmm1, %xmm8, %xmm13
-; AVX-NEXT: vpor %xmm10, %xmm13, %xmm10
-; AVX-NEXT: vpshufb %xmm4, %xmm8, %xmm13
-; AVX-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[4,5,6,7,8,9,10,11,12,13,14,15],xmm14[0,1,2,3]
-; AVX-NEXT: vpshufb %xmm11, %xmm13, %xmm13
-; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm13, %ymm10
-; AVX-NEXT: vmovaps {{.*#+}} ymm13 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255]
-; AVX-NEXT: vandps %ymm5, %ymm13, %ymm5
-; AVX-NEXT: vandnps %ymm10, %ymm13, %ymm10
-; AVX-NEXT: vorps %ymm5, %ymm10, %ymm10
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX-NEXT: vpshufb %xmm2, %xmm8, %xmm5
-; AVX-NEXT: vpshufb %xmm15, %xmm8, %xmm14
-; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm14, %ymm5
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX-NEXT: vpshufb %xmm6, %xmm2, %xmm14
-; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm8
-; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm8, %ymm8
-; AVX-NEXT: vandnps %ymm5, %ymm12, %ymm5
-; AVX-NEXT: vandps %ymm12, %ymm8, %ymm8
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX-NEXT: vpshufb %xmm1, %xmm4, %xmm8
+; AVX-NEXT: vpor %xmm9, %xmm8, %xmm8
+; AVX-NEXT: vpshufb %xmm11, %xmm4, %xmm9
+; AVX-NEXT: vpalignr {{.*#+}} xmm9 = xmm9[4,5,6,7,8,9,10,11,12,13,14,15],xmm7[0,1,2,3]
+; AVX-NEXT: vpshufb %xmm13, %xmm9, %xmm9
+; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8
+; AVX-NEXT: vmovaps {{.*#+}} ymm9 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255]
+; AVX-NEXT: vandps %ymm5, %ymm9, %ymm5
+; AVX-NEXT: vandnps %ymm8, %ymm9, %ymm8
; AVX-NEXT: vorps %ymm5, %ymm8, %ymm5
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX-NEXT: vpshufb %xmm2, %xmm4, %xmm8
+; AVX-NEXT: vpshufb %xmm10, %xmm4, %xmm4
+; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm4, %ymm4
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX-NEXT: vpshufb %xmm6, %xmm2, %xmm8
+; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm7
+; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7
+; AVX-NEXT: vandnps %ymm4, %ymm14, %ymm4
+; AVX-NEXT: vandps %ymm7, %ymm14, %ymm7
+; AVX-NEXT: vorps %ymm4, %ymm7, %ymm4
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
-; AVX-NEXT: vpshufb %xmm0, %xmm6, %xmm8
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm12
-; AVX-NEXT: vpor %xmm8, %xmm12, %xmm8
-; AVX-NEXT: vpshufb %xmm4, %xmm0, %xmm12
-; AVX-NEXT: vpalignr {{.*#+}} xmm12 = xmm12[4,5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3]
-; AVX-NEXT: vpshufb %xmm11, %xmm12, %xmm12
-; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm12, %ymm8
-; AVX-NEXT: vandps %ymm5, %ymm13, %ymm5
-; AVX-NEXT: vandnps %ymm8, %ymm13, %ymm8
-; AVX-NEXT: vorps %ymm5, %ymm8, %ymm5
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT: vpshufb %xmm15, %xmm0, %xmm8
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT: vpshufb %xmm9, %xmm0, %xmm9
-; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm2
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u]
-; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255]
-; AVX-NEXT: vandnps %ymm8, %ymm0, %ymm3
-; AVX-NEXT: vandps %ymm0, %ymm2, %ymm2
-; AVX-NEXT: vorps %ymm3, %ymm2, %ymm2
+; AVX-NEXT: vpshufb %xmm0, %xmm6, %xmm7
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX-NEXT: vpshufb %xmm1, %xmm2, %xmm0
+; AVX-NEXT: vpor %xmm7, %xmm0, %xmm0
+; AVX-NEXT: vpshufb %xmm11, %xmm2, %xmm7
+; AVX-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[4,5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3]
+; AVX-NEXT: vpshufb %xmm13, %xmm7, %xmm7
+; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0
+; AVX-NEXT: vandps %ymm4, %ymm9, %ymm4
+; AVX-NEXT: vandnps %ymm0, %ymm9, %ymm0
+; AVX-NEXT: vorps %ymm0, %ymm4, %ymm0
+; AVX-NEXT: vpshufb %xmm10, %xmm15, %xmm1
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,12,13,u,u,u,u,u,14,15,u,u,u]
+; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm3
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u]
+; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
+; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255]
+; AVX-NEXT: vandnps %ymm1, %ymm2, %ymm1
+; AVX-NEXT: vandps %ymm2, %ymm3, %ymm3
+; AVX-NEXT: vorps %ymm1, %ymm3, %ymm1
; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm6[13,u,u,u,u],zero,zero,xmm6[14,u,u,u,u],zero,zero,xmm6[15]
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
-; AVX-NEXT: # xmm6 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15]
-; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[10],zero,xmm6[u,u,u,u,13,12],zero,xmm6[u,u,u,u,15,14],zero
-; AVX-NEXT: vpor %xmm3, %xmm6, %xmm3
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT: vpshufb %xmm4, %xmm0, %xmm1
-; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3]
-; AVX-NEXT: vpshufb %xmm11, %xmm1, %xmm1
-; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
-; AVX-NEXT: vandps %ymm7, %ymm2, %ymm2
-; AVX-NEXT: vandnps %ymm1, %ymm7, %ymm1
-; AVX-NEXT: vorps %ymm1, %ymm2, %ymm1
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm4 # 16-byte Folded Reload
+; AVX-NEXT: # xmm4 = xmm2[8],mem[8],xmm2[9],mem[9],xmm2[10],mem[10],xmm2[11],mem[11],xmm2[12],mem[12],xmm2[13],mem[13],xmm2[14],mem[14],xmm2[15],mem[15]
+; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u]
+; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
+; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX-NEXT: vpshufb %xmm11, %xmm2, %xmm4
+; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3]
+; AVX-NEXT: vpshufb %xmm13, %xmm4, %xmm4
+; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX-NEXT: vandps %ymm1, %ymm12, %ymm1
+; AVX-NEXT: vandnps %ymm3, %ymm12, %ymm3
+; AVX-NEXT: vorps %ymm3, %ymm1, %ymm1
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm0, 128(%rax)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm2, 128(%rax)
; AVX-NEXT: vmovaps %ymm1, 96(%rax)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm0, 64(%rax)
-; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm0, 32(%rax)
-; AVX-NEXT: vmovaps %ymm5, (%rax)
-; AVX-NEXT: vmovaps %ymm10, 224(%rax)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
-; AVX-NEXT: vmovaps %ymm1, 352(%rax)
+; AVX-NEXT: vmovaps %ymm1, 64(%rax)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm1, 32(%rax)
+; AVX-NEXT: vmovaps %ymm0, (%rax)
+; AVX-NEXT: vmovaps %ymm5, 224(%rax)
+; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
+; AVX-NEXT: vmovaps %ymm0, 352(%rax)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm0, 320(%rax)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -8403,379 +8407,374 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-NEXT: vmovdqa (%rsi), %ymm3
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18]
; AVX512-NEXT: vpshufb %ymm1, %ymm3, %ymm0
-; AVX512-NEXT: vmovdqa64 %ymm1, %ymm23
+; AVX512-NEXT: vmovdqa64 %ymm1, %ymm22
; AVX512-NEXT: vmovdqa (%rdi), %ymm10
; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128]
; AVX512-NEXT: vpshufb %ymm2, %ymm10, %ymm1
-; AVX512-NEXT: vmovdqa64 %ymm2, %ymm16
+; AVX512-NEXT: vmovdqa64 %ymm2, %ymm17
; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vmovdqa (%rcx), %ymm8
+; AVX512-NEXT: vmovdqa (%rcx), %ymm9
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128]
-; AVX512-NEXT: vpshufb %ymm1, %ymm8, %ymm0
-; AVX512-NEXT: vmovdqa64 %ymm1, %ymm18
-; AVX512-NEXT: vmovdqa (%rdx), %ymm7
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31]
-; AVX512-NEXT: vpshufb %ymm11, %ymm7, %ymm1
+; AVX512-NEXT: vpshufb %ymm1, %ymm9, %ymm0
+; AVX512-NEXT: vmovdqa64 %ymm1, %ymm16
+; AVX512-NEXT: vmovdqa (%rdx), %ymm8
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31]
+; AVX512-NEXT: vpshufb %ymm12, %ymm8, %ymm1
; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vmovdqa (%r8), %ymm6
+; AVX512-NEXT: vmovdqa (%r8), %ymm15
; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128]
-; AVX512-NEXT: vpshufb %ymm1, %ymm6, %ymm0
-; AVX512-NEXT: vmovdqa64 %ymm1, %ymm24
-; AVX512-NEXT: vmovdqa (%r9), %ymm5
-; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0,13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0]
-; AVX512-NEXT: # ymm2 = mem[0,1,0,1]
-; AVX512-NEXT: vpshufb %ymm2, %ymm5, %ymm1
-; AVX512-NEXT: vmovdqa64 %ymm2, %ymm25
+; AVX512-NEXT: vpshufb %ymm1, %ymm15, %ymm0
+; AVX512-NEXT: vmovdqa64 %ymm1, %ymm19
+; AVX512-NEXT: vmovdqa (%r9), %ymm7
+; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0,13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0]
+; AVX512-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX512-NEXT: vpshufb %ymm5, %ymm7, %ymm1
; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128]
-; AVX512-NEXT: # ymm14 = mem[0,1,0,1]
-; AVX512-NEXT: vpshufb %ymm14, %ymm3, %ymm1
-; AVX512-NEXT: vmovdqa64 %ymm3, %ymm17
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [12,13,14,128,12,128,14,15,14,15,128,13,128,15,12,13,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29]
-; AVX512-NEXT: vpshufb %ymm0, %ymm10, %ymm2
-; AVX512-NEXT: vpor %ymm1, %ymm2, %ymm1
-; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128]
-; AVX512-NEXT: # ymm15 = mem[0,1,0,1]
-; AVX512-NEXT: vpshufb %ymm15, %ymm7, %ymm1
-; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0]
-; AVX512-NEXT: # ymm2 = mem[0,1,0,1]
-; AVX512-NEXT: vpshufb %ymm2, %ymm8, %ymm3
-; AVX512-NEXT: vpor %ymm1, %ymm3, %ymm1
-; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29]
-; AVX512-NEXT: # ymm3 = mem[0,1,0,1]
-; AVX512-NEXT: vpshufb %ymm3, %ymm6, %ymm1
-; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128]
-; AVX512-NEXT: # ymm9 = mem[0,1,0,1]
-; AVX512-NEXT: vpshufb %ymm9, %ymm5, %ymm13
-; AVX512-NEXT: vpor %ymm1, %ymm13, %ymm1
-; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa 32(%rsi), %ymm13
-; AVX512-NEXT: vpshufb %ymm14, %ymm13, %ymm1
-; AVX512-NEXT: vmovdqa 32(%rdi), %ymm14
-; AVX512-NEXT: vpshufb %ymm0, %ymm14, %ymm0
-; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm14[23],zero,zero,zero,zero,ymm14[26],zero,ymm14[24],zero,zero,zero,zero,ymm14[27],zero,ymm14[25]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm13[23,u,u,u],zero,ymm13[26],zero,ymm13[24,u,u,u],zero,ymm13[27],zero
+; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128]
+; AVX512-NEXT: # ymm11 = mem[0,1,0,1]
+; AVX512-NEXT: vpshufb %ymm11, %ymm3, %ymm0
+; AVX512-NEXT: vmovdqa64 %ymm3, %ymm23
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm14 = [12,13,14,128,12,128,14,15,14,15,128,13,128,15,12,13,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29]
+; AVX512-NEXT: vpshufb %ymm14, %ymm10, %ymm1
; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa 32(%rdx), %ymm1
-; AVX512-NEXT: vpshufb %ymm15, %ymm1, %ymm15
-; AVX512-NEXT: vmovdqa 32(%rcx), %ymm0
-; AVX512-NEXT: vpshufb %ymm2, %ymm0, %ymm2
-; AVX512-NEXT: vpor %ymm2, %ymm15, %ymm2
-; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm0[25],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero,zero,zero
-; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27]
-; AVX512-NEXT: # ymm12 = mem[0,1,0,1]
-; AVX512-NEXT: vpshufb %ymm12, %ymm1, %ymm15
-; AVX512-NEXT: vmovdqa64 %ymm12, %ymm22
-; AVX512-NEXT: vpor %ymm2, %ymm15, %ymm2
-; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa 32(%r8), %ymm15
-; AVX512-NEXT: vpshufb %ymm3, %ymm15, %ymm3
-; AVX512-NEXT: vmovdqa 32(%r9), %ymm2
-; AVX512-NEXT: vpshufb %ymm9, %ymm2, %ymm9
-; AVX512-NEXT: vpor %ymm3, %ymm9, %ymm3
-; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128]
-; AVX512-NEXT: # ymm9 = mem[0,1,0,1]
-; AVX512-NEXT: vpshufb %ymm9, %ymm2, %ymm3
-; AVX512-NEXT: vmovdqa64 %ymm9, %ymm21
-; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm15[23],zero,ymm15[23,24,25,26],zero,ymm15[24],zero,ymm15[30,31]
-; AVX512-NEXT: vpor %ymm3, %ymm9, %ymm3
-; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: vmovdqa 32(%rax), %ymm9
-; AVX512-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
-; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
+; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128]
+; AVX512-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX512-NEXT: vpshufb %ymm2, %ymm8, %ymm0
+; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0]
; AVX512-NEXT: # ymm4 = mem[0,1,0,1]
-; AVX512-NEXT: vpshufb %ymm4, %ymm9, %ymm12
-; AVX512-NEXT: vmovdqa64 %ymm4, %ymm20
-; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm29 = zmm12[2,3,2,3],zmm3[2,3,2,3]
-; AVX512-NEXT: vmovdqa64 %ymm18, %ymm3
-; AVX512-NEXT: vpshufb %ymm3, %ymm0, %ymm3
-; AVX512-NEXT: vpshufb %ymm11, %ymm1, %ymm11
-; AVX512-NEXT: vpor %ymm3, %ymm11, %ymm3
+; AVX512-NEXT: vpshufb %ymm4, %ymm9, %ymm3
+; AVX512-NEXT: vpor %ymm0, %ymm3, %ymm0
+; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29]
+; AVX512-NEXT: # ymm0 = mem[0,1,0,1]
+; AVX512-NEXT: vpshufb %ymm0, %ymm15, %ymm3
+; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128]
+; AVX512-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512-NEXT: vpshufb %ymm1, %ymm7, %ymm6
+; AVX512-NEXT: vpor %ymm3, %ymm6, %ymm3
; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20]
-; AVX512-NEXT: # ymm4 = mem[0,1,0,1]
-; AVX512-NEXT: vpshufb %ymm4, %ymm0, %ymm0
-; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128]
-; AVX512-NEXT: # ymm9 = mem[0,1,0,1]
-; AVX512-NEXT: vpshufb %ymm9, %ymm1, %ymm1
-; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0
-; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vmovdqa64 %ymm23, %ymm0
+; AVX512-NEXT: vmovdqa 32(%r8), %ymm13
; AVX512-NEXT: vpshufb %ymm0, %ymm13, %ymm0
-; AVX512-NEXT: vmovdqa64 %ymm16, %ymm1
-; AVX512-NEXT: vpshufb %ymm1, %ymm14, %ymm1
+; AVX512-NEXT: vmovdqa 32(%r9), %ymm6
+; AVX512-NEXT: vpshufb %ymm1, %ymm6, %ymm1
; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128]
-; AVX512-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512-NEXT: vpshufb %ymm1, %ymm13, %ymm0
-; AVX512-NEXT: vmovdqa64 %ymm1, %ymm26
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm13 = [2,3,4,5,128,3,128,5,4,5,6,128,4,128,6,7,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23]
-; AVX512-NEXT: vpshufb %ymm13, %ymm14, %ymm1
+; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero
+; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm13[23],zero,ymm13[23,24,25,26],zero,ymm13[24],zero,ymm13[30,31]
; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0
-; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22]
+; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa 32(%rdx), %ymm3
+; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm0
+; AVX512-NEXT: vmovdqa 32(%rcx), %ymm2
+; AVX512-NEXT: vpshufb %ymm4, %ymm2, %ymm1
+; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0
+; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128]
; AVX512-NEXT: # ymm1 = mem[0,1,0,1]
; AVX512-NEXT: vpshufb %ymm1, %ymm2, %ymm0
-; AVX512-NEXT: vmovdqa64 %ymm1, %ymm18
-; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128]
-; AVX512-NEXT: # ymm3 = mem[0,1,0,1]
-; AVX512-NEXT: vpshufb %ymm3, %ymm15, %ymm1
-; AVX512-NEXT: vmovdqa64 %ymm3, %ymm19
+; AVX512-NEXT: vmovdqa64 %ymm1, %ymm21
+; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27]
+; AVX512-NEXT: # ymm4 = mem[0,1,0,1]
+; AVX512-NEXT: vpshufb %ymm4, %ymm3, %ymm1
+; AVX512-NEXT: vmovdqa64 %ymm4, %ymm20
; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa64 %ymm24, %ymm0
-; AVX512-NEXT: vpshufb %ymm0, %ymm15, %ymm0
-; AVX512-NEXT: vmovdqa64 %ymm25, %ymm1
-; AVX512-NEXT: vpshufb %ymm1, %ymm2, %ymm1
-; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0
+; AVX512-NEXT: vmovdqa 32(%rsi), %ymm1
+; AVX512-NEXT: vpshufb %ymm11, %ymm1, %ymm4
+; AVX512-NEXT: vmovdqa 32(%rdi), %ymm0
+; AVX512-NEXT: vpshufb %ymm14, %ymm0, %ymm11
+; AVX512-NEXT: vpor %ymm4, %ymm11, %ymm4
+; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = zero,ymm0[7],zero,zero,zero,zero,ymm0[10],zero,ymm0[8],zero,zero,zero,zero,ymm0[11],zero,ymm0[9],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero,zero,zero,ymm0[27],zero,ymm0[25]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm1[9],zero,ymm1[7,u,u,u],zero,ymm1[10],zero,ymm1[8,u,u,u],zero,ymm1[11],zero,ymm1[25],zero,ymm1[23,u,u,u],zero,ymm1[26],zero,ymm1[24,u,u,u],zero,ymm1[27],zero
+; AVX512-NEXT: vpor %ymm4, %ymm11, %ymm4
+; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT: vmovdqa 32(%rax), %ymm4
+; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm14 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
+; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm29 = zmm14[2,3,2,3],zmm11[2,3,2,3]
+; AVX512-NEXT: vmovdqa64 %ymm16, %ymm4
+; AVX512-NEXT: vpshufb %ymm4, %ymm2, %ymm11
+; AVX512-NEXT: vpshufb %ymm12, %ymm3, %ymm12
+; AVX512-NEXT: vpor %ymm11, %ymm12, %ymm4
+; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20]
+; AVX512-NEXT: # ymm11 = mem[0,1,0,1]
+; AVX512-NEXT: vpshufb %ymm11, %ymm2, %ymm2
+; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128]
+; AVX512-NEXT: # ymm4 = mem[0,1,0,1]
+; AVX512-NEXT: vpshufb %ymm4, %ymm3, %ymm3
+; AVX512-NEXT: vmovdqa64 %ymm4, %ymm18
+; AVX512-NEXT: vpor %ymm2, %ymm3, %ymm2
+; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vmovdqa64 %ymm22, %ymm2
+; AVX512-NEXT: vpshufb %ymm2, %ymm1, %ymm2
+; AVX512-NEXT: vmovdqa64 %ymm17, %ymm3
+; AVX512-NEXT: vpshufb %ymm3, %ymm0, %ymm3
+; AVX512-NEXT: vpor %ymm2, %ymm3, %ymm2
+; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128]
+; AVX512-NEXT: # ymm4 = mem[0,1,0,1]
+; AVX512-NEXT: vpshufb %ymm4, %ymm1, %ymm1
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm12 = [2,3,4,5,128,3,128,5,4,5,6,128,4,128,6,7,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23]
+; AVX512-NEXT: vpshufb %ymm12, %ymm0, %ymm0
+; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22]
+; AVX512-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512-NEXT: vpshufb %ymm1, %ymm6, %ymm0
+; AVX512-NEXT: vmovdqa64 %ymm1, %ymm16
+; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128]
+; AVX512-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX512-NEXT: vpshufb %ymm2, %ymm13, %ymm1
+; AVX512-NEXT: vmovdqa64 %ymm2, %ymm17
+; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0
+; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 %ymm19, %ymm0
+; AVX512-NEXT: vpshufb %ymm0, %ymm13, %ymm0
+; AVX512-NEXT: vpshufb %ymm5, %ymm6, %ymm1
+; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa (%rdi), %xmm2
-; AVX512-NEXT: vmovdqa (%rsi), %xmm1
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm12 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128]
-; AVX512-NEXT: vpshufb %xmm12, %xmm1, %xmm0
-; AVX512-NEXT: vmovdqa64 %xmm1, %xmm16
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm14 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9]
+; AVX512-NEXT: vmovdqa (%rsi), %xmm0
+; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128]
+; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9]
+; AVX512-NEXT: vpshufb %xmm5, %xmm2, %xmm1
+; AVX512-NEXT: vmovdqa64 %xmm5, %xmm19
+; AVX512-NEXT: vmovdqa64 %xmm2, %xmm27
+; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa (%rdx), %xmm2
+; AVX512-NEXT: vmovdqa (%rcx), %xmm1
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u]
+; AVX512-NEXT: vpshufb %xmm6, %xmm1, %xmm0
+; AVX512-NEXT: vmovdqa64 %xmm1, %xmm31
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm14 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u]
; AVX512-NEXT: vpshufb %xmm14, %xmm2, %xmm1
-; AVX512-NEXT: vmovdqa64 %xmm2, %xmm28
+; AVX512-NEXT: vmovdqa64 %xmm2, %xmm30
; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa (%rdx), %xmm3
-; AVX512-NEXT: vmovdqa (%rcx), %xmm0
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm11 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u]
-; AVX512-NEXT: vpshufb %xmm11, %xmm0, %xmm1
-; AVX512-NEXT: vmovdqa64 %xmm0, %xmm31
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u]
-; AVX512-NEXT: vpshufb %xmm0, %xmm3, %xmm2
-; AVX512-NEXT: vmovdqa64 %xmm3, %xmm30
-; AVX512-NEXT: vpor %xmm1, %xmm2, %xmm1
-; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa (%r9), %xmm1
-; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512-NEXT: vmovdqa (%r8), %xmm3
-; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[4,u,u,u],zero,xmm1[7],zero,xmm1[5,u,u,u],zero,xmm1[8],zero,xmm1[6]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[4],zero,xmm3[u,u,u,7],zero,xmm3[5],zero,xmm3[u,u,u,8],zero,xmm3[6],zero
-; AVX512-NEXT: vmovdqa64 %xmm3, %xmm23
-; AVX512-NEXT: vpor %xmm1, %xmm2, %xmm1
-; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm10[23],zero,zero,zero,zero,ymm10[26],zero,ymm10[24],zero,zero,zero,zero,ymm10[27],zero,ymm10[25]
-; AVX512-NEXT: vmovdqa64 %ymm17, %ymm3
-; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm3[23,u,u,u],zero,ymm3[26],zero,ymm3[24,u,u,u],zero,ymm3[27],zero
-; AVX512-NEXT: vpor %ymm1, %ymm2, %ymm1
-; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa64 %ymm26, %ymm1
-; AVX512-NEXT: vpshufb %ymm1, %ymm3, %ymm1
-; AVX512-NEXT: vpshufb %ymm13, %ymm10, %ymm2
-; AVX512-NEXT: vpor %ymm1, %ymm2, %ymm1
-; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm8[25],zero,ymm8[23],zero,zero,zero,zero,ymm8[26],zero,ymm8[24],zero,zero,zero,zero
-; AVX512-NEXT: vmovdqa64 %ymm22, %ymm2
-; AVX512-NEXT: vpshufb %ymm2, %ymm7, %ymm2
-; AVX512-NEXT: vpor %ymm1, %ymm2, %ymm1
-; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vpshufb %ymm4, %ymm8, %ymm1
-; AVX512-NEXT: vpshufb %ymm9, %ymm7, %ymm2
-; AVX512-NEXT: vpor %ymm1, %ymm2, %ymm1
-; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa64 %ymm21, %ymm1
-; AVX512-NEXT: vpshufb %ymm1, %ymm5, %ymm1
-; AVX512-NEXT: vmovdqa64 %ymm18, %ymm2
-; AVX512-NEXT: vpshufb %ymm2, %ymm5, %ymm2
-; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
-; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm6[23],zero,ymm6[23,24,25,26],zero,ymm6[24],zero,ymm6[30,31]
-; AVX512-NEXT: vmovdqa64 %ymm19, %ymm2
-; AVX512-NEXT: vpshufb %ymm2, %ymm6, %ymm2
-; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
-; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-NEXT: vmovdqa (%rax), %ymm4
+; AVX512-NEXT: vmovdqa (%r8), %xmm2
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6]
+; AVX512-NEXT: vpshufb %xmm5, %xmm1, %xmm0
+; AVX512-NEXT: vmovdqa64 %xmm1, %xmm28
+; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[4],zero,xmm2[u,u,u,7],zero,xmm2[5],zero,xmm2[u,u,u,8],zero,xmm2[6],zero
+; AVX512-NEXT: vmovdqa64 %xmm2, %xmm22
+; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 %ymm21, %ymm0
+; AVX512-NEXT: vpshufb %ymm0, %ymm9, %ymm0
; AVX512-NEXT: vmovdqa64 %ymm20, %ymm1
-; AVX512-NEXT: vpshufb %ymm1, %ymm4, %ymm2
-; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,5,4,0,5,0,4,0,20,21,0,23,0,21,0,23]
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
-; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm18
+; AVX512-NEXT: vpshufb %ymm1, %ymm8, %ymm1
+; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0
+; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vpshufb %ymm11, %ymm9, %ymm0
+; AVX512-NEXT: vmovdqa64 %ymm18, %ymm1
+; AVX512-NEXT: vpshufb %ymm1, %ymm8, %ymm1
+; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0
+; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa64 %ymm23, %ymm2
+; AVX512-NEXT: vpshufb %ymm4, %ymm2, %ymm0
+; AVX512-NEXT: vpshufb %ymm12, %ymm10, %ymm1
+; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0
+; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm10[23],zero,zero,zero,zero,ymm10[26],zero,ymm10[24],zero,zero,zero,zero,ymm10[27],zero,ymm10[25]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm2[23,u,u,u],zero,ymm2[26],zero,ymm2[24,u,u,u],zero,ymm2[27],zero
+; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0
+; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm7[25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero
+; AVX512-NEXT: vmovdqa64 %ymm16, %ymm1
+; AVX512-NEXT: vpshufb %ymm1, %ymm7, %ymm1
+; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm15[23],zero,ymm15[23,24,25,26],zero,ymm15[24],zero,ymm15[30,31]
+; AVX512-NEXT: vmovdqa64 %ymm17, %ymm1
+; AVX512-NEXT: vpshufb %ymm1, %ymm15, %ymm1
+; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-NEXT: vmovdqa (%rax), %ymm4
+; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
+; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,5,4,0,5,0,4,0,20,21,0,23,0,21,0,23]
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm4[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
+; AVX512-NEXT: vpermi2d %zmm0, %zmm2, %zmm16
; AVX512-NEXT: vmovdqa 32(%rcx), %xmm13
-; AVX512-NEXT: vpshufb %xmm11, %xmm13, %xmm2
-; AVX512-NEXT: vmovdqa 32(%rdx), %xmm10
-; AVX512-NEXT: vpshufb %xmm0, %xmm10, %xmm0
-; AVX512-NEXT: vporq %xmm2, %xmm0, %xmm26
-; AVX512-NEXT: vmovdqa 32(%rsi), %xmm11
-; AVX512-NEXT: vpshufb %xmm12, %xmm11, %xmm0
-; AVX512-NEXT: vmovdqa 32(%rdi), %xmm8
-; AVX512-NEXT: vpshufb %xmm14, %xmm8, %xmm2
-; AVX512-NEXT: vporq %xmm0, %xmm2, %xmm24
+; AVX512-NEXT: vpshufb %xmm6, %xmm13, %xmm0
+; AVX512-NEXT: vmovdqa 32(%rdx), %xmm11
+; AVX512-NEXT: vpshufb %xmm14, %xmm11, %xmm2
+; AVX512-NEXT: vporq %xmm0, %xmm2, %xmm25
+; AVX512-NEXT: vmovdqa 32(%rsi), %xmm12
+; AVX512-NEXT: vpshufb %xmm3, %xmm12, %xmm0
+; AVX512-NEXT: vmovdqa 32(%rdi), %xmm9
+; AVX512-NEXT: vmovdqa64 %xmm19, %xmm1
+; AVX512-NEXT: vpshufb %xmm1, %xmm9, %xmm2
+; AVX512-NEXT: vporq %xmm0, %xmm2, %xmm23
; AVX512-NEXT: vmovdqa 32(%rax), %xmm1
-; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,0,2,3,0,1,0,18,0,19,18,0,19,0]
+; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,0,0,0,2,3,0,1,0,18,0,19,18,0,19,0]
; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
; AVX512-NEXT: vpshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,5,6]
; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm2
-; AVX512-NEXT: vmovdqa64 %xmm3, %xmm27
-; AVX512-NEXT: vmovdqa64 %xmm1, %xmm25
-; AVX512-NEXT: vpermi2d %zmm0, %zmm2, %zmm20
-; AVX512-NEXT: vmovdqa 32(%r9), %xmm15
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm15[4,u,u,u],zero,xmm15[7],zero,xmm15[5,u,u,u],zero,xmm15[8],zero,xmm15[6]
+; AVX512-NEXT: vmovdqa64 %xmm3, %xmm26
+; AVX512-NEXT: vmovdqa64 %xmm1, %xmm24
+; AVX512-NEXT: vpermi2d %zmm0, %zmm2, %zmm19
+; AVX512-NEXT: vmovdqa 32(%r9), %xmm6
+; AVX512-NEXT: vpshufb %xmm5, %xmm6, %xmm0
; AVX512-NEXT: vmovdqa 32(%r8), %xmm5
; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[4],zero,xmm5[u,u,u,7],zero,xmm5[5],zero,xmm5[u,u,u,8],zero,xmm5[6],zero
-; AVX512-NEXT: vporq %xmm0, %xmm3, %xmm22
+; AVX512-NEXT: vporq %xmm0, %xmm3, %xmm20
; AVX512-NEXT: vmovdqa (%rax), %xmm2
; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,5,5,6]
; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1]
; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [128,13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128]
-; AVX512-NEXT: vpshufb %ymm9, %ymm4, %ymm6
-; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm3, %zmm3
-; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm8[0],xmm11[0],xmm8[1],xmm11[1],xmm8[2],xmm11[2],xmm8[3],xmm11[3],xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
-; AVX512-NEXT: vpshufb %xmm0, %xmm6, %xmm6
-; AVX512-NEXT: vmovdqa64 %xmm0, %xmm21
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [128,13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128]
+; AVX512-NEXT: vpshufb %ymm10, %ymm4, %ymm7
+; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm3, %zmm3
+; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3],xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
+; AVX512-NEXT: vpshufb %xmm15, %xmm7, %xmm7
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm0[2,3,2,3],zmm6[0,1,0,1]
-; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3],xmm10[4],xmm13[4],xmm10[5],xmm13[5],xmm10[6],xmm13[6],xmm10[7],xmm13[7]
+; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm0[2,3,2,3],zmm7[0,1,0,1]
+; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm11[0],xmm13[0],xmm11[1],xmm13[1],xmm11[2],xmm13[2],xmm11[3],xmm13[3],xmm11[4],xmm13[4],xmm11[5],xmm13[5],xmm11[6],xmm13[6],xmm11[7],xmm13[7]
; AVX512-NEXT: vmovdqa {{.*#+}} xmm14 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
-; AVX512-NEXT: vpshufb %xmm14, %xmm7, %xmm7
+; AVX512-NEXT: vpshufb %xmm14, %xmm8, %xmm8
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm0[2,3,2,3],zmm7[0,1,0,1]
-; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm15[0],xmm5[1],xmm15[1],xmm5[2],xmm15[2],xmm5[3],xmm15[3],xmm5[4],xmm15[4],xmm5[5],xmm15[5],xmm5[6],xmm15[6],xmm5[7],xmm15[7]
+; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm0[2,3,2,3],zmm8[0,1,0,1]
+; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
; AVX512-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[2,3,2,3],zmm0[0,1,0,1]
+; AVX512-NEXT: vmovdqa64 %xmm1, %xmm21
+; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm18[2,3,2,3],zmm0[0,1,0,1]
; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
; AVX512-NEXT: vmovdqa64 %ymm0, %ymm17
; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512-NEXT: vpshufb %ymm9, %ymm0, %ymm9
-; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,5,7]
-; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,2]
-; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm9, %zmm9
-; AVX512-NEXT: vmovdqa64 %xmm28, %xmm4
-; AVX512-NEXT: vmovdqa64 %xmm16, %xmm0
-; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15]
-; AVX512-NEXT: vmovdqa64 %xmm12, %xmm16
-; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
-; AVX512-NEXT: vmovdqa64 %xmm21, %xmm0
-; AVX512-NEXT: vpshufb %xmm0, %xmm4, %xmm4
-; AVX512-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm28 # 64-byte Folded Reload
-; AVX512-NEXT: # zmm28 = zmm4[0,1,0,1],mem[0,1,0,1]
-; AVX512-NEXT: vmovdqa64 %xmm30, %xmm0
-; AVX512-NEXT: vmovdqa64 %xmm31, %xmm12
-; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm12[8],xmm0[8],xmm12[9],xmm0[9],xmm12[10],xmm0[10],xmm12[11],xmm0[11],xmm12[12],xmm0[12],xmm12[13],xmm0[13],xmm12[14],xmm0[14],xmm12[15],xmm0[15]
-; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7]
-; AVX512-NEXT: vpshufb %xmm14, %xmm12, %xmm12
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255]
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = zmm6 ^ (zmm14 & (zmm7 ^ zmm6))
-; AVX512-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
-; AVX512-NEXT: # ymm6 = mem[2,3,2,3]
+; AVX512-NEXT: vpshufb %ymm10, %ymm0, %ymm4
+; AVX512-NEXT: vpshuflw {{.*#+}} ymm10 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,1,1,3,4,5,5,7]
+; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,3,2]
+; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10, %ymm10
+; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm4, %zmm4
+; AVX512-NEXT: vmovdqa64 %xmm27, %xmm1
+; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512-NEXT: vpshufb %xmm15, %xmm0, %xmm0
+; AVX512-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm27 # 64-byte Folded Reload
+; AVX512-NEXT: # zmm27 = zmm0[0,1,0,1],mem[0,1,0,1]
+; AVX512-NEXT: vmovdqa64 %xmm30, %xmm1
+; AVX512-NEXT: vmovdqa64 %xmm31, %xmm15
+; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm15[8],xmm1[8],xmm15[9],xmm1[9],xmm15[10],xmm1[10],xmm15[11],xmm1[11],xmm15[12],xmm1[12],xmm15[13],xmm1[13],xmm15[14],xmm1[14],xmm15[15],xmm1[15]
+; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7]
+; AVX512-NEXT: vpshufb %xmm14, %xmm15, %xmm14
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm15 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255]
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = zmm7 ^ (zmm15 & (zmm8 ^ zmm7))
+; AVX512-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
+; AVX512-NEXT: # ymm7 = mem[2,3,2,3]
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm31, %zmm6
+; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm31, %zmm7
; AVX512-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm31 # 32-byte Folded Reload
; AVX512-NEXT: # ymm31 = mem[2,3,2,3]
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
; AVX512-NEXT: vinserti64x4 $1, %ymm31, %zmm30, %zmm31
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm31 = zmm6 ^ (zmm14 & (zmm31 ^ zmm6))
-; AVX512-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload
-; AVX512-NEXT: # zmm6 = zmm12[0,1,0,1],mem[0,1,0,1]
-; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512-NEXT: vmovdqa64 %xmm23, %xmm14
-; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm14[8],xmm0[8],xmm14[9],xmm0[9],xmm14[10],xmm0[10],xmm14[11],xmm0[11],xmm14[12],xmm0[12],xmm14[13],xmm0[13],xmm14[14],xmm0[14],xmm14[15],xmm0[15]
-; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3],xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7]
-; AVX512-NEXT: vpshufb %xmm1, %xmm14, %xmm14
-; AVX512-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm23 # 64-byte Folded Reload
-; AVX512-NEXT: # zmm23 = zmm14[0,1,0,1],mem[0,1,0,1]
-; AVX512-NEXT: vmovdqa64 %xmm27, %xmm0
-; AVX512-NEXT: vpshufb %xmm0, %xmm2, %xmm14
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm31 = zmm7 ^ (zmm15 & (zmm31 ^ zmm7))
+; AVX512-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm7 # 64-byte Folded Reload
+; AVX512-NEXT: # zmm7 = zmm14[0,1,0,1],mem[0,1,0,1]
+; AVX512-NEXT: vmovdqa64 %xmm28, %xmm1
+; AVX512-NEXT: vmovdqa64 %xmm22, %xmm15
+; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm15[8],xmm1[8],xmm15[9],xmm1[9],xmm15[10],xmm1[10],xmm15[11],xmm1[11],xmm15[12],xmm1[12],xmm15[13],xmm1[13],xmm15[14],xmm1[14],xmm15[15],xmm1[15]
+; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3],xmm15[4],xmm1[4],xmm15[5],xmm1[5],xmm15[6],xmm1[6],xmm15[7],xmm1[7]
+; AVX512-NEXT: vmovdqa64 %xmm21, %xmm1
+; AVX512-NEXT: vpshufb %xmm1, %xmm15, %xmm15
+; AVX512-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm22 # 64-byte Folded Reload
+; AVX512-NEXT: # zmm22 = zmm15[0,1,0,1],mem[0,1,0,1]
+; AVX512-NEXT: vmovdqa64 %xmm26, %xmm1
+; AVX512-NEXT: vpshufb %xmm1, %xmm2, %xmm15
; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[1,1,0,0,4,5,6,7]
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
-; AVX512-NEXT: vinserti64x4 $1, %ymm14, %zmm1, %zmm1
-; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm13[8],xmm10[8],xmm13[9],xmm10[9],xmm13[10],xmm10[10],xmm13[11],xmm10[11],xmm13[12],xmm10[12],xmm13[13],xmm10[13],xmm13[14],xmm10[14],xmm13[15],xmm10[15]
+; AVX512-NEXT: vinserti64x4 $1, %ymm15, %zmm1, %zmm28
+; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15]
; AVX512-NEXT: vmovdqa {{.*#+}} xmm13 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
-; AVX512-NEXT: vpshufb %xmm13, %xmm4, %xmm4
-; AVX512-NEXT: vpshufb %xmm13, %xmm10, %xmm10
-; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm26[0,1,0,1],zmm10[0,1,0,1]
-; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm11[8],xmm8[8],xmm11[9],xmm8[9],xmm11[10],xmm8[10],xmm11[11],xmm8[11],xmm11[12],xmm8[12],xmm11[13],xmm8[13],xmm11[14],xmm8[14],xmm11[15],xmm8[15]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm11 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
-; AVX512-NEXT: vmovdqa64 %xmm16, %xmm0
-; AVX512-NEXT: vpshufb %xmm11, %xmm0, %xmm0
-; AVX512-NEXT: vpshufb %xmm11, %xmm8, %xmm8
-; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm24[0,1,0,1],zmm8[0,1,0,1]
-; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm15[8],xmm5[9],xmm15[9],xmm5[10],xmm15[10],xmm5[11],xmm15[11],xmm5[12],xmm15[12],xmm5[13],xmm15[13],xmm5[14],xmm15[14],xmm5[15],xmm15[15]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
-; AVX512-NEXT: vpshufb %xmm5, %xmm12, %xmm11
-; AVX512-NEXT: vpshufb %xmm5, %xmm2, %xmm2
-; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm22[0,1,0,1],zmm2[0,1,0,1]
-; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload
-; AVX512-NEXT: # zmm5 = zmm5[2,3,2,3],mem[2,3,2,3]
-; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 64-byte Folded Reload
-; AVX512-NEXT: # zmm12 = zmm12[2,3,2,3],mem[2,3,2,3]
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm5))
-; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload
-; AVX512-NEXT: # zmm5 = zmm5[2,3,2,3],mem[2,3,2,3]
+; AVX512-NEXT: vpshufb %xmm13, %xmm0, %xmm0
+; AVX512-NEXT: vpshufb %xmm13, %xmm11, %xmm11
+; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm25[0,1,0,1],zmm11[0,1,0,1]
+; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm12 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
+; AVX512-NEXT: vpshufb %xmm12, %xmm10, %xmm10
+; AVX512-NEXT: vpshufb %xmm12, %xmm9, %xmm9
+; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm23[0,1,0,1],zmm9[0,1,0,1]
+; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
+; AVX512-NEXT: vpshufb %xmm6, %xmm14, %xmm12
+; AVX512-NEXT: vpshufb %xmm6, %xmm2, %xmm2
+; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm20[0,1,0,1],zmm2[0,1,0,1]
+; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm6 # 64-byte Folded Reload
+; AVX512-NEXT: # zmm6 = zmm1[2,3,2,3],mem[2,3,2,3]
+; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm13 # 64-byte Folded Reload
+; AVX512-NEXT: # zmm13 = zmm1[2,3,2,3],mem[2,3,2,3]
+; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm14 # 64-byte Folded Reload
+; AVX512-NEXT: # zmm14 = zmm1[2,3,2,3],mem[2,3,2,3]
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm14 = zmm13 ^ (mem & (zmm14 ^ zmm13))
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm13 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255]
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm13 & (zmm5 ^ zmm12))
-; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 64-byte Folded Reload
-; AVX512-NEXT: # zmm12 = zmm12[2,3,2,3],mem[2,3,2,3]
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm14 = zmm6 ^ (zmm13 & (zmm14 ^ zmm6))
+; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm6 # 64-byte Folded Reload
+; AVX512-NEXT: # zmm6 = zmm1[2,3,2,3],mem[2,3,2,3]
+; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1]
; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1]
-; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1]
-; AVX512-NEXT: vpermq {{.*#+}} ymm14 = ymm17[2,3,2,3]
-; AVX512-NEXT: vmovdqa64 %xmm25, %xmm15
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,1,0,0,4,5,6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,2,0]
-; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm17 # 64-byte Folded Reload
-; AVX512-NEXT: # zmm17 = zmm17[2,3,2,3],mem[2,3,2,3]
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm17 = zmm12 ^ (zmm13 & (zmm17 ^ zmm12))
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = zmm10 ^ (zmm13 & (zmm8 ^ zmm10))
+; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1]
+; AVX512-NEXT: vpermq {{.*#+}} ymm15 = ymm17[2,3,2,3]
+; AVX512-NEXT: vmovdqa64 %xmm24, %xmm1
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,0,0,4,5,6,7]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
+; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm17 # 64-byte Folded Reload
+; AVX512-NEXT: # zmm17 = zmm5[2,3,2,3],mem[2,3,2,3]
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 ^ (zmm13 & (zmm17 ^ zmm6))
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm9 = zmm11 ^ (zmm13 & (zmm9 ^ zmm11))
+; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm6 # 32-byte Folded Reload
; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
-; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 32-byte Folded Reload
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm0))
-; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm0 # 32-byte Folded Reload
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 | (zmm0 & mem)
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm4))
-; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm15[0,0,1,0]
-; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm14, %zmm0
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm19))
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm7))
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm29 = zmm29 ^ (mem & (zmm29 ^ zmm5))
-; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512-NEXT: vshufi64x2 $84, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
-; AVX512-NEXT: # zmm4 = zmm4[0,1,2,3],mem[2,3,2,3]
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 | (zmm4 & mem)
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (mem & (zmm9 ^ zmm31))
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm28))
-; AVX512-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,0,1,0,4,4,5,4]
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm23))
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm6))
-; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512-NEXT: vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
-; AVX512-NEXT: vpermq {{.*#+}} zmm4 = zmm4[2,3,2,3,6,7,6,7]
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm18 = zmm18 ^ (mem & (zmm18 ^ zmm4))
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm18 = zmm18 ^ (mem & (zmm18 ^ zmm17))
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm20 ^ (mem & (zmm2 ^ zmm20))
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm8))
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm6))
+; AVX512-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 32-byte Folded Reload
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 | (zmm6 & mem)
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm0))
+; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm1[0,0,1,0]
+; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm15, %zmm0
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm18))
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm8))
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm29 = zmm29 ^ (mem & (zmm29 ^ zmm14))
+; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512-NEXT: vshufi64x2 $84, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload
+; AVX512-NEXT: # zmm1 = zmm1[0,1,2,3],mem[2,3,2,3]
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm1 & mem)
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm31))
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm27))
+; AVX512-NEXT: vpermq {{.*#+}} zmm1 = zmm28[0,0,1,0,4,4,5,4]
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm22))
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm7))
+; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512-NEXT: vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm6 # 64-byte Folded Reload
+; AVX512-NEXT: vpermq {{.*#+}} zmm6 = zmm6[2,3,2,3,6,7,6,7]
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (mem & (zmm16 ^ zmm6))
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (mem & (zmm16 ^ zmm17))
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm19 ^ (mem & (zmm2 ^ zmm19))
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm9))
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: vmovdqa64 %zmm2, 256(%rax)
-; AVX512-NEXT: vmovdqa64 %zmm18, 128(%rax)
+; AVX512-NEXT: vmovdqa64 %zmm16, 128(%rax)
; AVX512-NEXT: vmovdqa64 %zmm1, (%rax)
-; AVX512-NEXT: vmovdqa64 %zmm9, 320(%rax)
+; AVX512-NEXT: vmovdqa64 %zmm4, 320(%rax)
; AVX512-NEXT: vmovdqa64 %zmm29, 384(%rax)
; AVX512-NEXT: vmovdqa64 %zmm0, 192(%rax)
; AVX512-NEXT: vmovdqa64 %zmm3, 64(%rax)
@@ -8785,382 +8784,374 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
;
; AVX512-FCP-LABEL: store_i8_stride7_vf64:
; AVX512-FCP: # %bb.0:
-; AVX512-FCP-NEXT: subq $1416, %rsp # imm = 0x588
-; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm12
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm12[14],zero,zero,zero,zero,zero,zero,ymm12[15],zero,zero,zero,zero,zero,zero,ymm12[16],zero,zero,zero,zero,zero,zero,ymm12[17],zero,zero,zero,zero,zero,zero,ymm12[18]
-; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm9
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[0,1,14],zero,ymm9[12,13,0,1,14,15],zero,ymm9[3,12,13,2,3,16],zero,ymm9[30,31,28,29,16,17],zero,ymm9[31,18,19,28,29,18],zero
+; AVX512-FCP-NEXT: subq $1480, %rsp # imm = 0x5C8
+; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm9
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm9[14],zero,zero,zero,zero,zero,zero,ymm9[15],zero,zero,zero,zero,zero,zero,ymm9[16],zero,zero,zero,zero,zero,zero,ymm9[17],zero,zero,zero,zero,zero,zero,ymm9[18]
+; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm6
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[0,1,14],zero,ymm6[12,13,0,1,14,15],zero,ymm6[3,12,13,2,3,16],zero,ymm6[30,31,28,29,16,17],zero,ymm6[31,18,19,28,29,18],zero
; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm6
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm4
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[0,1,0,1,14],zero,ymm4[14,15,0,1,14,15],zero,ymm4[13,14,15,16,17,16],zero,ymm4[30,31,30,31,16,17],zero,ymm4[31,28,29,30,31]
-; AVX512-FCP-NEXT: vpor %ymm1, %ymm3, %ymm0
+; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm4
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,ymm4[14],zero,zero,zero,zero,zero,zero,ymm4[15],zero,zero,zero,zero,zero,zero,ymm4[16],zero,zero,zero,zero,zero,zero,ymm4[17],zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm3
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm3[0,1,0,1,14],zero,ymm3[14,15,0,1,14,15],zero,ymm3[13,14,15,16,17,16],zero,ymm3[30,31,30,31,16,17],zero,ymm3[31,28,29,30,31]
+; AVX512-FCP-NEXT: vpor %ymm1, %ymm5, %ymm0
; AVX512-FCP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
-; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm8
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,ymm8[14],zero,zero,zero,zero,zero,zero,ymm8[15],zero,zero,zero,zero,zero,zero,ymm8[16],zero,zero,zero,zero,zero,zero,ymm8[17],zero,zero,zero,zero
-; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm11
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm11[13,u,u,u,u,u],zero,ymm11[14,u,u,u,u,u],zero,ymm11[15,u,u,u,u,u],zero,ymm11[16,u,u,u,u,u],zero,ymm11[17,u,u,u]
-; AVX512-FCP-NEXT: vpor %ymm3, %ymm5, %ymm0
+; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm15
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,ymm15[14],zero,zero,zero,zero,zero,zero,ymm15[15],zero,zero,zero,zero,zero,zero,ymm15[16],zero,zero,zero,zero,zero,zero,ymm15[17],zero,zero,zero,zero
+; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm12
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm12[13,u,u,u,u,u],zero,ymm12[14,u,u,u,u,u],zero,ymm12[15,u,u,u,u,u],zero,ymm12[16,u,u,u,u,u],zero,ymm12[17,u,u,u]
+; AVX512-FCP-NEXT: vpor %ymm1, %ymm5, %ymm0
; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm0
-; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128]
-; AVX512-FCP-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm3
-; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm18
-; AVX512-FCP-NEXT: vmovdqa %ymm0, %ymm1
-; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm15
-; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29]
+; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm2
+; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29]
; AVX512-FCP-NEXT: # ymm0 = mem[0,1,0,1]
-; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm15, %ymm5
-; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm22
-; AVX512-FCP-NEXT: vpor %ymm3, %ymm5, %ymm0
-; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25]
-; AVX512-FCP-NEXT: # ymm2 = mem[0,1,0,1]
-; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm15, %ymm3
-; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm20
-; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [25,128,23,0,0,0,128,26,128,24,0,0,0,128,27,128,25,128,23,0,0,0,128,26,128,24,0,0,0,128,27,128]
-; AVX512-FCP-NEXT: # ymm2 = mem[0,1,0,1]
-; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm5
-; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm25
-; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm19
-; AVX512-FCP-NEXT: vpor %ymm3, %ymm5, %ymm0
+; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm1
+; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm21
+; AVX512-FCP-NEXT: vmovdqa 32(%r9), %ymm7
+; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128]
+; AVX512-FCP-NEXT: # ymm0 = mem[0,1,0,1]
+; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm7, %ymm5
+; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm23
+; AVX512-FCP-NEXT: vpor %ymm1, %ymm5, %ymm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm13
-; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128]
+; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128]
; AVX512-FCP-NEXT: # ymm0 = mem[0,1,0,1]
-; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm13, %ymm5
+; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm7, %ymm1
; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm24
-; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %ymm3
-; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm7, %ymm18
+; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31]
; AVX512-FCP-NEXT: # ymm0 = mem[0,1,0,1]
-; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm7
-; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm17
-; AVX512-FCP-NEXT: vpor %ymm5, %ymm7, %ymm0
-; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128]
-; AVX512-FCP-NEXT: # ymm2 = mem[0,1,0,1]
-; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm5
-; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm26
-; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27]
-; AVX512-FCP-NEXT: # ymm2 = mem[0,1,0,1]
-; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm13, %ymm7
-; AVX512-FCP-NEXT: vpor %ymm5, %ymm7, %ymm0
+; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm5
+; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm20
+; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm16
+; AVX512-FCP-NEXT: vpor %ymm1, %ymm5, %ymm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm14
-; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29]
+; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm13
+; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128]
; AVX512-FCP-NEXT: # ymm0 = mem[0,1,0,1]
-; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm14, %ymm5
-; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm21
-; AVX512-FCP-NEXT: vmovdqa 32(%r9), %ymm10
-; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128]
+; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm13, %ymm1
+; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm17
+; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %ymm11
+; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0]
; AVX512-FCP-NEXT: # ymm0 = mem[0,1,0,1]
-; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm10, %ymm7
-; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm23
-; AVX512-FCP-NEXT: vpor %ymm5, %ymm7, %ymm0
+; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm11, %ymm5
+; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm22
+; AVX512-FCP-NEXT: vpor %ymm1, %ymm5, %ymm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128]
+; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128]
+; AVX512-FCP-NEXT: # ymm10 = mem[0,1,0,1]
+; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm11, %ymm7
+; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27]
; AVX512-FCP-NEXT: # ymm5 = mem[0,1,0,1]
-; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm10, %ymm0
-; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31]
+; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm13, %ymm8
+; AVX512-FCP-NEXT: vpor %ymm7, %ymm8, %ymm0
+; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm8
+; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128]
; AVX512-FCP-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm14, %ymm7
-; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm27
-; AVX512-FCP-NEXT: vpor %ymm0, %ymm7, %ymm0
+; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm8, %ymm0
+; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm25
+; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm7
+; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29]
+; AVX512-FCP-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm7, %ymm14
+; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm26
+; AVX512-FCP-NEXT: vpor %ymm0, %ymm14, %ymm0
+; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm7[7],zero,zero,zero,zero,ymm7[10],zero,ymm7[8],zero,zero,zero,zero,ymm7[11],zero,ymm7[9],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero,zero,ymm7[27],zero,ymm7[25]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm8[9],zero,ymm8[7,u,u,u],zero,ymm8[10],zero,ymm8[8,u,u,u],zero,ymm8[11],zero,ymm8[25],zero,ymm8[23,u,u,u],zero,ymm8[26],zero,ymm8[24,u,u,u],zero,ymm8[27],zero
+; AVX512-FCP-NEXT: vpor %ymm0, %ymm14, %ymm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT: vmovdqa 32(%rax), %ymm0
; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
-; AVX512-FCP-NEXT: # ymm7 = mem[0,1,0,1]
-; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm0, %ymm1
-; AVX512-FCP-NEXT: vmovdqa64 %ymm7, %ymm16
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
-; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm7[2,3,2,3],zmm1[2,3,2,3]
+; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
+; AVX512-FCP-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm1
+; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm19
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
+; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm14[2,3,2,3],zmm1[2,3,2,3]
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm0
-; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm9, %ymm0
-; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm1
-; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm12, %ymm7
-; AVX512-FCP-NEXT: vpor %ymm0, %ymm7, %ymm0
+; AVX512-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm0
+; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm1
+; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm12[21],zero,ymm12[19],zero,zero,zero,zero,ymm12[22],zero,ymm12[20],zero,zero
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,ymm9[19],zero,ymm9[21,20,21,22],zero,ymm9[20],zero,ymm9[22,23]
-; AVX512-FCP-NEXT: vpor %ymm0, %ymm7, %ymm0
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[18],zero,zero,zero,zero,ymm4[21],zero,ymm4[19],zero,zero,zero,zero,ymm4[22],zero,ymm4[20]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm3[18,19,20,21],zero,ymm3[19],zero,ymm3[25,26,27,22],zero,ymm3[20],zero
+; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm0
-; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm6, %ymm0
-; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm7
-; AVX512-FCP-NEXT: vpor %ymm0, %ymm7, %ymm0
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm9[21],zero,ymm9[19],zero,zero,zero,zero,ymm9[22],zero,ymm9[20],zero,zero
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,ymm6[19],zero,ymm6[21,20,21,22],zero,ymm6[20],zero,ymm6[22,23]
+; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20]
-; AVX512-FCP-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm6, %ymm0
-; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm26
-; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128]
-; AVX512-FCP-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm7
-; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm20
-; AVX512-FCP-NEXT: vpor %ymm0, %ymm7, %ymm0
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero,ymm6[27],zero,ymm6[25]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm9[23,u,u,u],zero,ymm9[26],zero,ymm9[24,u,u,u],zero,ymm9[27],zero
+; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm11, %ymm0
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm11[20],zero,ymm11[18],zero,zero,zero,zero,ymm11[21],zero,ymm11[19],zero,zero,zero,zero,ymm11[22]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm5, %zmm0
+; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm0
+; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm12, %ymm0
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm12[20],zero,ymm12[18],zero,zero,zero,zero,ymm12[21],zero,ymm12[19],zero,zero,zero,zero,ymm12[22]
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm0
-; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm8, %ymm0
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm8[18],zero,ymm8[20,21,20,21],zero,ymm8[19],zero,ymm8[19,20,21,22],zero
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm5, %zmm0
+; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm0
+; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm15, %ymm0
+; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128]
+; AVX512-FCP-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm15, %ymm1
+; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm20
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm0
-; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm12, %ymm0
+; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm0
+; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm9, %ymm0
+; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm1
+; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm6, %ymm1
+; AVX512-FCP-NEXT: vporq %ymm0, %ymm1, %ymm29
+; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm0
+; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm1
-; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm9, %ymm5
-; AVX512-FCP-NEXT: vporq %ymm0, %ymm5, %ymm30
-; AVX512-FCP-NEXT: vmovdqa64 %ymm24, %ymm0
-; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm0
-; AVX512-FCP-NEXT: vmovdqa64 %ymm17, %ymm1
-; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm6, %ymm2
-; AVX512-FCP-NEXT: vpor %ymm0, %ymm2, %ymm0
+; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm1
+; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa (%rax), %ymm4
-; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm0
-; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm0
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm29 = [4,0,6,0,4,0,6,7,0,17,0,17,0,16,16,0]
+; AVX512-FCP-NEXT: vmovdqa (%rax), %ymm3
+; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm0
+; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm0
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm31 = [4,0,6,0,4,0,6,7,0,17,0,17,0,16,16,0]
; AVX512-FCP-NEXT: vmovdqa 32(%rax), %xmm2
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[1,1,0,0,4,5,6,7]
-; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm0, %zmm29
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[1,1,0,0,4,5,6,7]
+; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm31
; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm0
-; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm8, %ymm0
+; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm15, %ymm0
; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm1
-; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm11, %ymm1
+; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm12, %ymm1
; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm5
-; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %xmm1
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u]
-; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm0
-; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm24
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,7],zero,xmm5[5],zero,xmm5[u,u,u,8],zero,xmm5[6],zero,xmm5[u,u]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm5, %xmm23
+; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm4
+; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %xmm14
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u]
+; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm14, %xmm0
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[u,u,u,7],zero,xmm4[5],zero,xmm4[u,u,u,8],zero,xmm4[6],zero,xmm4[u,u]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm28
; AVX512-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm7
-; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm12
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128]
-; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm12, %xmm5
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9]
-; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm7, %xmm6
-; AVX512-FCP-NEXT: vmovdqa64 %xmm7, %xmm21
-; AVX512-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5
-; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,5,5,6]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
-; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm2
-; AVX512-FCP-NEXT: vmovdqa64 %xmm6, %xmm16
+; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm6
+; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm1
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128]
+; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm4
+; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm27
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9]
+; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm6, %xmm5
+; AVX512-FCP-NEXT: vmovdqa64 %xmm6, %xmm23
+; AVX512-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4
+; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,5,5,6]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
+; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm2, %xmm2
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,0,0,0,2,3,0,1,0,18,0,19,18,0,19,0]
-; AVX512-FCP-NEXT: vpermi2d %zmm5, %zmm2, %zmm25
-; AVX512-FCP-NEXT: vmovdqa 32(%r9), %xmm11
+; AVX512-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm25
+; AVX512-FCP-NEXT: vmovdqa 32(%r9), %xmm4
; AVX512-FCP-NEXT: vmovdqa 32(%r8), %xmm5
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6]
-; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm11, %xmm2
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6]
+; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm2
+; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm22
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128]
+; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm4
; AVX512-FCP-NEXT: vmovdqa64 %xmm6, %xmm17
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128]
-; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm5, %xmm6
-; AVX512-FCP-NEXT: vmovdqa64 %xmm7, %xmm18
-; AVX512-FCP-NEXT: vporq %xmm2, %xmm6, %xmm31
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm13[0,1,0,1,14],zero,ymm13[14,15,0,1,14,15],zero,ymm13[13,14,15,16,17,16],zero,ymm13[30,31,30,31,16,17],zero,ymm13[31,28,29,30,31]
-; AVX512-FCP-NEXT: vpor %ymm2, %ymm6, %ymm2
+; AVX512-FCP-NEXT: vpor %xmm2, %xmm4, %xmm2
+; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,ymm11[14],zero,zero,zero,zero,zero,zero,ymm11[15],zero,zero,zero,zero,zero,zero,ymm11[16],zero,zero,zero,zero,zero,zero,ymm11[17],zero,zero,zero,zero,zero
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm13[0,1,0,1,14],zero,ymm13[14,15,0,1,14,15],zero,ymm13[13,14,15,16,17,16],zero,ymm13[30,31,30,31,16,17],zero,ymm13[31,28,29,30,31]
+; AVX512-FCP-NEXT: vpor %ymm2, %ymm4, %ymm2
; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm2
-; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm2
-; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm3
-; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm13, %ymm3
-; AVX512-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm11[18],zero,zero,zero,zero,ymm11[21],zero,ymm11[19],zero,zero,zero,zero,ymm11[22],zero,ymm11[20]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm13[18,19,20,21],zero,ymm13[19],zero,ymm13[25,26,27,22],zero,ymm13[20],zero
+; AVX512-FCP-NEXT: vpor %ymm2, %ymm4, %ymm2
; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm6
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero,zero,zero,zero,ymm6[18]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm15[0,1,14],zero,ymm15[12,13,0,1,14,15],zero,ymm15[3,12,13,2,3,16],zero,ymm15[30,31,28,29,16,17],zero,ymm15[31,18,19,28,29,18],zero
-; AVX512-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm8[14],zero,zero,zero,zero,zero,zero,ymm8[15],zero,zero,zero,zero,zero,zero,ymm8[16],zero,zero,zero,zero,zero,zero,ymm8[17],zero,zero,zero,zero,zero,zero,ymm8[18]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm7[0,1,14],zero,ymm7[12,13,0,1,14,15],zero,ymm7[3,12,13,2,3,16],zero,ymm7[30,31,28,29,16,17],zero,ymm7[31,18,19,28,29,18],zero
+; AVX512-FCP-NEXT: vpor %ymm2, %ymm4, %ymm2
; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm6[21],zero,ymm6[19],zero,zero,zero,zero,ymm6[22],zero,ymm6[20],zero,zero
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,ymm15[19],zero,ymm15[21,20,21,22],zero,ymm15[20],zero,ymm15[22,23]
-; AVX512-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm8[21],zero,ymm8[19],zero,zero,zero,zero,ymm8[22],zero,ymm8[20],zero,zero
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,ymm7[19],zero,ymm7[21,20,21,22],zero,ymm7[20],zero,ymm7[22,23]
+; AVX512-FCP-NEXT: vpor %ymm2, %ymm4, %ymm2
; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm10[20],zero,ymm10[18],zero,zero,zero,zero,ymm10[21],zero,ymm10[19],zero,zero,zero,zero,ymm10[22]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm14[18],zero,ymm14[20,21,20,21],zero,ymm14[19],zero,ymm14[19,20,21,22],zero
-; AVX512-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2
+; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm6
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[20],zero,ymm6[18],zero,zero,zero,zero,ymm6[21],zero,ymm6[19],zero,zero,zero,zero,ymm6[22]
+; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm7
+; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm4
+; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm7, %ymm4
+; AVX512-FCP-NEXT: vpor %ymm2, %ymm4, %ymm2
; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,ymm14[14],zero,zero,zero,zero,zero,zero,ymm14[15],zero,zero,zero,zero,zero,zero,ymm14[16],zero,zero,zero,zero,zero,zero,ymm14[17],zero,zero,zero,zero
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[13,u,u,u,u,u],zero,ymm10[14,u,u,u,u,u],zero,ymm10[15,u,u,u,u,u],zero,ymm10[16,u,u,u,u,u],zero,ymm10[17,u,u,u]
-; AVX512-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,ymm7[14],zero,zero,zero,zero,zero,zero,ymm7[15],zero,zero,zero,zero,zero,zero,ymm7[16],zero,zero,zero,zero,zero,zero,ymm7[17],zero,zero,zero,zero
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[13,u,u,u,u,u],zero,ymm6[14,u,u,u,u,u],zero,ymm6[15,u,u,u,u,u],zero,ymm6[16,u,u,u,u,u],zero,ymm6[17,u,u,u]
+; AVX512-FCP-NEXT: vpor %ymm2, %ymm4, %ymm2
; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm10
-; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm1
-; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm9
-; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm9, %xmm0
-; AVX512-FCP-NEXT: vporq %xmm1, %xmm0, %xmm20
-; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm13
-; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm13, %xmm0
-; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm8
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[u,u,u,7],zero,xmm8[5],zero,xmm8[u,u,u,8],zero,xmm8[6],zero,xmm8[u,u]
-; AVX512-FCP-NEXT: vporq %xmm0, %xmm1, %xmm19
-; AVX512-FCP-NEXT: vmovdqa (%rax), %xmm0
-; AVX512-FCP-NEXT: vmovdqa64 %xmm16, %xmm1
-; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm1
+; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm2
+; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm0
+; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm19
+; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm8
+; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm8, %xmm1
+; AVX512-FCP-NEXT: vporq %xmm0, %xmm1, %xmm21
+; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm12
+; AVX512-FCP-NEXT: vpshufb %xmm9, %xmm12, %xmm0
+; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm6
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[u,u,u,7],zero,xmm6[5],zero,xmm6[u,u,u,8],zero,xmm6[6],zero,xmm6[u,u]
+; AVX512-FCP-NEXT: vporq %xmm0, %xmm1, %xmm20
+; AVX512-FCP-NEXT: vmovdqa (%rax), %xmm9
+; AVX512-FCP-NEXT: vpshufb %xmm15, %xmm9, %xmm0
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,1,0,1,0,0,0,0,16,0,16,0,18,19,0,17]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[1,1,0,0,4,5,6,7]
-; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm16
-; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm7
-; AVX512-FCP-NEXT: vmovdqa64 %xmm17, %xmm1
-; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm7, %xmm3
-; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm6
-; AVX512-FCP-NEXT: vmovdqa64 %xmm18, %xmm1
-; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm6, %xmm15
-; AVX512-FCP-NEXT: vporq %xmm3, %xmm15, %xmm18
-; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,6]
-; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,2,3,3,2,2,3,3]
-; AVX512-FCP-NEXT: # ymm3 = mem[0,1,0,1]
-; AVX512-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm0
-; AVX512-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [128,13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128]
-; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm3
-; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm28
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm14
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm4[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[1,1,0,0,4,5,6,7]
+; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm16
+; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm11
+; AVX512-FCP-NEXT: vpshufb %xmm10, %xmm11, %xmm10
+; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm7
+; AVX512-FCP-NEXT: vmovdqa64 %xmm17, %xmm0
+; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm7, %xmm13
+; AVX512-FCP-NEXT: vporq %xmm10, %xmm13, %xmm18
+; AVX512-FCP-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,5,6]
+; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,2,3,3,2,2,3,3]
+; AVX512-FCP-NEXT: # ymm10 = mem[0,1,0,1]
+; AVX512-FCP-NEXT: vpermd %ymm9, %ymm10, %ymm9
+; AVX512-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm9
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128]
+; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm10
+; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm30
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm13
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm3[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [4,5,4,5,5,7,4,5,20,21,22,23,20,21,22,23]
-; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm17, %zmm0
-; AVX512-FCP-NEXT: vmovdqa64 %xmm21, %xmm3
-; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3],xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
-; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm4, %xmm4
-; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm27
-; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm30 = zmm30[2,3,2,3],zmm4[0,1,0,1]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm23, %xmm1
-; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm15
-; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
-; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm4
-; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm22
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm2[2,3,2,3],zmm4[0,1,0,1]
-; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3],xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
-; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm4
-; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm26
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm2[2,3,2,3],zmm4[0,1,0,1]
-; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm15[8],xmm1[8],xmm15[9],xmm1[9],xmm15[10],xmm1[10],xmm15[11],xmm1[11],xmm15[12],xmm1[12],xmm15[13],xmm1[13],xmm15[14],xmm1[14],xmm15[15],xmm1[15]
-; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm13[8],xmm8[8],xmm13[9],xmm8[9],xmm13[10],xmm8[10],xmm13[11],xmm8[11],xmm13[12],xmm8[12],xmm13[13],xmm8[13],xmm13[14],xmm8[14],xmm13[15],xmm8[15]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
-; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm15, %xmm15
-; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm2[0,1,0,1],zmm1[0,1,0,1]
-; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm12[8],xmm3[8],xmm12[9],xmm3[9],xmm12[10],xmm3[10],xmm12[11],xmm3[11],xmm12[12],xmm3[12],xmm12[13],xmm3[13],xmm12[14],xmm3[14],xmm12[15],xmm3[15]
-; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
-; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm12, %xmm12
-; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm17, %zmm15
+; AVX512-FCP-NEXT: vmovdqa64 %xmm23, %xmm9
+; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm4
+; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3],xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
+; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm3
+; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm27
+; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm29 = zmm29[2,3,2,3],zmm3[0,1,0,1]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm28, %xmm0
+; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
+; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm3, %xmm3
+; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm24
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm28 = zmm1[2,3,2,3],zmm3[0,1,0,1]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm2
+; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
+; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm3, %xmm3
+; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm26
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm1[2,3,2,3],zmm3[0,1,0,1]
+; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm14[8],xmm0[8],xmm14[9],xmm0[9],xmm14[10],xmm0[10],xmm14[11],xmm0[11],xmm14[12],xmm0[12],xmm14[13],xmm0[13],xmm14[14],xmm0[14],xmm14[15],xmm0[15]
+; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm12[8],xmm6[8],xmm12[9],xmm6[9],xmm12[10],xmm6[10],xmm12[11],xmm6[11],xmm12[12],xmm6[12],xmm12[13],xmm6[13],xmm12[14],xmm6[14],xmm12[15],xmm6[15]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
+; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm14, %xmm1
+; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm22
+; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm10, %xmm0
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm1[0,1,0,1],zmm0[0,1,0,1]
+; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm4
+; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
+; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm14
+; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm1[0,1,0,1],zmm0[0,1,0,1]
+; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15]
+; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm7[8],xmm11[8],xmm7[9],xmm11[9],xmm7[10],xmm11[10],xmm7[11],xmm11[11],xmm7[12],xmm11[12],xmm7[13],xmm11[13],xmm7[14],xmm11[14],xmm7[15],xmm11[15]
+; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
+; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm0
+; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm1
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm2[0,1,0,1],zmm1[0,1,0,1]
-; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm11[8],xmm5[9],xmm11[9],xmm5[10],xmm11[10],xmm5[11],xmm11[11],xmm5[12],xmm11[12],xmm5[13],xmm11[13],xmm5[14],xmm11[14],xmm5[15],xmm11[15]
-; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15]
-; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
-; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm11
-; AVX512-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm1
-; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm31[0,1,0,1],zmm1[0,1,0,1]
-; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm1
-; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1
-; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
-; AVX512-FCP-NEXT: vpermd %ymm2, %ymm17, %ymm2
-; AVX512-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
-; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm3
-; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,0,1],zmm20[0,1,0,1]
-; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3],xmm8[4],xmm13[4],xmm8[5],xmm13[5],xmm8[6],xmm13[6],xmm8[7],xmm13[7]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm3
-; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm8
-; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,0,1],zmm19[0,1,0,1]
-; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
-; AVX512-FCP-NEXT: vmovdqa64 %xmm26, %xmm3
-; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm6
-; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,0,1],zmm18[0,1,0,1]
+; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm2[0,1,0,1],zmm1[0,1,0,1]
+; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm1
+; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm9, %ymm1
+; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
+; AVX512-FCP-NEXT: vpermd %ymm9, %ymm17, %ymm9
+; AVX512-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm9
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm1
+; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm2
+; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm8, %xmm8
+; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,0,1],zmm21[0,1,0,1]
+; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3],xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm2
+; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm6, %xmm6
+; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,0,1],zmm20[0,1,0,1]
+; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3],xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7]
+; AVX512-FCP-NEXT: vmovdqa64 %xmm26, %xmm4
+; AVX512-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,0,1],zmm18[0,1,0,1]
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512-FCP-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: # zmm4 = zmm4[2,3,2,3],mem[2,3,2,3]
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; AVX512-FCP-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload
; AVX512-FCP-NEXT: # zmm7 = zmm7[2,3,2,3],mem[2,3,2,3]
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512-FCP-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 64-byte Folded Reload
-; AVX512-FCP-NEXT: # zmm9 = zmm9[2,3,2,3],mem[2,3,2,3]
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm7 ^ (zmm10 & (zmm9 ^ zmm7))
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512-FCP-NEXT: vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload
-; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm7 = zmm7[2,3,2,3,6,7,6,7]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm7))
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm9))
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm9 & (zmm7 ^ zmm4))
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512-FCP-NEXT: vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: vpermq {{.*#+}} zmm4 = zmm4[2,3,2,3,6,7,6,7]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (mem & (zmm15 ^ zmm4))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (mem & (zmm15 ^ zmm7))
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax)
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
-; AVX512-FCP-NEXT: # zmm0 = zmm0[2,3,2,3],mem[2,3,2,3]
+; AVX512-FCP-NEXT: vmovdqa64 %zmm15, 128(%rax)
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512-FCP-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: # zmm4 = zmm4[2,3,2,3],mem[2,3,2,3]
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; AVX512-FCP-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload
; AVX512-FCP-NEXT: # zmm7 = zmm7[2,3,2,3],mem[2,3,2,3]
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm12[0,1,0,1]
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm15[0,1,0,1]
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm0))
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512-FCP-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
-; AVX512-FCP-NEXT: # zmm0 = zmm0[2,3,2,3],mem[2,3,2,3]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm10 & (zmm0 ^ zmm7))
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = zmm24 ^ (zmm10 & (zmm21 ^ zmm24))
-; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm23 = zmm30 ^ (zmm7 & (zmm23 ^ zmm30))
-; AVX512-FCP-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm10 = mem[2,3,2,3]
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm13, %zmm10
-; AVX512-FCP-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: # ymm13 = mem[2,3,2,3]
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm3, %zmm13
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm13 = zmm10 ^ (zmm7 & (zmm13 ^ zmm10))
-; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm7 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: vinserti64x4 $1, (%rsp), %zmm12, %zmm9 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (mem & (zmm9 ^ zmm7))
-; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm7 # 32-byte Folded Reload
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 | (zmm7 & mem)
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (mem & (zmm14 ^ zmm9))
-; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm0))
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm29 ^ (mem & (zmm4 ^ zmm29))
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm23))
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm14[0,1,0,1]
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm22[0,1,0,1]
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512-FCP-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm14 # 64-byte Folded Reload
+; AVX512-FCP-NEXT: # zmm14 = zmm14[2,3,2,3],mem[2,3,2,3]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm7 ^ (mem & (zmm14 ^ zmm7))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm4 ^ (zmm9 & (zmm14 ^ zmm4))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm23 ^ (zmm9 & (zmm10 ^ zmm23))
+; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255]
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm28 = zmm29 ^ (zmm4 & (zmm28 ^ zmm29))
+; AVX512-FCP-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: # ymm7 = mem[2,3,2,3]
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm9, %zmm7
+; AVX512-FCP-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: # ymm9 = mem[2,3,2,3]
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm15, %zmm9
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm7 ^ (zmm4 & (zmm9 ^ zmm7))
+; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm4 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: vinserti64x4 $1, (%rsp), %zmm12, %zmm7 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm4))
+; AVX512-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 | (zmm0 & mem)
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (mem & (zmm13 ^ zmm7))
+; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm14))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm31 ^ (mem & (zmm3 ^ zmm31))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm28))
; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm25 ^ (mem & (zmm5 ^ zmm25))
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm21))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm10))
; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-FCP-NEXT: vshufi64x2 $84, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; AVX512-FCP-NEXT: # zmm0 = zmm0[0,1,2,3],mem[2,3,2,3]
; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm0 & mem)
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm13))
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm2))
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm16 ^ (mem & (zmm6 ^ zmm16))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm9))
; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm8))
-; AVX512-FCP-NEXT: vmovdqa64 %zmm6, (%rax)
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm16 ^ (mem & (zmm2 ^ zmm16))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm6))
+; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 320(%rax)
; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 256(%rax)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 192(%rax)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 384(%rax)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 64(%rax)
-; AVX512-FCP-NEXT: addq $1416, %rsp # imm = 0x588
+; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 192(%rax)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 384(%rax)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 64(%rax)
+; AVX512-FCP-NEXT: addq $1480, %rsp # imm = 0x5C8
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
@@ -9170,379 +9161,374 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm3
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18]
; AVX512DQ-NEXT: vpshufb %ymm1, %ymm3, %ymm0
-; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm23
+; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm22
; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm10
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128]
; AVX512DQ-NEXT: vpshufb %ymm2, %ymm10, %ymm1
-; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm16
+; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm17
; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm8
+; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm9
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128]
-; AVX512DQ-NEXT: vpshufb %ymm1, %ymm8, %ymm0
-; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm18
-; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm7
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31]
-; AVX512DQ-NEXT: vpshufb %ymm11, %ymm7, %ymm1
+; AVX512DQ-NEXT: vpshufb %ymm1, %ymm9, %ymm0
+; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm16
+; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm8
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31]
+; AVX512DQ-NEXT: vpshufb %ymm12, %ymm8, %ymm1
; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vmovdqa (%r8), %ymm6
+; AVX512DQ-NEXT: vmovdqa (%r8), %ymm15
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128]
-; AVX512DQ-NEXT: vpshufb %ymm1, %ymm6, %ymm0
-; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm24
-; AVX512DQ-NEXT: vmovdqa (%r9), %ymm5
-; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0,13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0]
-; AVX512DQ-NEXT: # ymm2 = mem[0,1,0,1]
-; AVX512DQ-NEXT: vpshufb %ymm2, %ymm5, %ymm1
-; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm25
+; AVX512DQ-NEXT: vpshufb %ymm1, %ymm15, %ymm0
+; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm19
+; AVX512DQ-NEXT: vmovdqa (%r9), %ymm7
+; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0,13,0,0,0,128,16,128,14,0,0,0,128,17,128,15,0]
+; AVX512DQ-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX512DQ-NEXT: vpshufb %ymm5, %ymm7, %ymm1
; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128]
-; AVX512DQ-NEXT: # ymm14 = mem[0,1,0,1]
-; AVX512DQ-NEXT: vpshufb %ymm14, %ymm3, %ymm1
-; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm17
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [12,13,14,128,12,128,14,15,14,15,128,13,128,15,12,13,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29]
-; AVX512DQ-NEXT: vpshufb %ymm0, %ymm10, %ymm2
-; AVX512DQ-NEXT: vpor %ymm1, %ymm2, %ymm1
-; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128]
-; AVX512DQ-NEXT: # ymm15 = mem[0,1,0,1]
-; AVX512DQ-NEXT: vpshufb %ymm15, %ymm7, %ymm1
-; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0]
-; AVX512DQ-NEXT: # ymm2 = mem[0,1,0,1]
-; AVX512DQ-NEXT: vpshufb %ymm2, %ymm8, %ymm3
-; AVX512DQ-NEXT: vpor %ymm1, %ymm3, %ymm1
-; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29]
-; AVX512DQ-NEXT: # ymm3 = mem[0,1,0,1]
-; AVX512DQ-NEXT: vpshufb %ymm3, %ymm6, %ymm1
-; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128]
-; AVX512DQ-NEXT: # ymm9 = mem[0,1,0,1]
-; AVX512DQ-NEXT: vpshufb %ymm9, %ymm5, %ymm13
-; AVX512DQ-NEXT: vpor %ymm1, %ymm13, %ymm1
-; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm13
-; AVX512DQ-NEXT: vpshufb %ymm14, %ymm13, %ymm1
-; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm14
-; AVX512DQ-NEXT: vpshufb %ymm0, %ymm14, %ymm0
-; AVX512DQ-NEXT: vpor %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm14[23],zero,zero,zero,zero,ymm14[26],zero,ymm14[24],zero,zero,zero,zero,ymm14[27],zero,ymm14[25]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm13[23,u,u,u],zero,ymm13[26],zero,ymm13[24,u,u,u],zero,ymm13[27],zero
+; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128]
+; AVX512DQ-NEXT: # ymm11 = mem[0,1,0,1]
+; AVX512DQ-NEXT: vpshufb %ymm11, %ymm3, %ymm0
+; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm23
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm14 = [12,13,14,128,12,128,14,15,14,15,128,13,128,15,12,13,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29]
+; AVX512DQ-NEXT: vpshufb %ymm14, %ymm10, %ymm1
; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vmovdqa 32(%rdx), %ymm1
-; AVX512DQ-NEXT: vpshufb %ymm15, %ymm1, %ymm15
-; AVX512DQ-NEXT: vmovdqa 32(%rcx), %ymm0
-; AVX512DQ-NEXT: vpshufb %ymm2, %ymm0, %ymm2
-; AVX512DQ-NEXT: vpor %ymm2, %ymm15, %ymm2
-; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm0[25],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero,zero,zero
-; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27]
-; AVX512DQ-NEXT: # ymm12 = mem[0,1,0,1]
-; AVX512DQ-NEXT: vpshufb %ymm12, %ymm1, %ymm15
-; AVX512DQ-NEXT: vmovdqa64 %ymm12, %ymm22
-; AVX512DQ-NEXT: vpor %ymm2, %ymm15, %ymm2
-; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vmovdqa 32(%r8), %ymm15
-; AVX512DQ-NEXT: vpshufb %ymm3, %ymm15, %ymm3
-; AVX512DQ-NEXT: vmovdqa 32(%r9), %ymm2
-; AVX512DQ-NEXT: vpshufb %ymm9, %ymm2, %ymm9
-; AVX512DQ-NEXT: vpor %ymm3, %ymm9, %ymm3
-; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128]
-; AVX512DQ-NEXT: # ymm9 = mem[0,1,0,1]
-; AVX512DQ-NEXT: vpshufb %ymm9, %ymm2, %ymm3
-; AVX512DQ-NEXT: vmovdqa64 %ymm9, %ymm21
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm15[23],zero,ymm15[23,24,25,26],zero,ymm15[24],zero,ymm15[30,31]
-; AVX512DQ-NEXT: vpor %ymm3, %ymm9, %ymm3
-; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-NEXT: vmovdqa 32(%rax), %ymm9
-; AVX512DQ-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
-; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
+; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128]
+; AVX512DQ-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX512DQ-NEXT: vpshufb %ymm2, %ymm8, %ymm0
+; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0]
; AVX512DQ-NEXT: # ymm4 = mem[0,1,0,1]
-; AVX512DQ-NEXT: vpshufb %ymm4, %ymm9, %ymm12
-; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm20
-; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm29 = zmm12[2,3,2,3],zmm3[2,3,2,3]
-; AVX512DQ-NEXT: vmovdqa64 %ymm18, %ymm3
-; AVX512DQ-NEXT: vpshufb %ymm3, %ymm0, %ymm3
-; AVX512DQ-NEXT: vpshufb %ymm11, %ymm1, %ymm11
-; AVX512DQ-NEXT: vpor %ymm3, %ymm11, %ymm3
+; AVX512DQ-NEXT: vpshufb %ymm4, %ymm9, %ymm3
+; AVX512DQ-NEXT: vpor %ymm0, %ymm3, %ymm0
+; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29]
+; AVX512DQ-NEXT: # ymm0 = mem[0,1,0,1]
+; AVX512DQ-NEXT: vpshufb %ymm0, %ymm15, %ymm3
+; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128]
+; AVX512DQ-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512DQ-NEXT: vpshufb %ymm1, %ymm7, %ymm6
+; AVX512DQ-NEXT: vpor %ymm3, %ymm6, %ymm3
; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20]
-; AVX512DQ-NEXT: # ymm4 = mem[0,1,0,1]
-; AVX512DQ-NEXT: vpshufb %ymm4, %ymm0, %ymm0
-; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128]
-; AVX512DQ-NEXT: # ymm9 = mem[0,1,0,1]
-; AVX512DQ-NEXT: vpshufb %ymm9, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0
-; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-NEXT: vmovdqa64 %ymm23, %ymm0
+; AVX512DQ-NEXT: vmovdqa 32(%r8), %ymm13
; AVX512DQ-NEXT: vpshufb %ymm0, %ymm13, %ymm0
-; AVX512DQ-NEXT: vmovdqa64 %ymm16, %ymm1
-; AVX512DQ-NEXT: vpshufb %ymm1, %ymm14, %ymm1
+; AVX512DQ-NEXT: vmovdqa 32(%r9), %ymm6
+; AVX512DQ-NEXT: vpshufb %ymm1, %ymm6, %ymm1
+; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0
+; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm13[23],zero,ymm13[23,24,25,26],zero,ymm13[24],zero,ymm13[30,31]
+; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0
+; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vmovdqa 32(%rdx), %ymm3
+; AVX512DQ-NEXT: vpshufb %ymm2, %ymm3, %ymm0
+; AVX512DQ-NEXT: vmovdqa 32(%rcx), %ymm2
+; AVX512DQ-NEXT: vpshufb %ymm4, %ymm2, %ymm1
; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128]
+; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128]
; AVX512DQ-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512DQ-NEXT: vpshufb %ymm1, %ymm13, %ymm0
-; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm26
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm13 = [2,3,4,5,128,3,128,5,4,5,6,128,4,128,6,7,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23]
-; AVX512DQ-NEXT: vpshufb %ymm13, %ymm14, %ymm1
+; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm0
+; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm21
+; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27]
+; AVX512DQ-NEXT: # ymm4 = mem[0,1,0,1]
+; AVX512DQ-NEXT: vpshufb %ymm4, %ymm3, %ymm1
+; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm20
; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0
+; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm1
+; AVX512DQ-NEXT: vpshufb %ymm11, %ymm1, %ymm4
+; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm0
+; AVX512DQ-NEXT: vpshufb %ymm14, %ymm0, %ymm11
+; AVX512DQ-NEXT: vpor %ymm4, %ymm11, %ymm4
+; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = zero,ymm0[7],zero,zero,zero,zero,ymm0[10],zero,ymm0[8],zero,zero,zero,zero,ymm0[11],zero,ymm0[9],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero,zero,zero,ymm0[27],zero,ymm0[25]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = ymm1[9],zero,ymm1[7,u,u,u],zero,ymm1[10],zero,ymm1[8,u,u,u],zero,ymm1[11],zero,ymm1[25],zero,ymm1[23,u,u,u],zero,ymm1[26],zero,ymm1[24,u,u,u],zero,ymm1[27],zero
+; AVX512DQ-NEXT: vpor %ymm4, %ymm11, %ymm4
+; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512DQ-NEXT: vmovdqa 32(%rax), %ymm4
+; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm14 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
+; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm29 = zmm14[2,3,2,3],zmm11[2,3,2,3]
+; AVX512DQ-NEXT: vmovdqa64 %ymm16, %ymm4
+; AVX512DQ-NEXT: vpshufb %ymm4, %ymm2, %ymm11
+; AVX512DQ-NEXT: vpshufb %ymm12, %ymm3, %ymm12
+; AVX512DQ-NEXT: vpor %ymm11, %ymm12, %ymm4
+; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20]
+; AVX512DQ-NEXT: # ymm11 = mem[0,1,0,1]
+; AVX512DQ-NEXT: vpshufb %ymm11, %ymm2, %ymm2
+; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128]
+; AVX512DQ-NEXT: # ymm4 = mem[0,1,0,1]
+; AVX512DQ-NEXT: vpshufb %ymm4, %ymm3, %ymm3
+; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm18
+; AVX512DQ-NEXT: vpor %ymm2, %ymm3, %ymm2
+; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX512DQ-NEXT: vmovdqa64 %ymm22, %ymm2
+; AVX512DQ-NEXT: vpshufb %ymm2, %ymm1, %ymm2
+; AVX512DQ-NEXT: vmovdqa64 %ymm17, %ymm3
+; AVX512DQ-NEXT: vpshufb %ymm3, %ymm0, %ymm3
+; AVX512DQ-NEXT: vpor %ymm2, %ymm3, %ymm2
+; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128]
+; AVX512DQ-NEXT: # ymm4 = mem[0,1,0,1]
+; AVX512DQ-NEXT: vpshufb %ymm4, %ymm1, %ymm1
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm12 = [2,3,4,5,128,3,128,5,4,5,6,128,4,128,6,7,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23]
+; AVX512DQ-NEXT: vpshufb %ymm12, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22]
; AVX512DQ-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm0
-; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm18
-; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128]
-; AVX512DQ-NEXT: # ymm3 = mem[0,1,0,1]
-; AVX512DQ-NEXT: vpshufb %ymm3, %ymm15, %ymm1
-; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm19
+; AVX512DQ-NEXT: vpshufb %ymm1, %ymm6, %ymm0
+; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm16
+; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128]
+; AVX512DQ-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX512DQ-NEXT: vpshufb %ymm2, %ymm13, %ymm1
+; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm17
; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vmovdqa64 %ymm24, %ymm0
-; AVX512DQ-NEXT: vpshufb %ymm0, %ymm15, %ymm0
-; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm1
-; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm1
+; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm0
+; AVX512DQ-NEXT: vpshufb %ymm0, %ymm13, %ymm0
+; AVX512DQ-NEXT: vpshufb %ymm5, %ymm6, %ymm1
; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm2
-; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm1
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm12 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128]
-; AVX512DQ-NEXT: vpshufb %xmm12, %xmm1, %xmm0
-; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm16
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm14 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9]
+; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm0
+; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128]
+; AVX512DQ-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9]
+; AVX512DQ-NEXT: vpshufb %xmm5, %xmm2, %xmm1
+; AVX512DQ-NEXT: vmovdqa64 %xmm5, %xmm19
+; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm27
+; AVX512DQ-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm2
+; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm1
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u]
+; AVX512DQ-NEXT: vpshufb %xmm6, %xmm1, %xmm0
+; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm31
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm14 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u]
; AVX512DQ-NEXT: vpshufb %xmm14, %xmm2, %xmm1
-; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm28
+; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm30
; AVX512DQ-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm3
-; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm0
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm11 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u]
-; AVX512DQ-NEXT: vpshufb %xmm11, %xmm0, %xmm1
-; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm31
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u]
-; AVX512DQ-NEXT: vpshufb %xmm0, %xmm3, %xmm2
-; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm30
-; AVX512DQ-NEXT: vpor %xmm1, %xmm2, %xmm1
-; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqa (%r9), %xmm1
-; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512DQ-NEXT: vmovdqa (%r8), %xmm3
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[4,u,u,u],zero,xmm1[7],zero,xmm1[5,u,u,u],zero,xmm1[8],zero,xmm1[6]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[4],zero,xmm3[u,u,u,7],zero,xmm3[5],zero,xmm3[u,u,u,8],zero,xmm3[6],zero
-; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm23
-; AVX512DQ-NEXT: vpor %xmm1, %xmm2, %xmm1
-; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm10[23],zero,zero,zero,zero,ymm10[26],zero,ymm10[24],zero,zero,zero,zero,ymm10[27],zero,ymm10[25]
-; AVX512DQ-NEXT: vmovdqa64 %ymm17, %ymm3
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm3[23,u,u,u],zero,ymm3[26],zero,ymm3[24,u,u,u],zero,ymm3[27],zero
-; AVX512DQ-NEXT: vpor %ymm1, %ymm2, %ymm1
-; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vmovdqa64 %ymm26, %ymm1
-; AVX512DQ-NEXT: vpshufb %ymm1, %ymm3, %ymm1
-; AVX512DQ-NEXT: vpshufb %ymm13, %ymm10, %ymm2
-; AVX512DQ-NEXT: vpor %ymm1, %ymm2, %ymm1
-; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm8[25],zero,ymm8[23],zero,zero,zero,zero,ymm8[26],zero,ymm8[24],zero,zero,zero,zero
-; AVX512DQ-NEXT: vmovdqa64 %ymm22, %ymm2
-; AVX512DQ-NEXT: vpshufb %ymm2, %ymm7, %ymm2
-; AVX512DQ-NEXT: vpor %ymm1, %ymm2, %ymm1
-; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vpshufb %ymm4, %ymm8, %ymm1
-; AVX512DQ-NEXT: vpshufb %ymm9, %ymm7, %ymm2
-; AVX512DQ-NEXT: vpor %ymm1, %ymm2, %ymm1
-; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm1
-; AVX512DQ-NEXT: vpshufb %ymm1, %ymm5, %ymm1
-; AVX512DQ-NEXT: vmovdqa64 %ymm18, %ymm2
-; AVX512DQ-NEXT: vpshufb %ymm2, %ymm5, %ymm2
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
-; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm6[23],zero,ymm6[23,24,25,26],zero,ymm6[24],zero,ymm6[30,31]
-; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm2
-; AVX512DQ-NEXT: vpshufb %ymm2, %ymm6, %ymm2
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
-; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-NEXT: vmovdqa (%rax), %ymm4
+; AVX512DQ-NEXT: vmovdqa (%r8), %xmm2
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6]
+; AVX512DQ-NEXT: vpshufb %xmm5, %xmm1, %xmm0
+; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm28
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[4],zero,xmm2[u,u,u,7],zero,xmm2[5],zero,xmm2[u,u,u,8],zero,xmm2[6],zero
+; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm22
+; AVX512DQ-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm0
+; AVX512DQ-NEXT: vpshufb %ymm0, %ymm9, %ymm0
; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm1
-; AVX512DQ-NEXT: vpshufb %ymm1, %ymm4, %ymm2
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,5,4,0,5,0,4,0,20,21,0,23,0,21,0,23]
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
-; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm3, %zmm18
+; AVX512DQ-NEXT: vpshufb %ymm1, %ymm8, %ymm1
+; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0
+; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vpshufb %ymm11, %ymm9, %ymm0
+; AVX512DQ-NEXT: vmovdqa64 %ymm18, %ymm1
+; AVX512DQ-NEXT: vpshufb %ymm1, %ymm8, %ymm1
+; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0
+; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vmovdqa64 %ymm23, %ymm2
+; AVX512DQ-NEXT: vpshufb %ymm4, %ymm2, %ymm0
+; AVX512DQ-NEXT: vpshufb %ymm12, %ymm10, %ymm1
+; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0
+; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm10[23],zero,zero,zero,zero,ymm10[26],zero,ymm10[24],zero,zero,zero,zero,ymm10[27],zero,ymm10[25]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm2[23,u,u,u],zero,ymm2[26],zero,ymm2[24,u,u,u],zero,ymm2[27],zero
+; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0
+; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm7[25],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero
+; AVX512DQ-NEXT: vmovdqa64 %ymm16, %ymm1
+; AVX512DQ-NEXT: vpshufb %ymm1, %ymm7, %ymm1
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm15[23],zero,ymm15[23,24,25,26],zero,ymm15[24],zero,ymm15[30,31]
+; AVX512DQ-NEXT: vmovdqa64 %ymm17, %ymm1
+; AVX512DQ-NEXT: vpshufb %ymm1, %ymm15, %ymm1
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-NEXT: vmovdqa (%rax), %ymm4
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,5,4,0,5,0,4,0,20,21,0,23,0,21,0,23]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm4[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
+; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm2, %zmm16
; AVX512DQ-NEXT: vmovdqa 32(%rcx), %xmm13
-; AVX512DQ-NEXT: vpshufb %xmm11, %xmm13, %xmm2
-; AVX512DQ-NEXT: vmovdqa 32(%rdx), %xmm10
-; AVX512DQ-NEXT: vpshufb %xmm0, %xmm10, %xmm0
-; AVX512DQ-NEXT: vporq %xmm2, %xmm0, %xmm26
-; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm11
-; AVX512DQ-NEXT: vpshufb %xmm12, %xmm11, %xmm0
-; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm8
-; AVX512DQ-NEXT: vpshufb %xmm14, %xmm8, %xmm2
-; AVX512DQ-NEXT: vporq %xmm0, %xmm2, %xmm24
+; AVX512DQ-NEXT: vpshufb %xmm6, %xmm13, %xmm0
+; AVX512DQ-NEXT: vmovdqa 32(%rdx), %xmm11
+; AVX512DQ-NEXT: vpshufb %xmm14, %xmm11, %xmm2
+; AVX512DQ-NEXT: vporq %xmm0, %xmm2, %xmm25
+; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm12
+; AVX512DQ-NEXT: vpshufb %xmm3, %xmm12, %xmm0
+; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm9
+; AVX512DQ-NEXT: vmovdqa64 %xmm19, %xmm1
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm9, %xmm2
+; AVX512DQ-NEXT: vporq %xmm0, %xmm2, %xmm23
; AVX512DQ-NEXT: vmovdqa 32(%rax), %xmm1
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,0,0,0,2,3,0,1,0,18,0,19,18,0,19,0]
+; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,0,0,0,2,3,0,1,0,18,0,19,18,0,19,0]
; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,5,6]
; AVX512DQ-NEXT: vpshufb %xmm3, %xmm1, %xmm2
-; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm27
-; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm25
-; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm2, %zmm20
-; AVX512DQ-NEXT: vmovdqa 32(%r9), %xmm15
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm15[4,u,u,u],zero,xmm15[7],zero,xmm15[5,u,u,u],zero,xmm15[8],zero,xmm15[6]
+; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm26
+; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm24
+; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm2, %zmm19
+; AVX512DQ-NEXT: vmovdqa 32(%r9), %xmm6
+; AVX512DQ-NEXT: vpshufb %xmm5, %xmm6, %xmm0
; AVX512DQ-NEXT: vmovdqa 32(%r8), %xmm5
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm5[4],zero,xmm5[u,u,u,7],zero,xmm5[5],zero,xmm5[u,u,u,8],zero,xmm5[6],zero
-; AVX512DQ-NEXT: vporq %xmm0, %xmm3, %xmm22
+; AVX512DQ-NEXT: vporq %xmm0, %xmm3, %xmm20
; AVX512DQ-NEXT: vmovdqa (%rax), %xmm2
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,5,5,6]
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1]
; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [128,13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128]
-; AVX512DQ-NEXT: vpshufb %ymm9, %ymm4, %ymm6
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm3, %zmm3
-; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm8[0],xmm11[0],xmm8[1],xmm11[1],xmm8[2],xmm11[2],xmm8[3],xmm11[3],xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
-; AVX512DQ-NEXT: vpshufb %xmm0, %xmm6, %xmm6
-; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm21
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [128,13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128]
+; AVX512DQ-NEXT: vpshufb %ymm10, %ymm4, %ymm7
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm3, %zmm3
+; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3],xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
+; AVX512DQ-NEXT: vpshufb %xmm15, %xmm7, %xmm7
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm0[2,3,2,3],zmm6[0,1,0,1]
-; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3],xmm10[4],xmm13[4],xmm10[5],xmm13[5],xmm10[6],xmm13[6],xmm10[7],xmm13[7]
+; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm0[2,3,2,3],zmm7[0,1,0,1]
+; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm11[0],xmm13[0],xmm11[1],xmm13[1],xmm11[2],xmm13[2],xmm11[3],xmm13[3],xmm11[4],xmm13[4],xmm11[5],xmm13[5],xmm11[6],xmm13[6],xmm11[7],xmm13[7]
; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm14 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
-; AVX512DQ-NEXT: vpshufb %xmm14, %xmm7, %xmm7
+; AVX512DQ-NEXT: vpshufb %xmm14, %xmm8, %xmm8
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm0[2,3,2,3],zmm7[0,1,0,1]
-; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm15[0],xmm5[1],xmm15[1],xmm5[2],xmm15[2],xmm5[3],xmm15[3],xmm5[4],xmm15[4],xmm5[5],xmm15[5],xmm5[6],xmm15[6],xmm5[7],xmm15[7]
+; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm0[2,3,2,3],zmm8[0,1,0,1]
+; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
; AVX512DQ-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
-; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm19[2,3,2,3],zmm0[0,1,0,1]
+; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm21
+; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm18[2,3,2,3],zmm0[0,1,0,1]
; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm17
; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX512DQ-NEXT: vpshufb %ymm9, %ymm0, %ymm9
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,5,7]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,2]
-; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm9, %zmm9
-; AVX512DQ-NEXT: vmovdqa64 %xmm28, %xmm4
-; AVX512DQ-NEXT: vmovdqa64 %xmm16, %xmm0
-; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15]
-; AVX512DQ-NEXT: vmovdqa64 %xmm12, %xmm16
-; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
-; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm0
-; AVX512DQ-NEXT: vpshufb %xmm0, %xmm4, %xmm4
-; AVX512DQ-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm28 # 64-byte Folded Reload
-; AVX512DQ-NEXT: # zmm28 = zmm4[0,1,0,1],mem[0,1,0,1]
-; AVX512DQ-NEXT: vmovdqa64 %xmm30, %xmm0
-; AVX512DQ-NEXT: vmovdqa64 %xmm31, %xmm12
-; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm12[8],xmm0[8],xmm12[9],xmm0[9],xmm12[10],xmm0[10],xmm12[11],xmm0[11],xmm12[12],xmm0[12],xmm12[13],xmm0[13],xmm12[14],xmm0[14],xmm12[15],xmm0[15]
-; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7]
-; AVX512DQ-NEXT: vpshufb %xmm14, %xmm12, %xmm12
-; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm14 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = zmm6 ^ (zmm14 & (zmm7 ^ zmm6))
-; AVX512DQ-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
-; AVX512DQ-NEXT: # ymm6 = mem[2,3,2,3]
+; AVX512DQ-NEXT: vpshufb %ymm10, %ymm0, %ymm4
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm10 = ymm0[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,1,1,3,4,5,5,7]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,3,2]
+; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm10, %ymm10
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm4, %zmm4
+; AVX512DQ-NEXT: vmovdqa64 %xmm27, %xmm1
+; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX512DQ-NEXT: vpshufb %xmm15, %xmm0, %xmm0
+; AVX512DQ-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm27 # 64-byte Folded Reload
+; AVX512DQ-NEXT: # zmm27 = zmm0[0,1,0,1],mem[0,1,0,1]
+; AVX512DQ-NEXT: vmovdqa64 %xmm30, %xmm1
+; AVX512DQ-NEXT: vmovdqa64 %xmm31, %xmm15
+; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm15[8],xmm1[8],xmm15[9],xmm1[9],xmm15[10],xmm1[10],xmm15[11],xmm1[11],xmm15[12],xmm1[12],xmm15[13],xmm1[13],xmm15[14],xmm1[14],xmm15[15],xmm1[15]
+; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7]
+; AVX512DQ-NEXT: vpshufb %xmm14, %xmm15, %xmm14
+; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm15 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm7 ^ (zmm15 & (zmm8 ^ zmm7))
+; AVX512DQ-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
+; AVX512DQ-NEXT: # ymm7 = mem[2,3,2,3]
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm31, %zmm6
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm31, %zmm7
; AVX512DQ-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm31 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm31 = mem[2,3,2,3]
; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm31, %zmm30, %zmm31
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm31 = zmm6 ^ (zmm14 & (zmm31 ^ zmm6))
-; AVX512DQ-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload
-; AVX512DQ-NEXT: # zmm6 = zmm12[0,1,0,1],mem[0,1,0,1]
-; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX512DQ-NEXT: vmovdqa64 %xmm23, %xmm14
-; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm14[8],xmm0[8],xmm14[9],xmm0[9],xmm14[10],xmm0[10],xmm14[11],xmm0[11],xmm14[12],xmm0[12],xmm14[13],xmm0[13],xmm14[14],xmm0[14],xmm14[15],xmm0[15]
-; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3],xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7]
-; AVX512DQ-NEXT: vpshufb %xmm1, %xmm14, %xmm14
-; AVX512DQ-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm23 # 64-byte Folded Reload
-; AVX512DQ-NEXT: # zmm23 = zmm14[0,1,0,1],mem[0,1,0,1]
-; AVX512DQ-NEXT: vmovdqa64 %xmm27, %xmm0
-; AVX512DQ-NEXT: vpshufb %xmm0, %xmm2, %xmm14
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm31 = zmm7 ^ (zmm15 & (zmm31 ^ zmm7))
+; AVX512DQ-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm7 # 64-byte Folded Reload
+; AVX512DQ-NEXT: # zmm7 = zmm14[0,1,0,1],mem[0,1,0,1]
+; AVX512DQ-NEXT: vmovdqa64 %xmm28, %xmm1
+; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm15
+; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm15[8],xmm1[8],xmm15[9],xmm1[9],xmm15[10],xmm1[10],xmm15[11],xmm1[11],xmm15[12],xmm1[12],xmm15[13],xmm1[13],xmm15[14],xmm1[14],xmm15[15],xmm1[15]
+; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3],xmm15[4],xmm1[4],xmm15[5],xmm1[5],xmm15[6],xmm1[6],xmm15[7],xmm1[7]
+; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm1
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm15, %xmm15
+; AVX512DQ-NEXT: vshufi64x2 $0, {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm22 # 64-byte Folded Reload
+; AVX512DQ-NEXT: # zmm22 = zmm15[0,1,0,1],mem[0,1,0,1]
+; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm1
+; AVX512DQ-NEXT: vpshufb %xmm1, %xmm2, %xmm15
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[1,1,0,0,4,5,6,7]
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm14, %zmm1, %zmm1
-; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm13[8],xmm10[8],xmm13[9],xmm10[9],xmm13[10],xmm10[10],xmm13[11],xmm10[11],xmm13[12],xmm10[12],xmm13[13],xmm10[13],xmm13[14],xmm10[14],xmm13[15],xmm10[15]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm15, %zmm1, %zmm28
+; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15]
; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm13 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
-; AVX512DQ-NEXT: vpshufb %xmm13, %xmm4, %xmm4
-; AVX512DQ-NEXT: vpshufb %xmm13, %xmm10, %xmm10
-; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm26[0,1,0,1],zmm10[0,1,0,1]
-; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm11[8],xmm8[8],xmm11[9],xmm8[9],xmm11[10],xmm8[10],xmm11[11],xmm8[11],xmm11[12],xmm8[12],xmm11[13],xmm8[13],xmm11[14],xmm8[14],xmm11[15],xmm8[15]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm11 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
-; AVX512DQ-NEXT: vmovdqa64 %xmm16, %xmm0
-; AVX512DQ-NEXT: vpshufb %xmm11, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpshufb %xmm11, %xmm8, %xmm8
-; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm24[0,1,0,1],zmm8[0,1,0,1]
-; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm15[8],xmm5[9],xmm15[9],xmm5[10],xmm15[10],xmm5[11],xmm15[11],xmm5[12],xmm15[12],xmm5[13],xmm15[13],xmm5[14],xmm15[14],xmm5[15],xmm15[15]
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
-; AVX512DQ-NEXT: vpshufb %xmm5, %xmm12, %xmm11
-; AVX512DQ-NEXT: vpshufb %xmm5, %xmm2, %xmm2
-; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm22[0,1,0,1],zmm2[0,1,0,1]
-; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload
-; AVX512DQ-NEXT: # zmm5 = zmm5[2,3,2,3],mem[2,3,2,3]
-; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 64-byte Folded Reload
-; AVX512DQ-NEXT: # zmm12 = zmm12[2,3,2,3],mem[2,3,2,3]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm5))
-; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
-; AVX512DQ-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm5 # 64-byte Folded Reload
-; AVX512DQ-NEXT: # zmm5 = zmm5[2,3,2,3],mem[2,3,2,3]
+; AVX512DQ-NEXT: vpshufb %xmm13, %xmm0, %xmm0
+; AVX512DQ-NEXT: vpshufb %xmm13, %xmm11, %xmm11
+; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm25[0,1,0,1],zmm11[0,1,0,1]
+; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm12 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
+; AVX512DQ-NEXT: vpshufb %xmm12, %xmm10, %xmm10
+; AVX512DQ-NEXT: vpshufb %xmm12, %xmm9, %xmm9
+; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm23[0,1,0,1],zmm9[0,1,0,1]
+; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15]
+; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm6 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
+; AVX512DQ-NEXT: vpshufb %xmm6, %xmm14, %xmm12
+; AVX512DQ-NEXT: vpshufb %xmm6, %xmm2, %xmm2
+; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm20[0,1,0,1],zmm2[0,1,0,1]
+; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm6 # 64-byte Folded Reload
+; AVX512DQ-NEXT: # zmm6 = zmm1[2,3,2,3],mem[2,3,2,3]
+; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm13 # 64-byte Folded Reload
+; AVX512DQ-NEXT: # zmm13 = zmm1[2,3,2,3],mem[2,3,2,3]
+; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm14 # 64-byte Folded Reload
+; AVX512DQ-NEXT: # zmm14 = zmm1[2,3,2,3],mem[2,3,2,3]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm14 = zmm13 ^ (mem & (zmm14 ^ zmm13))
; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm13 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (zmm13 & (zmm5 ^ zmm12))
-; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
-; AVX512DQ-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm12 # 64-byte Folded Reload
-; AVX512DQ-NEXT: # zmm12 = zmm12[2,3,2,3],mem[2,3,2,3]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm14 = zmm6 ^ (zmm13 & (zmm14 ^ zmm6))
+; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm6 # 64-byte Folded Reload
+; AVX512DQ-NEXT: # zmm6 = zmm1[2,3,2,3],mem[2,3,2,3]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1]
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm14 = ymm17[2,3,2,3]
-; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm15
-; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,1,0,0,4,5,6,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[0,1,2,0]
-; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
-; AVX512DQ-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm17, %zmm17 # 64-byte Folded Reload
-; AVX512DQ-NEXT: # zmm17 = zmm17[2,3,2,3],mem[2,3,2,3]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm17 = zmm12 ^ (zmm13 & (zmm17 ^ zmm12))
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm10 ^ (zmm13 & (zmm8 ^ zmm10))
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm17[2,3,2,3]
+; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm1
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,0,0,4,5,6,7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
+; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm17 # 64-byte Folded Reload
+; AVX512DQ-NEXT: # zmm17 = zmm5[2,3,2,3],mem[2,3,2,3]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm17 = zmm17 ^ (zmm13 & (zmm17 ^ zmm6))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = zmm11 ^ (zmm13 & (zmm9 ^ zmm11))
+; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm6 # 32-byte Folded Reload
; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
-; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 32-byte Folded Reload
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm0))
-; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm0 # 32-byte Folded Reload
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 | (zmm0 & mem)
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm4))
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm15[0,0,1,0]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm14, %zmm0
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm19))
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm7))
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm29 = zmm29 ^ (mem & (zmm29 ^ zmm5))
-; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-NEXT: vshufi64x2 $84, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
-; AVX512DQ-NEXT: # zmm4 = zmm4[0,1,2,3],mem[2,3,2,3]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 | (zmm4 & mem)
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (mem & (zmm9 ^ zmm31))
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm28))
-; AVX512DQ-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,0,1,0,4,4,5,4]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm23))
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm6))
-; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
-; AVX512DQ-NEXT: vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
-; AVX512DQ-NEXT: vpermq {{.*#+}} zmm4 = zmm4[2,3,2,3,6,7,6,7]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm18 = zmm18 ^ (mem & (zmm18 ^ zmm4))
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm18 = zmm18 ^ (mem & (zmm18 ^ zmm17))
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm20 ^ (mem & (zmm2 ^ zmm20))
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm8))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm6))
+; AVX512DQ-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 32-byte Folded Reload
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 | (zmm6 & mem)
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm0))
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm1[0,0,1,0]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm15, %zmm0
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm18))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm8))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm29 = zmm29 ^ (mem & (zmm29 ^ zmm14))
+; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-NEXT: vshufi64x2 $84, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm1 # 64-byte Folded Reload
+; AVX512DQ-NEXT: # zmm1 = zmm1[0,1,2,3],mem[2,3,2,3]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm1 & mem)
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm31))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm27))
+; AVX512DQ-NEXT: vpermq {{.*#+}} zmm1 = zmm28[0,0,1,0,4,4,5,4]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm22))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm7))
+; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
+; AVX512DQ-NEXT: vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm5, %zmm6 # 64-byte Folded Reload
+; AVX512DQ-NEXT: vpermq {{.*#+}} zmm6 = zmm6[2,3,2,3,6,7,6,7]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (mem & (zmm16 ^ zmm6))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm16 = zmm16 ^ (mem & (zmm16 ^ zmm17))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm19 ^ (mem & (zmm2 ^ zmm19))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm9))
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT: vmovdqa64 %zmm2, 256(%rax)
-; AVX512DQ-NEXT: vmovdqa64 %zmm18, 128(%rax)
+; AVX512DQ-NEXT: vmovdqa64 %zmm16, 128(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rax)
-; AVX512DQ-NEXT: vmovdqa64 %zmm9, 320(%rax)
+; AVX512DQ-NEXT: vmovdqa64 %zmm4, 320(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm29, 384(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm0, 192(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm3, 64(%rax)
@@ -9552,1574 +9538,1506 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
;
; AVX512DQ-FCP-LABEL: store_i8_stride7_vf64:
; AVX512DQ-FCP: # %bb.0:
-; AVX512DQ-FCP-NEXT: subq $1416, %rsp # imm = 0x588
-; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm12
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm12[14],zero,zero,zero,zero,zero,zero,ymm12[15],zero,zero,zero,zero,zero,zero,ymm12[16],zero,zero,zero,zero,zero,zero,ymm12[17],zero,zero,zero,zero,zero,zero,ymm12[18]
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm9
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[0,1,14],zero,ymm9[12,13,0,1,14,15],zero,ymm9[3,12,13,2,3,16],zero,ymm9[30,31,28,29,16,17],zero,ymm9[31,18,19,28,29,18],zero
+; AVX512DQ-FCP-NEXT: subq $1480, %rsp # imm = 0x5C8
+; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm9
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm9[14],zero,zero,zero,zero,zero,zero,ymm9[15],zero,zero,zero,zero,zero,zero,ymm9[16],zero,zero,zero,zero,zero,zero,ymm9[17],zero,zero,zero,zero,zero,zero,ymm9[18]
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm6
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[0,1,14],zero,ymm6[12,13,0,1,14,15],zero,ymm6[3,12,13,2,3,16],zero,ymm6[30,31,28,29,16,17],zero,ymm6[31,18,19,28,29,18],zero
; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm6
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm4
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[0,1,0,1,14],zero,ymm4[14,15,0,1,14,15],zero,ymm4[13,14,15,16,17,16],zero,ymm4[30,31,30,31,16,17],zero,ymm4[31,28,29,30,31]
-; AVX512DQ-FCP-NEXT: vpor %ymm1, %ymm3, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm4
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,ymm4[14],zero,zero,zero,zero,zero,zero,ymm4[15],zero,zero,zero,zero,zero,zero,ymm4[16],zero,zero,zero,zero,zero,zero,ymm4[17],zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm3
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm3[0,1,0,1,14],zero,ymm3[14,15,0,1,14,15],zero,ymm3[13,14,15,16,17,16],zero,ymm3[30,31,30,31,16,17],zero,ymm3[31,28,29,30,31]
+; AVX512DQ-FCP-NEXT: vpor %ymm1, %ymm5, %ymm0
; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm8
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,ymm8[14],zero,zero,zero,zero,zero,zero,ymm8[15],zero,zero,zero,zero,zero,zero,ymm8[16],zero,zero,zero,zero,zero,zero,ymm8[17],zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm11
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm11[13,u,u,u,u,u],zero,ymm11[14,u,u,u,u,u],zero,ymm11[15,u,u,u,u,u],zero,ymm11[16,u,u,u,u,u],zero,ymm11[17,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %ymm3, %ymm5, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm15
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,ymm15[14],zero,zero,zero,zero,zero,zero,ymm15[15],zero,zero,zero,zero,zero,zero,ymm15[16],zero,zero,zero,zero,zero,zero,ymm15[17],zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm12
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm12[13,u,u,u,u,u],zero,ymm12[14,u,u,u,u,u],zero,ymm12[15,u,u,u,u,u],zero,ymm12[16,u,u,u,u,u],zero,ymm12[17,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %ymm1, %ymm5, %ymm0
; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm0
-; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128]
-; AVX512DQ-FCP-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm3
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm18
-; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm15
-; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29]
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm2
+; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29]
; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm15, %ymm5
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm22
-; AVX512DQ-FCP-NEXT: vpor %ymm3, %ymm5, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25]
-; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm15, %ymm3
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm20
-; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [25,128,23,0,0,0,128,26,128,24,0,0,0,128,27,128,25,128,23,0,0,0,128,26,128,24,0,0,0,128,27,128]
-; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm1, %ymm5
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm25
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm19
-; AVX512DQ-FCP-NEXT: vpor %ymm3, %ymm5, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm21
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %ymm7
+; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128]
+; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm7, %ymm5
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm23
+; AVX512DQ-FCP-NEXT: vpor %ymm1, %ymm5, %ymm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm13
-; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128]
+; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128]
; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm13, %ymm5
+; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm7, %ymm1
; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm24
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %ymm3
-; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm7, %ymm18
+; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31]
; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm7
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm17
-; AVX512DQ-FCP-NEXT: vpor %ymm5, %ymm7, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm5
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm20
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm16
+; AVX512DQ-FCP-NEXT: vpor %ymm1, %ymm5, %ymm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128]
-; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm5
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm26
-; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27]
-; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm13, %ymm7
-; AVX512DQ-FCP-NEXT: vpor %ymm5, %ymm7, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm14
-; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29]
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm13
+; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128]
; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm14, %ymm5
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm21
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %ymm10
-; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,128,27,0,0,0,128,30,128,28,0,0,0,128,31,128]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm13, %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm17
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %ymm11
+; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0,27,0,0,0,128,30,128,28,0,0,0,128,31,128,29,0]
; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm10, %ymm7
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm23
-; AVX512DQ-FCP-NEXT: vpor %ymm5, %ymm7, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm11, %ymm5
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm22
+; AVX512DQ-FCP-NEXT: vpor %ymm1, %ymm5, %ymm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128]
+; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128]
+; AVX512DQ-FCP-NEXT: # ymm10 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm11, %ymm7
+; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27]
; AVX512DQ-FCP-NEXT: # ymm5 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm10, %ymm0
-; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm13, %ymm8
+; AVX512DQ-FCP-NEXT: vpor %ymm7, %ymm8, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm8
+; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128]
+; AVX512DQ-FCP-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm8, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm25
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm7
+; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29]
; AVX512DQ-FCP-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm14, %ymm7
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm27
-; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm7, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm7, %ymm14
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm26
+; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm14, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,ymm7[7],zero,zero,zero,zero,ymm7[10],zero,ymm7[8],zero,zero,zero,zero,ymm7[11],zero,ymm7[9],zero,ymm7[23],zero,zero,zero,zero,ymm7[26],zero,ymm7[24],zero,zero,zero,zero,ymm7[27],zero,ymm7[25]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm8[9],zero,ymm8[7,u,u,u],zero,ymm8[10],zero,ymm8[8,u,u,u],zero,ymm8[11],zero,ymm8[25],zero,ymm8[23,u,u,u],zero,ymm8[26],zero,ymm8[24,u,u,u],zero,ymm8[27],zero
+; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm14, %ymm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT: vmovdqa 32(%rax), %ymm0
; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
-; AVX512DQ-FCP-NEXT: # ymm7 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm0, %ymm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm7, %ymm16
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
-; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm7[2,3,2,3],zmm1[2,3,2,3]
+; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
+; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm19
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
+; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm14[2,3,2,3],zmm1[2,3,2,3]
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm9, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm1
-; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm12, %ymm7
-; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm7, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufb %ymm10, %ymm4, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm1
+; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm12[21],zero,ymm12[19],zero,zero,zero,zero,ymm12[22],zero,ymm12[20],zero,zero
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,ymm9[19],zero,ymm9[21,20,21,22],zero,ymm9[20],zero,ymm9[22,23]
-; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm7, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[18],zero,zero,zero,zero,ymm4[21],zero,ymm4[19],zero,zero,zero,zero,ymm4[22],zero,ymm4[20]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm3[18,19,20,21],zero,ymm3[19],zero,ymm3[25,26,27,22],zero,ymm3[20],zero
+; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm6, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm7
-; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm7, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm9[21],zero,ymm9[19],zero,zero,zero,zero,ymm9[22],zero,ymm9[20],zero,zero
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,ymm6[19],zero,ymm6[21,20,21,22],zero,ymm6[20],zero,ymm6[22,23]
+; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20]
-; AVX512DQ-FCP-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm6, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm26
-; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128]
-; AVX512DQ-FCP-NEXT: # ymm1 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm7
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm20
-; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm7, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero,ymm6[27],zero,ymm6[25]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm9[23,u,u,u],zero,ymm9[26],zero,ymm9[24,u,u,u],zero,ymm9[27],zero
+; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm11, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm11[20],zero,ymm11[18],zero,zero,zero,zero,ymm11[21],zero,ymm11[19],zero,zero,zero,zero,ymm11[22]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm5, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm12, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm12[20],zero,ymm12[18],zero,zero,zero,zero,ymm12[21],zero,ymm12[19],zero,zero,zero,zero,ymm12[22]
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm8, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm8[18],zero,ymm8[20,21,20,21],zero,ymm8[19],zero,ymm8[19,20,21,22],zero
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm5, %zmm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm15, %ymm0
+; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128]
+; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm15, %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm20
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm12, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm9, %ymm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm1
+; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm6, %ymm1
+; AVX512DQ-FCP-NEXT: vporq %ymm0, %ymm1, %ymm29
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm0
; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm1
-; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm9, %ymm5
-; AVX512DQ-FCP-NEXT: vporq %ymm0, %ymm5, %ymm30
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm17, %ymm1
-; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm6, %ymm2
-; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm2, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm1
+; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %ymm4
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm0
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm29 = [4,0,6,0,4,0,6,7,0,17,0,17,0,16,16,0]
+; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %ymm3
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm0
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm31 = [4,0,6,0,4,0,6,7,0,17,0,17,0,16,16,0]
; AVX512DQ-FCP-NEXT: vmovdqa 32(%rax), %xmm2
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[1,1,0,0,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm0, %zmm29
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[1,1,0,0,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm31
; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm0
-; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm8, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm15, %ymm0
; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm1
-; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm11, %ymm1
+; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm12, %ymm1
; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm5
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %xmm1
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm1, %xmm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm24
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm5[u,u,u,7],zero,xmm5[5],zero,xmm5[u,u,u,8],zero,xmm5[6],zero,xmm5[u,u]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm5, %xmm23
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm4
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %xmm14
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm14, %xmm0
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[u,u,u,7],zero,xmm4[5],zero,xmm4[u,u,u,8],zero,xmm4[6],zero,xmm4[u,u]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm28
; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm7
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm12
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm12, %xmm5
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm7, %xmm6
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm7, %xmm21
-; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm6, %xmm5
-; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,5,5,6]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm6, %xmm16
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm6
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm1
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm4
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm27
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm6, %xmm5
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm6, %xmm23
+; AVX512DQ-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,5,5,6]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm2, %xmm2
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm25 = [0,0,0,0,2,3,0,1,0,18,0,19,18,0,19,0]
-; AVX512DQ-FCP-NEXT: vpermi2d %zmm5, %zmm2, %zmm25
-; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %xmm11
+; AVX512DQ-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm25
+; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %xmm4
; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %xmm5
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm11, %xmm2
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm4, %xmm2
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm22
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm5, %xmm4
; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm6, %xmm17
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm5, %xmm6
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm7, %xmm18
-; AVX512DQ-FCP-NEXT: vporq %xmm2, %xmm6, %xmm31
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm13[0,1,0,1,14],zero,ymm13[14,15,0,1,14,15],zero,ymm13[13,14,15,16,17,16],zero,ymm13[30,31,30,31,16,17],zero,ymm13[31,28,29,30,31]
-; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm6, %ymm2
+; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm4, %xmm2
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm2
-; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm3, %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm3
-; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm13, %ymm3
-; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,ymm11[14],zero,zero,zero,zero,zero,zero,ymm11[15],zero,zero,zero,zero,zero,zero,ymm11[16],zero,zero,zero,zero,zero,zero,ymm11[17],zero,zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm13[0,1,0,1,14],zero,ymm13[14,15,0,1,14,15],zero,ymm13[13,14,15,16,17,16],zero,ymm13[30,31,30,31,16,17],zero,ymm13[31,28,29,30,31]
+; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm4, %ymm2
+; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm11[18],zero,zero,zero,zero,ymm11[21],zero,ymm11[19],zero,zero,zero,zero,ymm11[22],zero,ymm11[20]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm13[18,19,20,21],zero,ymm13[19],zero,ymm13[25,26,27,22],zero,ymm13[20],zero
+; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm4, %ymm2
; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm6
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero,zero,zero,zero,ymm6[18]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm15[0,1,14],zero,ymm15[12,13,0,1,14,15],zero,ymm15[3,12,13,2,3,16],zero,ymm15[30,31,28,29,16,17],zero,ymm15[31,18,19,28,29,18],zero
-; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,ymm8[14],zero,zero,zero,zero,zero,zero,ymm8[15],zero,zero,zero,zero,zero,zero,ymm8[16],zero,zero,zero,zero,zero,zero,ymm8[17],zero,zero,zero,zero,zero,zero,ymm8[18]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm7[0,1,14],zero,ymm7[12,13,0,1,14,15],zero,ymm7[3,12,13,2,3,16],zero,ymm7[30,31,28,29,16,17],zero,ymm7[31,18,19,28,29,18],zero
+; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm4, %ymm2
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm6[21],zero,ymm6[19],zero,zero,zero,zero,ymm6[22],zero,ymm6[20],zero,zero
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,ymm15[19],zero,ymm15[21,20,21,22],zero,ymm15[20],zero,ymm15[22,23]
-; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm8[21],zero,ymm8[19],zero,zero,zero,zero,ymm8[22],zero,ymm8[20],zero,zero
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,ymm7[19],zero,ymm7[21,20,21,22],zero,ymm7[20],zero,ymm7[22,23]
+; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm4, %ymm2
; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm10[20],zero,ymm10[18],zero,zero,zero,zero,ymm10[21],zero,ymm10[19],zero,zero,zero,zero,ymm10[22]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm14[18],zero,ymm14[20,21,20,21],zero,ymm14[19],zero,ymm14[19,20,21,22],zero
-; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm6
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[20],zero,ymm6[18],zero,zero,zero,zero,ymm6[21],zero,ymm6[19],zero,zero,zero,zero,ymm6[22]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm7
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm4
+; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm7, %ymm4
+; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm4, %ymm2
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,ymm14[14],zero,zero,zero,zero,zero,zero,ymm14[15],zero,zero,zero,zero,zero,zero,ymm14[16],zero,zero,zero,zero,zero,zero,ymm14[17],zero,zero,zero,zero
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[13,u,u,u,u,u],zero,ymm10[14,u,u,u,u,u],zero,ymm10[15,u,u,u,u,u],zero,ymm10[16,u,u,u,u,u],zero,ymm10[17,u,u,u]
-; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,ymm7[14],zero,zero,zero,zero,zero,zero,ymm7[15],zero,zero,zero,zero,zero,zero,ymm7[16],zero,zero,zero,zero,zero,zero,ymm7[17],zero,zero,zero,zero
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[13,u,u,u,u,u],zero,ymm6[14,u,u,u,u,u],zero,ymm6[15,u,u,u,u,u],zero,ymm6[16,u,u,u,u,u],zero,ymm6[17,u,u,u]
+; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm4, %ymm2
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm10
-; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm1
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm9
-; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm9, %xmm0
-; AVX512DQ-FCP-NEXT: vporq %xmm1, %xmm0, %xmm20
-; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm13
-; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm13, %xmm0
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm8
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[u,u,u,7],zero,xmm8[5],zero,xmm8[u,u,u,8],zero,xmm8[6],zero,xmm8[u,u]
-; AVX512DQ-FCP-NEXT: vporq %xmm0, %xmm1, %xmm19
-; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %xmm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm16, %xmm1
-; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm1
+; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm2
+; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm0
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm19
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm8
+; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm8, %xmm1
+; AVX512DQ-FCP-NEXT: vporq %xmm0, %xmm1, %xmm21
+; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm12
+; AVX512DQ-FCP-NEXT: vpshufb %xmm9, %xmm12, %xmm0
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm6
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[u,u,u,7],zero,xmm6[5],zero,xmm6[u,u,u,8],zero,xmm6[6],zero,xmm6[u,u]
+; AVX512DQ-FCP-NEXT: vporq %xmm0, %xmm1, %xmm20
+; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %xmm9
+; AVX512DQ-FCP-NEXT: vpshufb %xmm15, %xmm9, %xmm0
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,1,0,1,0,0,0,0,16,0,16,0,18,19,0,17]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[1,1,0,0,4,5,6,7]
-; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm2, %zmm16
-; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm7
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm17, %xmm1
-; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm7, %xmm3
-; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm6
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm18, %xmm1
-; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm6, %xmm15
-; AVX512DQ-FCP-NEXT: vporq %xmm3, %xmm15, %xmm18
-; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,5,6]
-; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [2,2,3,3,2,2,3,3]
-; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,0,1]
-; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm3, %ymm0
-; AVX512DQ-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [128,13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128]
-; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm3
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm28
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm14
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm4[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[1,1,0,0,4,5,6,7]
+; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm16
+; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm11
+; AVX512DQ-FCP-NEXT: vpshufb %xmm10, %xmm11, %xmm10
+; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm7
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm17, %xmm0
+; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm7, %xmm13
+; AVX512DQ-FCP-NEXT: vporq %xmm10, %xmm13, %xmm18
+; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,5,6]
+; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,2,3,3,2,2,3,3]
+; AVX512DQ-FCP-NEXT: # ymm10 = mem[0,1,0,1]
+; AVX512DQ-FCP-NEXT: vpermd %ymm9, %ymm10, %ymm9
+; AVX512DQ-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm9
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128]
+; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm10
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm30
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm13
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm3[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [4,5,4,5,5,7,4,5,20,21,22,23,20,21,22,23]
-; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm17, %zmm0
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm21, %xmm3
-; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3],xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm4, %xmm4
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm27
-; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm30 = zmm30[2,3,2,3],zmm4[0,1,0,1]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm23, %xmm1
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm24, %xmm15
-; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm4
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm22
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm2[2,3,2,3],zmm4[0,1,0,1]
-; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3],xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm4
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm26
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm2[2,3,2,3],zmm4[0,1,0,1]
-; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm15[8],xmm1[8],xmm15[9],xmm1[9],xmm15[10],xmm1[10],xmm15[11],xmm1[11],xmm15[12],xmm1[12],xmm15[13],xmm1[13],xmm15[14],xmm1[14],xmm15[15],xmm1[15]
-; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm13[8],xmm8[8],xmm13[9],xmm8[9],xmm13[10],xmm8[10],xmm13[11],xmm8[11],xmm13[12],xmm8[12],xmm13[13],xmm8[13],xmm13[14],xmm8[14],xmm13[15],xmm8[15]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm15, %xmm15
-; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm2[0,1,0,1],zmm1[0,1,0,1]
-; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm12[8],xmm3[8],xmm12[9],xmm3[9],xmm12[10],xmm3[10],xmm12[11],xmm3[11],xmm12[12],xmm3[12],xmm12[13],xmm3[13],xmm12[14],xmm3[14],xmm12[15],xmm3[15]
-; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm12, %xmm12
-; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm17, %zmm15
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm23, %xmm9
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm4
+; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3],xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm3
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm27
+; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm29 = zmm29[2,3,2,3],zmm3[0,1,0,1]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm28, %xmm0
+; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm3, %xmm3
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm24
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm28 = zmm1[2,3,2,3],zmm3[0,1,0,1]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm2
+; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm3, %xmm3
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm26
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm1[2,3,2,3],zmm3[0,1,0,1]
+; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm14[8],xmm0[8],xmm14[9],xmm0[9],xmm14[10],xmm0[10],xmm14[11],xmm0[11],xmm14[12],xmm0[12],xmm14[13],xmm0[13],xmm14[14],xmm0[14],xmm14[15],xmm0[15]
+; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm12[8],xmm6[8],xmm12[9],xmm6[9],xmm12[10],xmm6[10],xmm12[11],xmm6[11],xmm12[12],xmm6[12],xmm12[13],xmm6[13],xmm12[14],xmm6[14],xmm12[15],xmm6[15]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm14, %xmm1
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm22
+; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm10, %xmm0
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm1[0,1,0,1],zmm0[0,1,0,1]
+; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm4
+; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm14
+; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm1[0,1,0,1],zmm0[0,1,0,1]
+; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15]
+; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm7[8],xmm11[8],xmm7[9],xmm11[9],xmm7[10],xmm11[10],xmm7[11],xmm11[11],xmm7[12],xmm11[12],xmm7[13],xmm11[13],xmm7[14],xmm11[14],xmm7[15],xmm11[15]
+; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
+; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm1
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm2[0,1,0,1],zmm1[0,1,0,1]
-; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm11[8],xmm5[9],xmm11[9],xmm5[10],xmm11[10],xmm5[11],xmm11[11],xmm5[12],xmm11[12],xmm5[13],xmm11[13],xmm5[14],xmm11[14],xmm5[15],xmm11[15]
-; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15]
-; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
-; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm5, %xmm11
-; AVX512DQ-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm1
-; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm31[0,1,0,1],zmm1[0,1,0,1]
-; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm1
-; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1
-; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
-; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm17, %ymm2
-; AVX512DQ-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
-; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm3
-; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,0,1],zmm20[0,1,0,1]
-; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3],xmm8[4],xmm13[4],xmm8[5],xmm13[5],xmm8[6],xmm13[6],xmm8[7],xmm13[7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm3
-; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm8
-; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,0,1],zmm19[0,1,0,1]
-; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
-; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm26, %xmm3
-; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm6, %xmm6
-; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,0,1],zmm18[0,1,0,1]
+; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm2[0,1,0,1],zmm1[0,1,0,1]
+; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
+; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm1
+; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm9, %ymm1
+; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
+; AVX512DQ-FCP-NEXT: vpermd %ymm9, %ymm17, %ymm9
+; AVX512DQ-FCP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm9
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm1, %zmm1
+; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm2
+; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm8, %xmm8
+; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,0,1],zmm21[0,1,0,1]
+; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3],xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm24, %xmm2
+; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm6, %xmm6
+; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,0,1],zmm20[0,1,0,1]
+; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3],xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm26, %xmm4
+; AVX512DQ-FCP-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,0,1],zmm18[0,1,0,1]
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # zmm4 = zmm4[2,3,2,3],mem[2,3,2,3]
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload
; AVX512DQ-FCP-NEXT: # zmm7 = zmm7[2,3,2,3],mem[2,3,2,3]
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm9 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # zmm9 = zmm9[2,3,2,3],mem[2,3,2,3]
-; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm7 ^ (zmm10 & (zmm9 ^ zmm7))
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm7 = zmm7[2,3,2,3,6,7,6,7]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm7))
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm9))
+; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm9 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm9 & (zmm7 ^ zmm4))
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vporq {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} zmm4 = zmm4[2,3,2,3,6,7,6,7]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (mem & (zmm15 ^ zmm4))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (mem & (zmm15 ^ zmm7))
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax)
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # zmm0 = zmm0[2,3,2,3],mem[2,3,2,3]
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, 128(%rax)
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm4 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # zmm4 = zmm4[2,3,2,3],mem[2,3,2,3]
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm7, %zmm7 # 64-byte Folded Reload
; AVX512DQ-FCP-NEXT: # zmm7 = zmm7[2,3,2,3],mem[2,3,2,3]
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm12[0,1,0,1]
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm15[0,1,0,1]
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm0))
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # zmm0 = zmm0[2,3,2,3],mem[2,3,2,3]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm10 & (zmm0 ^ zmm7))
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm21 = zmm24 ^ (zmm10 & (zmm21 ^ zmm24))
-; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm7 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm23 = zmm30 ^ (zmm7 & (zmm23 ^ zmm30))
-; AVX512DQ-FCP-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm10 = mem[2,3,2,3]
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm13, %zmm10
-; AVX512DQ-FCP-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: # ymm13 = mem[2,3,2,3]
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm3, %zmm13
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm13 = zmm10 ^ (zmm7 & (zmm13 ^ zmm10))
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm7 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%rsp), %zmm12, %zmm9 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 ^ (mem & (zmm9 ^ zmm7))
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm7 # 32-byte Folded Reload
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 | (zmm7 & mem)
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (mem & (zmm14 ^ zmm9))
-; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm0))
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm29 ^ (mem & (zmm4 ^ zmm29))
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm23))
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm14[0,1,0,1]
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm22[0,1,0,1]
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vshufi64x2 $85, {{[-0-9]+}}(%r{{[sb]}}p), %zmm14, %zmm14 # 64-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # zmm14 = zmm14[2,3,2,3],mem[2,3,2,3]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm7 ^ (mem & (zmm14 ^ zmm7))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm4 ^ (zmm9 & (zmm14 ^ zmm4))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm10 = zmm23 ^ (zmm9 & (zmm10 ^ zmm23))
+; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm4 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255]
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm28 = zmm29 ^ (zmm4 & (zmm28 ^ zmm29))
+; AVX512DQ-FCP-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # ymm7 = mem[2,3,2,3]
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm9, %zmm7
+; AVX512DQ-FCP-NEXT: vpermq $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: # ymm9 = mem[2,3,2,3]
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm15, %zmm9
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm9 = zmm7 ^ (zmm4 & (zmm9 ^ zmm7))
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm11, %zmm4 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, (%rsp), %zmm12, %zmm7 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm4))
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 32-byte Folded Reload
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 | (zmm0 & mem)
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (mem & (zmm13 ^ zmm7))
+; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (mem & (zmm4 ^ zmm14))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm31 ^ (mem & (zmm3 ^ zmm31))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm28))
; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm25 ^ (mem & (zmm5 ^ zmm25))
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm21))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm10))
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vshufi64x2 $84, {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm0 # 64-byte Folded Reload
; AVX512DQ-FCP-NEXT: # zmm0 = zmm0[0,1,2,3],mem[2,3,2,3]
; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 | (zmm0 & mem)
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm13))
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm2))
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm16 ^ (mem & (zmm6 ^ zmm16))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (mem & (zmm1 ^ zmm9))
; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm8))
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, (%rax)
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm16 ^ (mem & (zmm2 ^ zmm16))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (mem & (zmm2 ^ zmm6))
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 320(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 256(%rax)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 192(%rax)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 384(%rax)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 64(%rax)
-; AVX512DQ-FCP-NEXT: addq $1416, %rsp # imm = 0x588
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 192(%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 384(%rax)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 64(%rax)
+; AVX512DQ-FCP-NEXT: addq $1480, %rsp # imm = 0x5C8
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
; AVX512BW-LABEL: store_i8_stride7_vf64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm0
-; AVX512BW-NEXT: vmovdqa 32(%rsi), %ymm2
-; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm14
-; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm14[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14]
-; AVX512BW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,3,3,6,6,7,7]
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm4 = [13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14]
-; AVX512BW-NEXT: movl $338170920, %r10d # imm = 0x14281428
-; AVX512BW-NEXT: kmovd %r10d, %k2
-; AVX512BW-NEXT: vpshufb %ymm4, %ymm2, %ymm1 {%k2}
-; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128]
-; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1]
-; AVX512BW-NEXT: vpshufb %ymm9, %ymm2, %ymm3
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25]
-; AVX512BW-NEXT: vpshufb %ymm10, %ymm14, %ymm5
-; AVX512BW-NEXT: vpor %ymm3, %ymm5, %ymm3
-; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm3[2,3,2,3],zmm1[2,3,2,3]
-; AVX512BW-NEXT: vmovdqa 32(%rdx), %ymm15
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27]
-; AVX512BW-NEXT: vpshufb %ymm5, %ymm15, %ymm3
-; AVX512BW-NEXT: vmovdqa64 32(%rcx), %ymm17
-; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128]
-; AVX512BW-NEXT: # ymm11 = mem[0,1,0,1]
-; AVX512BW-NEXT: vpshufb %ymm11, %ymm17, %ymm6
-; AVX512BW-NEXT: vpor %ymm3, %ymm6, %ymm3
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm6 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12]
-; AVX512BW-NEXT: vpshufb %ymm6, %ymm17, %ymm7
-; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm15[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512BW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,2,3,3,4,6,7,7]
+; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm21
+; AVX512BW-NEXT: vmovdqa 32(%rdx), %ymm0
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31]
+; AVX512BW-NEXT: vpshufb %ymm9, %ymm0, %ymm2
+; AVX512BW-NEXT: vmovdqa 32(%rcx), %ymm1
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128]
+; AVX512BW-NEXT: vpshufb %ymm11, %ymm1, %ymm3
+; AVX512BW-NEXT: vpor %ymm2, %ymm3, %ymm2
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm0[18,19,20,21],zero,ymm0[19],zero,ymm0[25,26,27,22],zero,ymm0[20],zero
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[18],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20]
+; AVX512BW-NEXT: vpor %ymm3, %ymm4, %ymm3
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3]
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512BW-NEXT: vmovdqa 32(%rsi), %ymm3
+; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm5
+; AVX512BW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm5[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512BW-NEXT: vpshufd {{.*#+}} ymm6 = ymm4[0,0,1,1,4,4,5,5]
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm4 = [5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6]
; AVX512BW-NEXT: movl $676341840, %r10d # imm = 0x28502850
-; AVX512BW-NEXT: kmovd %r10d, %k3
-; AVX512BW-NEXT: vmovdqu8 %ymm8, %ymm7 {%k3}
-; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[2,3,2,3],zmm7[2,3,2,3]
-; AVX512BW-NEXT: movabsq $1742999440035548184, %r10 # imm = 0x183060C183060C18
-; AVX512BW-NEXT: kmovq %r10, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm1 {%k1}
-; AVX512BW-NEXT: vmovdqa64 32(%r9), %ymm16
-; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128]
-; AVX512BW-NEXT: # ymm7 = mem[0,1,0,1]
-; AVX512BW-NEXT: vpshufb %ymm7, %ymm16, %ymm3
-; AVX512BW-NEXT: vmovdqa64 32(%r8), %ymm18
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29]
-; AVX512BW-NEXT: vpshufb %ymm8, %ymm18, %ymm12
-; AVX512BW-NEXT: vpor %ymm3, %ymm12, %ymm3
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31]
-; AVX512BW-NEXT: vpshufb %ymm12, %ymm18, %ymm19
-; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128]
-; AVX512BW-NEXT: # ymm13 = mem[0,1,0,1]
-; AVX512BW-NEXT: vpshufb %ymm13, %ymm16, %ymm20
-; AVX512BW-NEXT: vporq %ymm19, %ymm20, %ymm19
-; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm19[2,3,2,3],zmm3[2,3,2,3]
-; AVX512BW-NEXT: movabsq $6971997760142192736, %r10 # imm = 0x60C183060C183060
-; AVX512BW-NEXT: kmovq %r10, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm1 {%k1}
-; AVX512BW-NEXT: vmovdqa 32(%rax), %ymm3
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm19 = [11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12,14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15]
-; AVX512BW-NEXT: vpermw %zmm3, %zmm19, %zmm19
-; AVX512BW-NEXT: movabsq $-9150747060186627967, %rax # imm = 0x8102040810204081
-; AVX512BW-NEXT: kmovq %rax, %k4
-; AVX512BW-NEXT: vmovdqu8 %zmm19, %zmm1 {%k4}
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm19 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31]
-; AVX512BW-NEXT: vpshufb %ymm19, %ymm15, %ymm21
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm20 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128]
-; AVX512BW-NEXT: vpshufb %ymm20, %ymm17, %ymm22
-; AVX512BW-NEXT: vporq %ymm21, %ymm22, %ymm21
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm22 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128]
-; AVX512BW-NEXT: vpshufb %ymm22, %ymm15, %ymm15
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm25 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20]
-; AVX512BW-NEXT: # ymm25 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vpshufb %ymm25, %ymm17, %ymm17
-; AVX512BW-NEXT: vporq %ymm15, %ymm17, %ymm15
-; AVX512BW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3]
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm15, %zmm21, %zmm15
-; AVX512BW-NEXT: vpshuflw {{.*#+}} ymm17 = ymm14[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512BW-NEXT: vpshufd {{.*#+}} ymm17 = ymm17[0,0,1,1,4,4,5,5]
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm21 = [5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6]
-; AVX512BW-NEXT: vpshufb %ymm21, %ymm2, %ymm17 {%k3}
+; AVX512BW-NEXT: kmovd %r10d, %k1
+; AVX512BW-NEXT: vpshufb %ymm4, %ymm3, %ymm6 {%k1}
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3]
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128]
+; AVX512BW-NEXT: vpshufb %ymm13, %ymm5, %ymm7
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18]
+; AVX512BW-NEXT: vpshufb %ymm14, %ymm3, %ymm10
+; AVX512BW-NEXT: vpor %ymm7, %ymm10, %ymm7
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm20
+; AVX512BW-NEXT: movabsq $3485998880071096368, %r10 # imm = 0x3060C183060C1830
+; AVX512BW-NEXT: kmovq %r10, %k3
+; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm20 {%k3}
+; AVX512BW-NEXT: vmovdqa 32(%r9), %ymm2
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm15 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128]
+; AVX512BW-NEXT: vpshufb %ymm15, %ymm2, %ymm6
+; AVX512BW-NEXT: vmovdqa 32(%r8), %ymm12
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128]
+; AVX512BW-NEXT: vpshufb %ymm16, %ymm12, %ymm10
+; AVX512BW-NEXT: vpor %ymm6, %ymm10, %ymm6
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm17 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm12[18],zero,ymm12[20,21,20,21],zero,ymm12[19],zero,ymm12[19,20,21,22],zero
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm18 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[20],zero,ymm2[18],zero,zero,zero,zero,ymm2[21],zero,ymm2[19],zero,zero,zero,zero,ymm2[22]
+; AVX512BW-NEXT: vporq %ymm17, %ymm18, %ymm17
; AVX512BW-NEXT: vpermq {{.*#+}} ymm17 = ymm17[2,3,2,3]
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm23 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128]
-; AVX512BW-NEXT: vpshufb %ymm23, %ymm14, %ymm14
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm24 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18]
-; AVX512BW-NEXT: vpshufb %ymm24, %ymm2, %ymm2
-; AVX512BW-NEXT: vpor %ymm2, %ymm14, %ymm2
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm17, %zmm2, %zmm2
-; AVX512BW-NEXT: movabsq $3485998880071096368, %rax # imm = 0x3060C183060C1830
-; AVX512BW-NEXT: kmovq %rax, %k4
-; AVX512BW-NEXT: vmovdqu8 %zmm15, %zmm2 {%k4}
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128]
-; AVX512BW-NEXT: vpshufb %ymm14, %ymm16, %ymm17
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128]
-; AVX512BW-NEXT: vpshufb %ymm15, %ymm18, %ymm26
-; AVX512BW-NEXT: vporq %ymm17, %ymm26, %ymm17
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm26 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128]
-; AVX512BW-NEXT: vpshufb %ymm26, %ymm18, %ymm18
-; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm28 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22]
-; AVX512BW-NEXT: # ymm28 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-NEXT: vpshufb %ymm28, %ymm16, %ymm16
-; AVX512BW-NEXT: vporq %ymm18, %ymm16, %ymm16
-; AVX512BW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[2,3,2,3]
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm16, %zmm17, %zmm16
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm29 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12]
-; AVX512BW-NEXT: vpermw %ymm3, %ymm29, %ymm17
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm18 = ymm3[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31]
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm17, %zmm18, %zmm17
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm17, %zmm6, %zmm17
+; AVX512BW-NEXT: vmovdqa 32(%rax), %ymm8
+; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12]
+; AVX512BW-NEXT: vpermw %ymm8, %ymm6, %ymm18
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm19 = ymm8[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31]
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm18, %zmm19, %zmm18
; AVX512BW-NEXT: movabsq $145249953336295682, %rax # imm = 0x204081020408102
-; AVX512BW-NEXT: kmovq %rax, %k5
-; AVX512BW-NEXT: vmovdqu8 %zmm17, %zmm16 {%k5}
+; AVX512BW-NEXT: kmovq %rax, %k2
+; AVX512BW-NEXT: vmovdqu8 %zmm18, %zmm17 {%k2}
; AVX512BW-NEXT: movabsq $-4357498600088870461, %rax # imm = 0xC3870E1C3870E1C3
+; AVX512BW-NEXT: kmovq %rax, %k2
+; AVX512BW-NEXT: vmovdqu8 %zmm17, %zmm20 {%k2}
+; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm17 = ymm5[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14]
+; AVX512BW-NEXT: vpshufd {{.*#+}} ymm17 = ymm17[2,2,3,3,6,6,7,7]
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm18 = [13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14]
+; AVX512BW-NEXT: movl $338170920, %eax # imm = 0x14281428
+; AVX512BW-NEXT: kmovd %eax, %k4
+; AVX512BW-NEXT: vpshufb %ymm18, %ymm3, %ymm17 {%k4}
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[9],zero,ymm3[7],zero,zero,zero,zero,ymm3[10],zero,ymm3[8],zero,zero,zero,zero,ymm3[11],zero,ymm3[25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27],zero
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 = zero,ymm5[7],zero,zero,zero,zero,ymm5[10],zero,ymm5[8],zero,zero,zero,zero,ymm5[11],zero,ymm5[9],zero,ymm5[23],zero,zero,zero,zero,ymm5[26],zero,ymm5[24],zero,zero,zero,zero,ymm5[27],zero,ymm5[25]
+; AVX512BW-NEXT: vpor %ymm3, %ymm5, %ymm3
+; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm3[2,3,2,3],zmm17[2,3,2,3]
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[8,9],zero,ymm0[7],zero,ymm0[5,6,7,10],zero,ymm0[8],zero,ymm0[12,13,10,11,24,25],zero,ymm0[23],zero,ymm0[21,22,23,26],zero,ymm0[24],zero,ymm0[28,29,26,27]
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm17 = zero,zero,ymm1[9],zero,ymm1[7],zero,zero,zero,zero,ymm1[10],zero,ymm1[8],zero,zero,zero,zero,zero,zero,ymm1[25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero,zero,zero
+; AVX512BW-NEXT: vporq %ymm3, %ymm17, %ymm3
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm26 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12]
+; AVX512BW-NEXT: vpshufb %ymm26, %ymm1, %ymm1
+; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512BW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,3,3,4,6,7,7]
+; AVX512BW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
+; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm3[2,3,2,3],zmm1[2,3,2,3]
+; AVX512BW-NEXT: movabsq $1742999440035548184, %rax # imm = 0x183060C183060C18
+; AVX512BW-NEXT: kmovq %rax, %k2
+; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm7 {%k2}
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm24 = [29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128]
+; AVX512BW-NEXT: # ymm24 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT: vpshufb %ymm24, %ymm2, %ymm0
+; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm25 = [128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29]
+; AVX512BW-NEXT: # ymm25 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-NEXT: vpshufb %ymm25, %ymm12, %ymm1
+; AVX512BW-NEXT: vpor %ymm0, %ymm1, %ymm0
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[8,9,8,9],zero,ymm12[7],zero,ymm12[7,8,9,10],zero,ymm12[8],zero,ymm12[14,15,24,25,24,25],zero,ymm12[23],zero,ymm12[23,24,25,26],zero,ymm12[24],zero,ymm12[30,31]
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm2[9],zero,ymm2[7],zero,zero,zero,zero,ymm2[10],zero,ymm2[8],zero,zero,zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero
+; AVX512BW-NEXT: vpor %ymm1, %ymm2, %ymm1
+; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[2,3,2,3],zmm0[2,3,2,3]
+; AVX512BW-NEXT: movabsq $6971997760142192736, %rax # imm = 0x60C183060C183060
+; AVX512BW-NEXT: kmovq %rax, %k2
+; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm7 {%k2}
+; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12,14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15]
+; AVX512BW-NEXT: vpermw %zmm8, %zmm0, %zmm0
+; AVX512BW-NEXT: movabsq $-9150747060186627967, %rax # imm = 0x8102040810204081
; AVX512BW-NEXT: kmovq %rax, %k5
-; AVX512BW-NEXT: vmovdqu8 %zmm16, %zmm2 {%k5}
-; AVX512BW-NEXT: vmovdqa64 (%rdx), %ymm16
-; AVX512BW-NEXT: vpshufb %ymm5, %ymm16, %ymm5
-; AVX512BW-NEXT: vmovdqa64 (%rcx), %ymm17
-; AVX512BW-NEXT: vpshufb %ymm11, %ymm17, %ymm11
-; AVX512BW-NEXT: vpor %ymm5, %ymm11, %ymm5
-; AVX512BW-NEXT: vpshufb %ymm22, %ymm16, %ymm11
-; AVX512BW-NEXT: vpshufb %ymm25, %ymm17, %ymm18
-; AVX512BW-NEXT: vporq %ymm11, %ymm18, %ymm11
-; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm11[2,3,2,3],zmm5[2,3,2,3]
-; AVX512BW-NEXT: vmovdqa64 (%rsi), %ymm25
-; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm27
-; AVX512BW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm27[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512BW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[0,0,1,1,4,4,5,5]
-; AVX512BW-NEXT: vpshufb %ymm21, %ymm25, %ymm11 {%k3}
-; AVX512BW-NEXT: vpshufb %ymm9, %ymm25, %ymm9
-; AVX512BW-NEXT: vpshufb %ymm10, %ymm27, %ymm10
-; AVX512BW-NEXT: vpor %ymm9, %ymm10, %ymm9
-; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm11[2,3,2,3],zmm9[2,3,2,3]
-; AVX512BW-NEXT: vmovdqu8 %zmm9, %zmm5 {%k1}
-; AVX512BW-NEXT: vmovdqa64 (%r8), %ymm18
-; AVX512BW-NEXT: vpshufb %ymm12, %ymm18, %ymm9
-; AVX512BW-NEXT: vmovdqa64 (%r9), %ymm21
-; AVX512BW-NEXT: vpshufb %ymm13, %ymm21, %ymm10
-; AVX512BW-NEXT: vpor %ymm9, %ymm10, %ymm9
-; AVX512BW-NEXT: vpshufb %ymm26, %ymm18, %ymm10
-; AVX512BW-NEXT: vpshufb %ymm28, %ymm21, %ymm11
-; AVX512BW-NEXT: vpor %ymm10, %ymm11, %ymm10
-; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[2,3,2,3],zmm9[2,3,2,3]
-; AVX512BW-NEXT: vpermw %zmm0, %zmm29, %zmm10
-; AVX512BW-NEXT: movabsq $1161999626690365456, %rax # imm = 0x1020408102040810
-; AVX512BW-NEXT: kmovq %rax, %k5
-; AVX512BW-NEXT: vmovdqu8 %zmm10, %zmm9 {%k5}
-; AVX512BW-NEXT: movabsq $2033499346708139548, %rax # imm = 0x1C3870E1C3870E1C
-; AVX512BW-NEXT: kmovq %rax, %k5
-; AVX512BW-NEXT: vmovdqu8 %zmm9, %zmm5 {%k5}
-; AVX512BW-NEXT: vpshufb %ymm19, %ymm16, %ymm9
-; AVX512BW-NEXT: vpshufb %ymm20, %ymm17, %ymm10
-; AVX512BW-NEXT: vpor %ymm9, %ymm10, %ymm9
-; AVX512BW-NEXT: vmovdqa (%rdx), %xmm10
-; AVX512BW-NEXT: vmovdqa64 32(%rdx), %xmm19
-; AVX512BW-NEXT: vmovdqa (%rcx), %xmm11
-; AVX512BW-NEXT: vmovdqa64 32(%rcx), %xmm20
-; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15]
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm22 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
-; AVX512BW-NEXT: vpshufb %xmm22, %xmm12, %xmm12
-; AVX512BW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1]
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm9, %zmm12, %zmm26
-; AVX512BW-NEXT: vpshufb %ymm23, %ymm27, %ymm9
-; AVX512BW-NEXT: vpshufb %ymm24, %ymm25, %ymm12
-; AVX512BW-NEXT: vpor %ymm9, %ymm12, %ymm9
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm12
-; AVX512BW-NEXT: vmovdqa (%rsi), %xmm13
-; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm24 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15]
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm23 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
-; AVX512BW-NEXT: vpshufb %xmm23, %xmm24, %xmm24
-; AVX512BW-NEXT: vpermq {{.*#+}} ymm24 = ymm24[0,1,0,1]
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm9, %zmm24, %zmm9
+; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm7 {%k5}
+; AVX512BW-NEXT: vmovdqa64 (%rdx), %ymm22
+; AVX512BW-NEXT: vpshufb %ymm9, %ymm22, %ymm0
+; AVX512BW-NEXT: vmovdqa64 (%rcx), %ymm23
+; AVX512BW-NEXT: vpshufb %ymm11, %ymm23, %ymm1
+; AVX512BW-NEXT: vpor %ymm0, %ymm1, %ymm0
+; AVX512BW-NEXT: vmovdqa (%rdx), %xmm11
+; AVX512BW-NEXT: vmovdqa (%rcx), %xmm12
+; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm28
+; AVX512BW-NEXT: vpshufb %ymm13, %ymm28, %ymm1
+; AVX512BW-NEXT: vmovdqa64 (%rsi), %ymm31
+; AVX512BW-NEXT: vpshufb %ymm14, %ymm31, %ymm2
+; AVX512BW-NEXT: vpor %ymm1, %ymm2, %ymm1
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm13
+; AVX512BW-NEXT: vmovdqa (%rsi), %xmm14
+; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15]
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm19 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
+; AVX512BW-NEXT: vpshufb %xmm19, %xmm2, %xmm2
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1]
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm9
; AVX512BW-NEXT: movabsq $435749860008887046, %rax # imm = 0x60C183060C18306
; AVX512BW-NEXT: kmovq %rax, %k5
-; AVX512BW-NEXT: vmovdqu8 %zmm26, %zmm9 {%k5}
-; AVX512BW-NEXT: vpshufb %ymm14, %ymm21, %ymm14
-; AVX512BW-NEXT: vpshufb %ymm15, %ymm18, %ymm15
-; AVX512BW-NEXT: vporq %ymm14, %ymm15, %ymm24
-; AVX512BW-NEXT: vmovdqa (%r9), %xmm14
-; AVX512BW-NEXT: vmovdqa (%r8), %xmm15
-; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm28 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15]
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm26 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
-; AVX512BW-NEXT: vpshufb %xmm26, %xmm28, %xmm28
-; AVX512BW-NEXT: vpermq {{.*#+}} ymm28 = ymm28[0,1,0,1]
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm24, %zmm28, %zmm24
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm28 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6,6,1,6,0,7,1,0,7,14,9,14,8,15,9,8,15]
-; AVX512BW-NEXT: vpermw %zmm0, %zmm28, %zmm28
+; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm9 {%k5}
+; AVX512BW-NEXT: vmovdqa64 (%r9), %ymm29
+; AVX512BW-NEXT: vpshufb %ymm15, %ymm29, %ymm0
+; AVX512BW-NEXT: vmovdqa (%r8), %ymm6
+; AVX512BW-NEXT: vpshufb %ymm16, %ymm6, %ymm1
+; AVX512BW-NEXT: vpor %ymm0, %ymm1, %ymm0
+; AVX512BW-NEXT: vmovdqa (%r9), %xmm15
+; AVX512BW-NEXT: vmovdqa64 (%r8), %xmm16
+; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm16[8],xmm15[8],xmm16[9],xmm15[9],xmm16[10],xmm15[10],xmm16[11],xmm15[11],xmm16[12],xmm15[12],xmm16[13],xmm15[13],xmm16[14],xmm15[14],xmm16[15],xmm15[15]
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm10 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
+; AVX512BW-NEXT: vpshufb %xmm10, %xmm1, %xmm1
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6,6,1,6,0,7,1,0,7,14,9,14,8,15,9,8,15]
+; AVX512BW-NEXT: vpermw %zmm21, %zmm1, %zmm1
; AVX512BW-NEXT: movabsq $2323999253380730912, %rax # imm = 0x2040810204081020
; AVX512BW-NEXT: kmovq %rax, %k5
-; AVX512BW-NEXT: vmovdqu8 %zmm28, %zmm24 {%k5}
-; AVX512BW-NEXT: vmovdqa64 32(%rdi), %xmm28
+; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k5}
+; AVX512BW-NEXT: vmovdqa64 32(%rdi), %xmm27
; AVX512BW-NEXT: movabsq $4066998693416279096, %rax # imm = 0x3870E1C3870E1C38
; AVX512BW-NEXT: kmovq %rax, %k5
-; AVX512BW-NEXT: vmovdqu8 %zmm24, %zmm9 {%k5}
-; AVX512BW-NEXT: vmovdqa64 32(%rsi), %xmm29
-; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm24 = ymm27[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14]
-; AVX512BW-NEXT: vpshufd {{.*#+}} ymm27 = ymm24[2,2,3,3,6,6,7,7]
-; AVX512BW-NEXT: vpshufb %ymm4, %ymm25, %ymm27 {%k2}
-; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm28[0],xmm29[0],xmm28[1],xmm29[1],xmm28[2],xmm29[2],xmm28[3],xmm29[3],xmm28[4],xmm29[4],xmm28[5],xmm29[5],xmm28[6],xmm29[6],xmm28[7],xmm29[7]
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm24 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
-; AVX512BW-NEXT: vpshufb %xmm24, %xmm4, %xmm4
-; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm27 = zmm27[2,3,2,3],zmm4[0,1,0,1]
+; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm9 {%k5}
+; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm28[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14]
+; AVX512BW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7]
+; AVX512BW-NEXT: vpshufb %ymm18, %ymm31, %ymm0 {%k4}
+; AVX512BW-NEXT: vmovdqa64 32(%rsi), %xmm30
+; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm27[0],xmm30[0],xmm27[1],xmm30[1],xmm27[2],xmm30[2],xmm27[3],xmm30[3],xmm27[4],xmm30[4],xmm27[5],xmm30[5],xmm27[6],xmm30[6],xmm27[7],xmm30[7]
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm17 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
+; AVX512BW-NEXT: vpshufb %xmm17, %xmm1, %xmm1
+; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm0[2,3,2,3],zmm1[0,1,0,1]
+; AVX512BW-NEXT: vpshufb %ymm26, %ymm23, %ymm18
+; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm22[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512BW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,3,3,4,6,7,7]
+; AVX512BW-NEXT: vmovdqu8 %ymm0, %ymm18 {%k1}
+; AVX512BW-NEXT: vmovdqa 32(%rdx), %xmm5
+; AVX512BW-NEXT: vmovdqa 32(%rcx), %xmm2
+; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm26 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
+; AVX512BW-NEXT: vpshufb %xmm26, %xmm0, %xmm0
+; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm18[2,3,2,3],zmm0[0,1,0,1]
+; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm18 {%k3}
+; AVX512BW-NEXT: vpshufb %ymm24, %ymm29, %ymm0
+; AVX512BW-NEXT: vpshufb %ymm25, %ymm6, %ymm3
+; AVX512BW-NEXT: vpor %ymm0, %ymm3, %ymm1
; AVX512BW-NEXT: vmovdqa64 32(%r9), %xmm25
-; AVX512BW-NEXT: vpshufb %ymm6, %ymm17, %ymm4
-; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm19[0],xmm20[0],xmm19[1],xmm20[1],xmm19[2],xmm20[2],xmm19[3],xmm20[3],xmm19[4],xmm20[4],xmm19[5],xmm20[5],xmm19[6],xmm20[6],xmm19[7],xmm20[7]
-; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm16 = ymm16[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512BW-NEXT: vpshufd {{.*#+}} ymm16 = ymm16[0,2,3,3,4,6,7,7]
-; AVX512BW-NEXT: vmovdqu8 %ymm16, %ymm4 {%k3}
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm16 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
-; AVX512BW-NEXT: vpshufb %xmm16, %xmm6, %xmm6
-; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[2,3,2,3],zmm6[0,1,0,1]
-; AVX512BW-NEXT: vmovdqa64 32(%r8), %xmm17
-; AVX512BW-NEXT: vmovdqu8 %zmm27, %zmm4 {%k4}
-; AVX512BW-NEXT: vpshufb %ymm7, %ymm21, %ymm6
-; AVX512BW-NEXT: vpshufb %ymm8, %ymm18, %ymm7
-; AVX512BW-NEXT: vpor %ymm6, %ymm7, %ymm7
-; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm17[0],xmm25[0],xmm17[1],xmm25[1],xmm17[2],xmm25[2],xmm17[3],xmm25[3],xmm17[4],xmm25[4],xmm17[5],xmm25[5],xmm17[6],xmm25[6],xmm17[7],xmm25[7]
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
-; AVX512BW-NEXT: vpshufb %xmm6, %xmm8, %xmm8
-; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[2,3,2,3],zmm8[0,1,0,1]
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15,17,17,16,16,17,17,16,16,20,21,17,17,17,17,16,16]
-; AVX512BW-NEXT: vpermw %zmm0, %zmm8, %zmm8
+; AVX512BW-NEXT: vmovdqa 32(%r8), %xmm3
+; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm25[0],xmm3[1],xmm25[1],xmm3[2],xmm25[2],xmm3[3],xmm25[3],xmm3[4],xmm25[4],xmm3[5],xmm25[5],xmm3[6],xmm25[6],xmm3[7],xmm25[7]
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm24 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
+; AVX512BW-NEXT: vpshufb %xmm24, %xmm0, %xmm0
+; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[2,3,2,3],zmm0[0,1,0,1]
+; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15,17,17,16,16,17,17,16,16,20,21,17,17,17,17,16,16]
+; AVX512BW-NEXT: vpermw %zmm21, %zmm1, %zmm1
; AVX512BW-NEXT: movabsq $580999813345182728, %rax # imm = 0x810204081020408
-; AVX512BW-NEXT: kmovq %rax, %k2
-; AVX512BW-NEXT: vmovdqu8 %zmm8, %zmm7 {%k2}
+; AVX512BW-NEXT: kmovq %rax, %k3
+; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k3}
; AVX512BW-NEXT: movabsq $1016749673354069774, %rax # imm = 0xE1C3870E1C3870E
-; AVX512BW-NEXT: kmovq %rax, %k2
-; AVX512BW-NEXT: vmovdqu8 %zmm7, %zmm4 {%k2}
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u]
-; AVX512BW-NEXT: vpshufb %xmm7, %xmm20, %xmm8
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm18 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u]
-; AVX512BW-NEXT: vpshufb %xmm18, %xmm19, %xmm21
-; AVX512BW-NEXT: vporq %xmm8, %xmm21, %xmm8
-; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm20[8],xmm19[8],xmm20[9],xmm19[9],xmm20[10],xmm19[10],xmm20[11],xmm19[11],xmm20[12],xmm19[12],xmm20[13],xmm19[13],xmm20[14],xmm19[14],xmm20[15],xmm19[15]
-; AVX512BW-NEXT: vpshufb %xmm22, %xmm19, %xmm19
-; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,0,1],zmm19[0,1,0,1]
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm19 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128]
-; AVX512BW-NEXT: vpshufb %xmm19, %xmm29, %xmm20
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm21 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9]
-; AVX512BW-NEXT: vpshufb %xmm21, %xmm28, %xmm22
-; AVX512BW-NEXT: vporq %xmm20, %xmm22, %xmm20
-; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm29[8],xmm28[8],xmm29[9],xmm28[9],xmm29[10],xmm28[10],xmm29[11],xmm28[11],xmm29[12],xmm28[12],xmm29[13],xmm28[13],xmm29[14],xmm28[14],xmm29[15],xmm28[15]
-; AVX512BW-NEXT: vpshufb %xmm23, %xmm22, %xmm22
-; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm20[0,1,0,1],zmm22[0,1,0,1]
-; AVX512BW-NEXT: vmovdqu8 %zmm8, %zmm20 {%k1}
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm8 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6]
-; AVX512BW-NEXT: vpshufb %xmm8, %xmm25, %xmm22
+; AVX512BW-NEXT: kmovq %rax, %k3
+; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm18 {%k3}
+; AVX512BW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm28[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512BW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
+; AVX512BW-NEXT: vpshufb %ymm4, %ymm31, %ymm0 {%k1}
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm31[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm31[23],zero,zero,zero,zero,ymm31[26],zero,ymm31[24],zero,zero,zero,zero,ymm31[27],zero
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm28[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm28[23],zero,zero,zero,zero,ymm28[26],zero,ymm28[24],zero,zero,zero,zero,ymm28[27],zero,ymm28[25]
+; AVX512BW-NEXT: vpor %ymm1, %ymm4, %ymm1
+; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],zmm1[2,3,2,3]
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm22[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm22[18,19,20,21],zero,ymm22[19],zero,ymm22[25,26,27,22],zero,ymm22[20],zero
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm23[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm23[18],zero,zero,zero,zero,ymm23[21],zero,ymm23[19],zero,zero,zero,zero,ymm23[22],zero,ymm23[20]
+; AVX512BW-NEXT: vpor %ymm1, %ymm4, %ymm1
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm22[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm22[23],zero,ymm22[21,22,23,26],zero,ymm22[24],zero,ymm22[28,29,26,27]
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm22 = ymm23[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm23[25],zero,ymm23[23],zero,zero,zero,zero,ymm23[26],zero,ymm23[24],zero,zero,zero,zero
+; AVX512BW-NEXT: vporq %ymm4, %ymm22, %ymm4
+; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm1[2,3,2,3],zmm4[2,3,2,3]
+; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm4 {%k2}
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm6[18],zero,ymm6[20,21,20,21],zero,ymm6[19],zero,ymm6[19,20,21,22],zero
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm29[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm29[20],zero,ymm29[18],zero,zero,zero,zero,ymm29[21],zero,ymm29[19],zero,zero,zero,zero,ymm29[22]
+; AVX512BW-NEXT: vpor %ymm0, %ymm1, %ymm0
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm6[23],zero,ymm6[23,24,25,26],zero,ymm6[24],zero,ymm6[30,31]
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm6 = ymm29[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm29[25],zero,ymm29[23],zero,zero,zero,zero,ymm29[26],zero,ymm29[24],zero,zero
+; AVX512BW-NEXT: vpor %ymm1, %ymm6, %ymm1
+; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],zmm1[2,3,2,3]
+; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12]
+; AVX512BW-NEXT: vpermw %zmm21, %zmm1, %zmm1
+; AVX512BW-NEXT: movabsq $1161999626690365456, %rax # imm = 0x1020408102040810
+; AVX512BW-NEXT: kmovq %rax, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT: movabsq $2033499346708139548, %rax # imm = 0x1C3870E1C3870E1C
+; AVX512BW-NEXT: kmovq %rax, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm4 {%k1}
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u]
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm2, %xmm1
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u]
+; AVX512BW-NEXT: vpshufb %xmm6, %xmm5, %xmm22
+; AVX512BW-NEXT: vporq %xmm1, %xmm22, %xmm1
+; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15]
+; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
+; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,0,1],zmm2[0,1,0,1]
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128]
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm30, %xmm5
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm22 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9]
+; AVX512BW-NEXT: vpshufb %xmm22, %xmm27, %xmm23
+; AVX512BW-NEXT: vporq %xmm5, %xmm23, %xmm5
+; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm23 = xmm30[8],xmm27[8],xmm30[9],xmm27[9],xmm30[10],xmm27[10],xmm30[11],xmm27[11],xmm30[12],xmm27[12],xmm30[13],xmm27[13],xmm30[14],xmm27[14],xmm30[15],xmm27[15]
+; AVX512BW-NEXT: vpshufb %xmm19, %xmm23, %xmm19
+; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,0,1],zmm19[0,1,0,1]
+; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm5 {%k2}
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6]
+; AVX512BW-NEXT: vpshufb %xmm1, %xmm25, %xmm19
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm23 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128]
-; AVX512BW-NEXT: vpshufb %xmm23, %xmm17, %xmm27
-; AVX512BW-NEXT: vporq %xmm22, %xmm27, %xmm22
-; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm17[8],xmm25[8],xmm17[9],xmm25[9],xmm17[10],xmm25[10],xmm17[11],xmm25[11],xmm17[12],xmm25[12],xmm17[13],xmm25[13],xmm17[14],xmm25[14],xmm17[15],xmm25[15]
-; AVX512BW-NEXT: vpshufb %xmm26, %xmm17, %xmm17
-; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm22[0,1,0,1],zmm17[0,1,0,1]
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm22 = [2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4,4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6]
-; AVX512BW-NEXT: vpermw %zmm3, %zmm22, %zmm3
+; AVX512BW-NEXT: vpshufb %xmm23, %xmm3, %xmm27
+; AVX512BW-NEXT: vporq %xmm19, %xmm27, %xmm19
+; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm25[8],xmm3[9],xmm25[9],xmm3[10],xmm25[10],xmm3[11],xmm25[11],xmm3[12],xmm25[12],xmm3[13],xmm25[13],xmm3[14],xmm25[14],xmm3[15],xmm25[15]
+; AVX512BW-NEXT: vpshufb %xmm10, %xmm3, %xmm3
+; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm19[0,1,0,1],zmm3[0,1,0,1]
+; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4,4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6]
+; AVX512BW-NEXT: vpermw %zmm8, %zmm10, %zmm8
; AVX512BW-NEXT: movabsq $290499906672591364, %rax # imm = 0x408102040810204
; AVX512BW-NEXT: kmovq %rax, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm17 {%k1}
+; AVX512BW-NEXT: vmovdqu8 %zmm8, %zmm3 {%k1}
; AVX512BW-NEXT: movabsq $-8714997200177740921, %rax # imm = 0x870E1C3870E1C387
; AVX512BW-NEXT: kmovq %rax, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm17, %zmm20 {%k1}
-; AVX512BW-NEXT: vpshufb %xmm7, %xmm11, %xmm3
-; AVX512BW-NEXT: vpshufb %xmm18, %xmm10, %xmm7
-; AVX512BW-NEXT: vpor %xmm3, %xmm7, %xmm3
-; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3],xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7]
-; AVX512BW-NEXT: vpshufb %xmm16, %xmm7, %xmm7
-; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm7[0,1,0,1],zmm3[0,1,0,1]
-; AVX512BW-NEXT: vpshufb %xmm19, %xmm13, %xmm7
-; AVX512BW-NEXT: vpshufb %xmm21, %xmm12, %xmm10
-; AVX512BW-NEXT: vpor %xmm7, %xmm10, %xmm7
-; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3],xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
-; AVX512BW-NEXT: vpshufb %xmm24, %xmm10, %xmm10
-; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm10[0,1,0,1],zmm7[0,1,0,1]
+; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm5 {%k1}
+; AVX512BW-NEXT: vpshufb %xmm0, %xmm12, %xmm0
+; AVX512BW-NEXT: vpshufb %xmm6, %xmm11, %xmm3
+; AVX512BW-NEXT: vpor %xmm0, %xmm3, %xmm0
+; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
+; AVX512BW-NEXT: vpshufb %xmm26, %xmm3, %xmm3
+; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm3[0,1,0,1],zmm0[0,1,0,1]
+; AVX512BW-NEXT: vpshufb %xmm2, %xmm14, %xmm2
+; AVX512BW-NEXT: vpshufb %xmm22, %xmm13, %xmm3
+; AVX512BW-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3],xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7]
+; AVX512BW-NEXT: vpshufb %xmm17, %xmm3, %xmm3
+; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,0,1],zmm2[0,1,0,1]
; AVX512BW-NEXT: movabsq $871499720017774092, %rax # imm = 0xC183060C183060C
; AVX512BW-NEXT: kmovq %rax, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm7 {%k1}
-; AVX512BW-NEXT: vpshufb %xmm8, %xmm14, %xmm3
-; AVX512BW-NEXT: vpshufb %xmm23, %xmm15, %xmm8
-; AVX512BW-NEXT: vpor %xmm3, %xmm8, %xmm3
-; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7]
-; AVX512BW-NEXT: vpshufb %xmm6, %xmm8, %xmm6
-; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm6[0,1,0,1],zmm3[0,1,0,1]
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4]
-; AVX512BW-NEXT: vpermw %zmm0, %zmm6, %zmm0
+; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT: vpshufb %xmm1, %xmm15, %xmm0
+; AVX512BW-NEXT: vpshufb %xmm23, %xmm16, %xmm1
+; AVX512BW-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3],xmm16[4],xmm15[4],xmm16[5],xmm15[5],xmm16[6],xmm15[6],xmm16[7],xmm15[7]
+; AVX512BW-NEXT: vpshufb %xmm24, %xmm1, %xmm1
+; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,0,1],zmm0[0,1,0,1]
+; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4]
+; AVX512BW-NEXT: vpermw %zmm21, %zmm1, %zmm1
; AVX512BW-NEXT: movabsq $4647998506761461824, %rax # imm = 0x4081020408102040
; AVX512BW-NEXT: kmovq %rax, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm3 {%k1}
+; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: movabsq $8133997386832558192, %rax # imm = 0x70E1C3870E1C3870
; AVX512BW-NEXT: kmovq %rax, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm7 {%k1}
+; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm20, 256(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm5, 128(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm2, 320(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm4, 192(%rax)
+; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rax)
+; AVX512BW-NEXT: vmovdqa64 %zmm5, 256(%rax)
+; AVX512BW-NEXT: vmovdqa64 %zmm4, 128(%rax)
+; AVX512BW-NEXT: vmovdqa64 %zmm20, 320(%rax)
+; AVX512BW-NEXT: vmovdqa64 %zmm18, 192(%rax)
; AVX512BW-NEXT: vmovdqa64 %zmm9, 64(%rax)
-; AVX512BW-NEXT: vmovdqa64 %zmm1, 384(%rax)
+; AVX512BW-NEXT: vmovdqa64 %zmm7, 384(%rax)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BW-FCP-LABEL: store_i8_stride7_vf64:
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-FCP-NEXT: vmovdqa 32(%rcx), %ymm1
-; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128]
-; AVX512BW-FCP-NEXT: # ymm3 = mem[0,1,0,1]
-; AVX512BW-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm0
-; AVX512BW-FCP-NEXT: vmovdqa 32(%rdx), %ymm11
-; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128]
-; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm11, %ymm2
-; AVX512BW-FCP-NEXT: vpor %ymm0, %ymm2, %ymm0
-; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27]
-; AVX512BW-FCP-NEXT: vpshufb %ymm8, %ymm11, %ymm2
-; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128]
-; AVX512BW-FCP-NEXT: # ymm10 = mem[0,1,0,1]
-; AVX512BW-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm5
-; AVX512BW-FCP-NEXT: vpor %ymm2, %ymm5, %ymm2
-; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[2,3,2,3],zmm0[2,3,2,3]
-; AVX512BW-FCP-NEXT: vmovdqa64 32(%rdi), %ymm16
-; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29]
-; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm16, %ymm0
-; AVX512BW-FCP-NEXT: vmovdqa64 32(%rsi), %ymm17
-; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128]
-; AVX512BW-FCP-NEXT: # ymm9 = mem[0,1,0,1]
-; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm17, %ymm5
-; AVX512BW-FCP-NEXT: vpor %ymm0, %ymm5, %ymm0
-; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128]
-; AVX512BW-FCP-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm17, %ymm5
-; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25]
-; AVX512BW-FCP-NEXT: vpshufb %ymm20, %ymm16, %ymm6
-; AVX512BW-FCP-NEXT: vpor %ymm5, %ymm6, %ymm5
-; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm5[2,3,2,3],zmm0[2,3,2,3]
-; AVX512BW-FCP-NEXT: movabsq $1742999440035548184, %r10 # imm = 0x183060C183060C18
-; AVX512BW-FCP-NEXT: kmovq %r10, %k1
-; AVX512BW-FCP-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
-; AVX512BW-FCP-NEXT: vmovdqa64 32(%r9), %ymm18
-; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128]
-; AVX512BW-FCP-NEXT: # ymm5 = mem[0,1,0,1]
-; AVX512BW-FCP-NEXT: vpshufb %ymm5, %ymm18, %ymm2
-; AVX512BW-FCP-NEXT: vmovdqa64 32(%r8), %ymm22
-; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29]
-; AVX512BW-FCP-NEXT: vpshufb %ymm6, %ymm22, %ymm12
-; AVX512BW-FCP-NEXT: vpor %ymm2, %ymm12, %ymm2
-; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31]
-; AVX512BW-FCP-NEXT: vpshufb %ymm12, %ymm22, %ymm14
-; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128]
-; AVX512BW-FCP-NEXT: # ymm13 = mem[0,1,0,1]
-; AVX512BW-FCP-NEXT: vpshufb %ymm13, %ymm18, %ymm15
-; AVX512BW-FCP-NEXT: vpor %ymm14, %ymm15, %ymm14
-; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm14[2,3,2,3],zmm2[2,3,2,3]
-; AVX512BW-FCP-NEXT: movabsq $6971997760142192736, %r10 # imm = 0x60C183060C183060
-; AVX512BW-FCP-NEXT: kmovq %r10, %k1
-; AVX512BW-FCP-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
-; AVX512BW-FCP-NEXT: vmovdqa 32(%rax), %ymm2
-; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm14 = [11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12,14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15]
-; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm14, %zmm14
-; AVX512BW-FCP-NEXT: movabsq $-9150747060186627967, %r10 # imm = 0x8102040810204081
-; AVX512BW-FCP-NEXT: kmovq %r10, %k2
-; AVX512BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm0 {%k2}
-; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31]
-; AVX512BW-FCP-NEXT: vpshufb %ymm14, %ymm11, %ymm15
-; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm21 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128]
-; AVX512BW-FCP-NEXT: vpshufb %ymm21, %ymm1, %ymm23
-; AVX512BW-FCP-NEXT: vporq %ymm15, %ymm23, %ymm15
-; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm24 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128]
-; AVX512BW-FCP-NEXT: vpshufb %ymm24, %ymm11, %ymm11
-; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm25 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20]
-; AVX512BW-FCP-NEXT: # ymm25 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vpshufb %ymm25, %ymm1, %ymm1
-; AVX512BW-FCP-NEXT: vpor %ymm1, %ymm11, %ymm1
-; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm15, %zmm23
-; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128]
-; AVX512BW-FCP-NEXT: vpshufb %ymm11, %ymm16, %ymm1
-; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18]
-; AVX512BW-FCP-NEXT: vpshufb %ymm15, %ymm17, %ymm26
-; AVX512BW-FCP-NEXT: vporq %ymm1, %ymm26, %ymm1
-; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm26 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23]
-; AVX512BW-FCP-NEXT: vpshufb %ymm26, %ymm16, %ymm16
-; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128]
-; AVX512BW-FCP-NEXT: # ymm27 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vpshufb %ymm27, %ymm17, %ymm17
-; AVX512BW-FCP-NEXT: vporq %ymm16, %ymm17, %ymm16
-; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm16 = ymm16[2,3,2,3]
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm1, %zmm1
+; AVX512BW-FCP-NEXT: vmovdqa64 32(%rdx), %ymm18
+; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31]
+; AVX512BW-FCP-NEXT: vpshufb %ymm10, %ymm18, %ymm0
+; AVX512BW-FCP-NEXT: vmovdqa64 32(%rcx), %ymm26
+; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128]
+; AVX512BW-FCP-NEXT: vpshufb %ymm11, %ymm26, %ymm1
+; AVX512BW-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm18[18,19,20,21],zero,ymm18[19],zero,ymm18[25,26,27,22],zero,ymm18[20],zero
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm26[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm26[18],zero,zero,zero,zero,ymm26[21],zero,ymm26[19],zero,zero,zero,zero,ymm26[22],zero,ymm26[20]
+; AVX512BW-FCP-NEXT: vpor %ymm3, %ymm4, %ymm3
+; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3]
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3
+; AVX512BW-FCP-NEXT: vmovdqa64 32(%rdi), %ymm28
+; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128]
+; AVX512BW-FCP-NEXT: vpshufb %ymm8, %ymm28, %ymm0
+; AVX512BW-FCP-NEXT: vmovdqa64 32(%rsi), %ymm29
+; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18]
+; AVX512BW-FCP-NEXT: vpshufb %ymm12, %ymm29, %ymm4
+; AVX512BW-FCP-NEXT: vpor %ymm0, %ymm4, %ymm0
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm28[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,ymm28[19],zero,ymm28[21,20,21,22],zero,ymm28[20],zero,ymm28[22,23]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm29[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm29[21],zero,ymm29[19],zero,zero,zero,zero,ymm29[22],zero,ymm29[20],zero,zero
+; AVX512BW-FCP-NEXT: vpor %ymm4, %ymm6, %ymm4
+; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3]
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm25
; AVX512BW-FCP-NEXT: movabsq $3485998880071096368, %r10 # imm = 0x3060C183060C1830
-; AVX512BW-FCP-NEXT: kmovq %r10, %k2
-; AVX512BW-FCP-NEXT: vmovdqu8 %zmm23, %zmm1 {%k2}
-; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128]
-; AVX512BW-FCP-NEXT: vpshufb %ymm16, %ymm18, %ymm23
-; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128]
-; AVX512BW-FCP-NEXT: vpshufb %ymm17, %ymm22, %ymm28
-; AVX512BW-FCP-NEXT: vporq %ymm23, %ymm28, %ymm23
-; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm28 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128]
-; AVX512BW-FCP-NEXT: vpshufb %ymm28, %ymm22, %ymm22
-; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm29 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22]
-; AVX512BW-FCP-NEXT: # ymm29 = mem[0,1,2,3,0,1,2,3]
-; AVX512BW-FCP-NEXT: vpshufb %ymm29, %ymm18, %ymm18
-; AVX512BW-FCP-NEXT: vporq %ymm22, %ymm18, %ymm18
-; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm18 = ymm18[2,3,2,3]
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm18, %zmm23, %zmm18
-; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm30 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12]
-; AVX512BW-FCP-NEXT: vpermw %ymm2, %ymm30, %ymm22
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm23 = ymm2[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31]
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm22, %zmm23, %zmm22
+; AVX512BW-FCP-NEXT: kmovq %r10, %k1
+; AVX512BW-FCP-NEXT: vmovdqu8 %zmm3, %zmm25 {%k1}
+; AVX512BW-FCP-NEXT: vmovdqa64 32(%r9), %ymm30
+; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128]
+; AVX512BW-FCP-NEXT: vpshufb %ymm14, %ymm30, %ymm3
+; AVX512BW-FCP-NEXT: vmovdqa64 32(%r8), %ymm31
+; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128]
+; AVX512BW-FCP-NEXT: vpshufb %ymm15, %ymm31, %ymm4
+; AVX512BW-FCP-NEXT: vpor %ymm3, %ymm4, %ymm3
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm31[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm31[18],zero,ymm31[20,21,20,21],zero,ymm31[19],zero,ymm31[19,20,21,22],zero
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm30[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm30[20],zero,ymm30[18],zero,zero,zero,zero,ymm30[21],zero,ymm30[19],zero,zero,zero,zero,ymm30[22]
+; AVX512BW-FCP-NEXT: vpor %ymm9, %ymm13, %ymm9
+; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3]
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm3, %zmm9
+; AVX512BW-FCP-NEXT: vmovdqa64 32(%rax), %ymm16
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12]
+; AVX512BW-FCP-NEXT: vpermw %ymm16, %ymm0, %ymm13
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm17 = ymm16[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31]
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm17, %zmm13
; AVX512BW-FCP-NEXT: movabsq $145249953336295682, %r10 # imm = 0x204081020408102
-; AVX512BW-FCP-NEXT: kmovq %r10, %k3
-; AVX512BW-FCP-NEXT: vmovdqu8 %zmm22, %zmm18 {%k3}
+; AVX512BW-FCP-NEXT: kmovq %r10, %k2
+; AVX512BW-FCP-NEXT: vmovdqu8 %zmm13, %zmm9 {%k2}
; AVX512BW-FCP-NEXT: movabsq $-4357498600088870461, %r10 # imm = 0xC3870E1C3870E1C3
-; AVX512BW-FCP-NEXT: kmovq %r10, %k3
-; AVX512BW-FCP-NEXT: vmovdqu8 %zmm18, %zmm1 {%k3}
-; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %ymm18
-; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm18, %ymm19
-; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %ymm22
-; AVX512BW-FCP-NEXT: vpshufb %ymm20, %ymm22, %ymm20
-; AVX512BW-FCP-NEXT: vporq %ymm19, %ymm20, %ymm19
-; AVX512BW-FCP-NEXT: vpshufb %ymm26, %ymm22, %ymm20
-; AVX512BW-FCP-NEXT: vpshufb %ymm27, %ymm18, %ymm23
-; AVX512BW-FCP-NEXT: vporq %ymm20, %ymm23, %ymm20
-; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm20[2,3,2,3],zmm19[2,3,2,3]
-; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %ymm19
-; AVX512BW-FCP-NEXT: vpshufb %ymm8, %ymm19, %ymm8
-; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %ymm23
-; AVX512BW-FCP-NEXT: vpshufb %ymm10, %ymm23, %ymm10
-; AVX512BW-FCP-NEXT: vpor %ymm8, %ymm10, %ymm8
-; AVX512BW-FCP-NEXT: vmovdqa64 (%rax), %zmm10
-; AVX512BW-FCP-NEXT: vpshufb %ymm24, %ymm19, %ymm24
-; AVX512BW-FCP-NEXT: vpshufb %ymm25, %ymm23, %ymm25
-; AVX512BW-FCP-NEXT: vporq %ymm24, %ymm25, %ymm24
-; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm24[2,3,2,3],zmm8[2,3,2,3]
-; AVX512BW-FCP-NEXT: vmovdqu8 %zmm20, %zmm8 {%k1}
-; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %ymm20
-; AVX512BW-FCP-NEXT: vpshufb %ymm12, %ymm20, %ymm12
-; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %ymm24
-; AVX512BW-FCP-NEXT: vpshufb %ymm13, %ymm24, %ymm13
-; AVX512BW-FCP-NEXT: vpor %ymm12, %ymm13, %ymm12
-; AVX512BW-FCP-NEXT: vpshufb %ymm28, %ymm20, %ymm13
-; AVX512BW-FCP-NEXT: vpshufb %ymm29, %ymm24, %ymm25
-; AVX512BW-FCP-NEXT: vporq %ymm13, %ymm25, %ymm13
-; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm13[2,3,2,3],zmm12[2,3,2,3]
-; AVX512BW-FCP-NEXT: vpermw %zmm10, %zmm30, %zmm13
-; AVX512BW-FCP-NEXT: movabsq $1161999626690365456, %rax # imm = 0x1020408102040810
-; AVX512BW-FCP-NEXT: kmovq %rax, %k3
-; AVX512BW-FCP-NEXT: vmovdqu8 %zmm13, %zmm12 {%k3}
-; AVX512BW-FCP-NEXT: movabsq $2033499346708139548, %rax # imm = 0x1C3870E1C3870E1C
-; AVX512BW-FCP-NEXT: kmovq %rax, %k3
-; AVX512BW-FCP-NEXT: vmovdqu8 %zmm12, %zmm8 {%k3}
-; AVX512BW-FCP-NEXT: vpshufb %ymm14, %ymm19, %ymm12
-; AVX512BW-FCP-NEXT: vpshufb %ymm21, %ymm23, %ymm13
-; AVX512BW-FCP-NEXT: vpor %ymm12, %ymm13, %ymm14
-; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm12
-; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %xmm13
-; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm25 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15]
-; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm21 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
-; AVX512BW-FCP-NEXT: vpshufb %xmm21, %xmm25, %xmm25
-; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm25 = ymm25[0,1,0,1]
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm25, %zmm26
-; AVX512BW-FCP-NEXT: vpshufb %ymm11, %ymm22, %ymm11
-; AVX512BW-FCP-NEXT: vpshufb %ymm15, %ymm18, %ymm14
-; AVX512BW-FCP-NEXT: vpor %ymm11, %ymm14, %ymm11
-; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm14
-; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %xmm15
-; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm27 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15]
-; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm25 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
-; AVX512BW-FCP-NEXT: vpshufb %xmm25, %xmm27, %xmm27
-; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm27 = ymm27[0,1,0,1]
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm27, %zmm11
-; AVX512BW-FCP-NEXT: movabsq $435749860008887046, %rax # imm = 0x60C183060C18306
-; AVX512BW-FCP-NEXT: kmovq %rax, %k3
-; AVX512BW-FCP-NEXT: vmovdqu8 %zmm26, %zmm11 {%k3}
-; AVX512BW-FCP-NEXT: vpshufb %ymm16, %ymm24, %ymm16
-; AVX512BW-FCP-NEXT: vpshufb %ymm17, %ymm20, %ymm17
-; AVX512BW-FCP-NEXT: vporq %ymm16, %ymm17, %ymm27
-; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %xmm16
-; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %xmm17
-; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm28 = xmm17[8],xmm16[8],xmm17[9],xmm16[9],xmm17[10],xmm16[10],xmm17[11],xmm16[11],xmm17[12],xmm16[12],xmm17[13],xmm16[13],xmm17[14],xmm16[14],xmm17[15],xmm16[15]
-; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm26 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
-; AVX512BW-FCP-NEXT: vpshufb %xmm26, %xmm28, %xmm28
-; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm28 = ymm28[0,1,0,1]
-; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm27, %zmm28, %zmm27
-; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm28 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6,6,1,6,0,7,1,0,7,14,9,14,8,15,9,8,15]
-; AVX512BW-FCP-NEXT: vpermw %zmm10, %zmm28, %zmm28
+; AVX512BW-FCP-NEXT: kmovq %r10, %k2
+; AVX512BW-FCP-NEXT: vmovdqu8 %zmm9, %zmm25 {%k2}
+; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %ymm9
+; AVX512BW-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm13
+; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %ymm10
+; AVX512BW-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm11
+; AVX512BW-FCP-NEXT: vpor %ymm13, %ymm11, %ymm11
+; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %xmm17
+; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %xmm19
+; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm19[8],xmm17[8],xmm19[9],xmm17[9],xmm19[10],xmm17[10],xmm19[11],xmm17[11],xmm19[12],xmm17[12],xmm19[13],xmm17[13],xmm19[14],xmm17[14],xmm19[15],xmm17[15]
+; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm27 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
+; AVX512BW-FCP-NEXT: vpshufb %xmm27, %xmm13, %xmm13
+; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1]
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm22
+; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm11
+; AVX512BW-FCP-NEXT: vpshufb %ymm8, %ymm11, %ymm8
+; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %ymm13
+; AVX512BW-FCP-NEXT: vpshufb %ymm12, %ymm13, %ymm12
+; AVX512BW-FCP-NEXT: vpor %ymm8, %ymm12, %ymm8
+; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %xmm20
+; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %xmm21
+; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm21[8],xmm20[8],xmm21[9],xmm20[9],xmm21[10],xmm20[10],xmm21[11],xmm20[11],xmm21[12],xmm20[12],xmm21[13],xmm20[13],xmm21[14],xmm20[14],xmm21[15],xmm20[15]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
+; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1]
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm12, %zmm8
+; AVX512BW-FCP-NEXT: movabsq $435749860008887046, %r10 # imm = 0x60C183060C18306
+; AVX512BW-FCP-NEXT: kmovq %r10, %k2
+; AVX512BW-FCP-NEXT: vmovdqu8 %zmm22, %zmm8 {%k2}
+; AVX512BW-FCP-NEXT: vmovdqa (%r9), %ymm12
+; AVX512BW-FCP-NEXT: vpshufb %ymm14, %ymm12, %ymm22
+; AVX512BW-FCP-NEXT: vmovdqa (%r8), %ymm14
+; AVX512BW-FCP-NEXT: vpshufb %ymm15, %ymm14, %ymm15
+; AVX512BW-FCP-NEXT: vporq %ymm22, %ymm15, %ymm15
+; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %xmm22
+; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %xmm23
+; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm23[8],xmm22[8],xmm23[9],xmm22[9],xmm23[10],xmm22[10],xmm23[11],xmm22[11],xmm23[12],xmm22[12],xmm23[13],xmm22[13],xmm23[14],xmm22[14],xmm23[15],xmm22[15]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
+; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm0
+; AVX512BW-FCP-NEXT: vmovdqa64 (%rax), %zmm15
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6,6,1,6,0,7,1,0,7,14,9,14,8,15,9,8,15]
+; AVX512BW-FCP-NEXT: vpermw %zmm15, %zmm2, %zmm2
; AVX512BW-FCP-NEXT: movabsq $2323999253380730912, %rax # imm = 0x2040810204081020
-; AVX512BW-FCP-NEXT: kmovq %rax, %k3
-; AVX512BW-FCP-NEXT: vmovdqu8 %zmm28, %zmm27 {%k3}
-; AVX512BW-FCP-NEXT: vmovdqa64 32(%rdx), %xmm28
+; AVX512BW-FCP-NEXT: kmovq %rax, %k2
+; AVX512BW-FCP-NEXT: vmovdqu8 %zmm2, %zmm0 {%k2}
; AVX512BW-FCP-NEXT: movabsq $4066998693416279096, %rax # imm = 0x3870E1C3870E1C38
+; AVX512BW-FCP-NEXT: kmovq %rax, %k2
+; AVX512BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm8 {%k2}
+; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128]
+; AVX512BW-FCP-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm26, %ymm2
+; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128]
+; AVX512BW-FCP-NEXT: # ymm0 = mem[0,1,0,1]
+; AVX512BW-FCP-NEXT: vpshufb %ymm0, %ymm18, %ymm3
+; AVX512BW-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm18[8,9],zero,ymm18[7],zero,ymm18[5,6,7,10],zero,ymm18[8],zero,ymm18[12,13,10,11,24,25],zero,ymm18[23],zero,ymm18[21,22,23,26],zero,ymm18[24],zero,ymm18[28,29,26,27]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm18 = zero,zero,ymm26[9],zero,ymm26[7],zero,zero,zero,zero,ymm26[10],zero,ymm26[8],zero,zero,zero,zero,zero,zero,ymm26[25],zero,ymm26[23],zero,zero,zero,zero,ymm26[26],zero,ymm26[24],zero,zero,zero,zero
+; AVX512BW-FCP-NEXT: vporq %ymm3, %ymm18, %ymm3
+; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[2,3,2,3],zmm2[2,3,2,3]
+; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29]
+; AVX512BW-FCP-NEXT: # ymm3 = mem[0,1,0,1]
+; AVX512BW-FCP-NEXT: vpshufb %ymm3, %ymm28, %ymm18
+; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm26 = [128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128]
+; AVX512BW-FCP-NEXT: # ymm26 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vpshufb %ymm26, %ymm29, %ymm4
+; AVX512BW-FCP-NEXT: vporq %ymm18, %ymm4, %ymm4
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm18 = ymm29[9],zero,ymm29[7],zero,zero,zero,zero,ymm29[10],zero,ymm29[8],zero,zero,zero,zero,ymm29[11],zero,ymm29[25],zero,ymm29[23],zero,zero,zero,zero,ymm29[26],zero,ymm29[24],zero,zero,zero,zero,ymm29[27],zero
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm28 = zero,ymm28[7],zero,zero,zero,zero,ymm28[10],zero,ymm28[8],zero,zero,zero,zero,ymm28[11],zero,ymm28[9],zero,ymm28[23],zero,zero,zero,zero,ymm28[26],zero,ymm28[24],zero,zero,zero,zero,ymm28[27],zero,ymm28[25]
+; AVX512BW-FCP-NEXT: vporq %ymm18, %ymm28, %ymm18
+; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm18[2,3,2,3],zmm4[2,3,2,3]
+; AVX512BW-FCP-NEXT: movabsq $1742999440035548184, %rax # imm = 0x183060C183060C18
+; AVX512BW-FCP-NEXT: kmovq %rax, %k2
+; AVX512BW-FCP-NEXT: vmovdqu8 %zmm2, %zmm18 {%k2}
+; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128]
+; AVX512BW-FCP-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX512BW-FCP-NEXT: vpshufb %ymm5, %ymm30, %ymm4
+; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm29 = [128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29]
+; AVX512BW-FCP-NEXT: # ymm29 = mem[0,1,2,3,0,1,2,3]
+; AVX512BW-FCP-NEXT: vpshufb %ymm29, %ymm31, %ymm28
+; AVX512BW-FCP-NEXT: vporq %ymm4, %ymm28, %ymm4
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm28 = ymm31[8,9,8,9],zero,ymm31[7],zero,ymm31[7,8,9,10],zero,ymm31[8],zero,ymm31[14,15,24,25,24,25],zero,ymm31[23],zero,ymm31[23,24,25,26],zero,ymm31[24],zero,ymm31[30,31]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm30 = zero,zero,zero,zero,ymm30[9],zero,ymm30[7],zero,zero,zero,zero,ymm30[10],zero,ymm30[8],zero,zero,zero,zero,zero,zero,ymm30[25],zero,ymm30[23],zero,zero,zero,zero,ymm30[26],zero,ymm30[24],zero,zero
+; AVX512BW-FCP-NEXT: vporq %ymm28, %ymm30, %ymm28
+; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm28[2,3,2,3],zmm4[2,3,2,3]
+; AVX512BW-FCP-NEXT: movabsq $6971997760142192736, %rax # imm = 0x60C183060C183060
+; AVX512BW-FCP-NEXT: kmovq %rax, %k2
+; AVX512BW-FCP-NEXT: vmovdqu8 %zmm4, %zmm18 {%k2}
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12,14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15]
+; AVX512BW-FCP-NEXT: vpermw %zmm16, %zmm4, %zmm4
+; AVX512BW-FCP-NEXT: movabsq $-9150747060186627967, %rax # imm = 0x8102040810204081
; AVX512BW-FCP-NEXT: kmovq %rax, %k3
-; AVX512BW-FCP-NEXT: vmovdqu8 %zmm27, %zmm11 {%k3}
-; AVX512BW-FCP-NEXT: vmovdqa64 32(%rdi), %xmm27
-; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm22, %ymm7
-; AVX512BW-FCP-NEXT: vmovdqa64 32(%rsi), %xmm22
-; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm18, %ymm9
-; AVX512BW-FCP-NEXT: vpor %ymm7, %ymm9, %ymm9
-; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm18 = xmm27[0],xmm22[0],xmm27[1],xmm22[1],xmm27[2],xmm22[2],xmm27[3],xmm22[3],xmm27[4],xmm22[4],xmm27[5],xmm22[5],xmm27[6],xmm22[6],xmm27[7],xmm22[7]
-; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
-; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm18, %xmm18
-; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm9[2,3,2,3],zmm18[0,1,0,1]
-; AVX512BW-FCP-NEXT: vmovdqa64 32(%rcx), %xmm29
-; AVX512BW-FCP-NEXT: vpshufb %ymm3, %ymm23, %ymm3
-; AVX512BW-FCP-NEXT: vmovdqa64 32(%r9), %xmm23
-; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm19, %ymm4
+; AVX512BW-FCP-NEXT: vmovdqu8 %zmm4, %zmm18 {%k3}
+; AVX512BW-FCP-NEXT: vpshufb %ymm3, %ymm11, %ymm3
+; AVX512BW-FCP-NEXT: vpshufb %ymm26, %ymm13, %ymm4
; AVX512BW-FCP-NEXT: vpor %ymm3, %ymm4, %ymm3
-; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm28[0],xmm29[0],xmm28[1],xmm29[1],xmm28[2],xmm29[2],xmm28[3],xmm29[3],xmm28[4],xmm29[4],xmm28[5],xmm29[5],xmm28[6],xmm29[6],xmm28[7],xmm29[7]
-; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
-; AVX512BW-FCP-NEXT: vpshufb %xmm9, %xmm4, %xmm4
-; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[2,3,2,3],zmm4[0,1,0,1]
-; AVX512BW-FCP-NEXT: vmovdqa64 32(%r8), %xmm19
-; AVX512BW-FCP-NEXT: vmovdqu8 %zmm18, %zmm3 {%k2}
-; AVX512BW-FCP-NEXT: vpshufb %ymm5, %ymm24, %ymm4
-; AVX512BW-FCP-NEXT: vpshufb %ymm6, %ymm20, %ymm5
-; AVX512BW-FCP-NEXT: vpor %ymm4, %ymm5, %ymm5
-; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm19[0],xmm23[0],xmm19[1],xmm23[1],xmm19[2],xmm23[2],xmm19[3],xmm23[3],xmm19[4],xmm23[4],xmm19[5],xmm23[5],xmm19[6],xmm23[6],xmm19[7],xmm23[7]
-; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm6
-; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[2,3,2,3],zmm6[0,1,0,1]
-; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15,17,17,16,16,17,17,16,16,20,21,17,17,17,17,16,16]
-; AVX512BW-FCP-NEXT: vpermw %zmm10, %zmm6, %zmm6
+; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm4
+; AVX512BW-FCP-NEXT: vmovdqa64 32(%rsi), %xmm31
+; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm26 = xmm4[0],xmm31[0],xmm4[1],xmm31[1],xmm4[2],xmm31[2],xmm4[3],xmm31[3],xmm4[4],xmm31[4],xmm4[5],xmm31[5],xmm4[6],xmm31[6],xmm4[7],xmm31[7]
+; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm28 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
+; AVX512BW-FCP-NEXT: vpshufb %xmm28, %xmm26, %xmm26
+; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[2,3,2,3],zmm26[0,1,0,1]
+; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm10, %ymm1
+; AVX512BW-FCP-NEXT: vpshufb %ymm0, %ymm9, %ymm0
+; AVX512BW-FCP-NEXT: vpor %ymm1, %ymm0, %ymm2
+; AVX512BW-FCP-NEXT: vmovdqa 32(%rdx), %xmm1
+; AVX512BW-FCP-NEXT: vmovdqa64 32(%rcx), %xmm26
+; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm26[0],xmm1[1],xmm26[1],xmm1[2],xmm26[2],xmm1[3],xmm26[3],xmm1[4],xmm26[4],xmm1[5],xmm26[5],xmm1[6],xmm26[6],xmm1[7],xmm26[7]
+; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm30 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
+; AVX512BW-FCP-NEXT: vpshufb %xmm30, %xmm0, %xmm0
+; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm2[2,3,2,3],zmm0[0,1,0,1]
+; AVX512BW-FCP-NEXT: vmovdqu8 %zmm3, %zmm6 {%k1}
+; AVX512BW-FCP-NEXT: vpshufb %ymm5, %ymm12, %ymm2
+; AVX512BW-FCP-NEXT: vpshufb %ymm29, %ymm14, %ymm3
+; AVX512BW-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2
+; AVX512BW-FCP-NEXT: vmovdqa 32(%r9), %xmm3
+; AVX512BW-FCP-NEXT: vmovdqa 32(%r8), %xmm5
+; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
+; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm29 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb %xmm29, %xmm0, %xmm0
+; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[2,3,2,3],zmm0[0,1,0,1]
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15,17,17,16,16,17,17,16,16,20,21,17,17,17,17,16,16]
+; AVX512BW-FCP-NEXT: vpermw %zmm15, %zmm2, %zmm2
; AVX512BW-FCP-NEXT: movabsq $580999813345182728, %rax # imm = 0x810204081020408
-; AVX512BW-FCP-NEXT: kmovq %rax, %k2
-; AVX512BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm5 {%k2}
+; AVX512BW-FCP-NEXT: kmovq %rax, %k1
+; AVX512BW-FCP-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
; AVX512BW-FCP-NEXT: movabsq $1016749673354069774, %rax # imm = 0xE1C3870E1C3870E
-; AVX512BW-FCP-NEXT: kmovq %rax, %k2
-; AVX512BW-FCP-NEXT: vmovdqu8 %zmm5, %zmm3 {%k2}
-; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u]
-; AVX512BW-FCP-NEXT: vpshufb %xmm6, %xmm29, %xmm5
-; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm18 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u]
-; AVX512BW-FCP-NEXT: vpshufb %xmm18, %xmm28, %xmm20
-; AVX512BW-FCP-NEXT: vporq %xmm5, %xmm20, %xmm5
-; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm29[8],xmm28[8],xmm29[9],xmm28[9],xmm29[10],xmm28[10],xmm29[11],xmm28[11],xmm29[12],xmm28[12],xmm29[13],xmm28[13],xmm29[14],xmm28[14],xmm29[15],xmm28[15]
-; AVX512BW-FCP-NEXT: vpshufb %xmm21, %xmm20, %xmm20
-; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm5[0,1,0,1],zmm20[0,1,0,1]
-; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm21 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128]
-; AVX512BW-FCP-NEXT: vpshufb %xmm21, %xmm22, %xmm5
-; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm24 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9]
-; AVX512BW-FCP-NEXT: vpshufb %xmm24, %xmm27, %xmm28
-; AVX512BW-FCP-NEXT: vporq %xmm5, %xmm28, %xmm5
-; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm22[8],xmm27[8],xmm22[9],xmm27[9],xmm22[10],xmm27[10],xmm22[11],xmm27[11],xmm22[12],xmm27[12],xmm22[13],xmm27[13],xmm22[14],xmm27[14],xmm22[15],xmm27[15]
-; AVX512BW-FCP-NEXT: vpshufb %xmm25, %xmm22, %xmm22
-; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,0,1],zmm22[0,1,0,1]
-; AVX512BW-FCP-NEXT: vmovdqu8 %zmm20, %zmm5 {%k1}
-; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm20 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6]
-; AVX512BW-FCP-NEXT: vpshufb %xmm20, %xmm23, %xmm22
-; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm25 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128]
-; AVX512BW-FCP-NEXT: vpshufb %xmm25, %xmm19, %xmm27
-; AVX512BW-FCP-NEXT: vporq %xmm22, %xmm27, %xmm22
-; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm19[8],xmm23[8],xmm19[9],xmm23[9],xmm19[10],xmm23[10],xmm19[11],xmm23[11],xmm19[12],xmm23[12],xmm19[13],xmm23[13],xmm19[14],xmm23[14],xmm19[15],xmm23[15]
-; AVX512BW-FCP-NEXT: vpshufb %xmm26, %xmm19, %xmm19
-; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm22[0,1,0,1],zmm19[0,1,0,1]
-; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm22 = [2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4,4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6]
-; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm22, %zmm2
+; AVX512BW-FCP-NEXT: kmovq %rax, %k1
+; AVX512BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm6 {%k1}
+; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u]
+; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm26, %xmm2
+; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u]
+; AVX512BW-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm24
+; AVX512BW-FCP-NEXT: vporq %xmm2, %xmm24, %xmm2
+; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm26[8],xmm1[8],xmm26[9],xmm1[9],xmm26[10],xmm1[10],xmm26[11],xmm1[11],xmm26[12],xmm1[12],xmm26[13],xmm1[13],xmm26[14],xmm1[14],xmm26[15],xmm1[15]
+; AVX512BW-FCP-NEXT: vpshufb %xmm27, %xmm1, %xmm1
+; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,0,1],zmm1[0,1,0,1]
+; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128]
+; AVX512BW-FCP-NEXT: vpshufb %xmm2, %xmm31, %xmm24
+; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm26 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9]
+; AVX512BW-FCP-NEXT: vpshufb %xmm26, %xmm4, %xmm27
+; AVX512BW-FCP-NEXT: vporq %xmm24, %xmm27, %xmm24
+; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm31[8],xmm4[8],xmm31[9],xmm4[9],xmm31[10],xmm4[10],xmm31[11],xmm4[11],xmm31[12],xmm4[12],xmm31[13],xmm4[13],xmm31[14],xmm4[14],xmm31[15],xmm4[15]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
+; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm24[0,1,0,1],zmm4[0,1,0,1]
+; AVX512BW-FCP-NEXT: vmovdqu8 %zmm1, %zmm24 {%k2}
+; AVX512BW-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6]
+; AVX512BW-FCP-NEXT: vpshufb %xmm1, %xmm3, %xmm4
+; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm27 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128]
+; AVX512BW-FCP-NEXT: vpshufb %xmm27, %xmm5, %xmm31
+; AVX512BW-FCP-NEXT: vporq %xmm4, %xmm31, %xmm4
+; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
+; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm4[0,1,0,1],zmm3[0,1,0,1]
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4,4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6]
+; AVX512BW-FCP-NEXT: vpermw %zmm16, %zmm4, %zmm4
; AVX512BW-FCP-NEXT: movabsq $290499906672591364, %rax # imm = 0x408102040810204
; AVX512BW-FCP-NEXT: kmovq %rax, %k1
-; AVX512BW-FCP-NEXT: vmovdqu8 %zmm2, %zmm19 {%k1}
+; AVX512BW-FCP-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1}
; AVX512BW-FCP-NEXT: movabsq $-8714997200177740921, %rax # imm = 0x870E1C3870E1C387
; AVX512BW-FCP-NEXT: kmovq %rax, %k1
-; AVX512BW-FCP-NEXT: vmovdqu8 %zmm19, %zmm5 {%k1}
-; AVX512BW-FCP-NEXT: vpshufb %xmm6, %xmm13, %xmm2
-; AVX512BW-FCP-NEXT: vpshufb %xmm18, %xmm12, %xmm6
-; AVX512BW-FCP-NEXT: vpor %xmm2, %xmm6, %xmm2
-; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3],xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
-; AVX512BW-FCP-NEXT: vpshufb %xmm9, %xmm6, %xmm6
-; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm6[0,1,0,1],zmm2[0,1,0,1]
-; AVX512BW-FCP-NEXT: vpshufb %xmm21, %xmm15, %xmm6
-; AVX512BW-FCP-NEXT: vpshufb %xmm24, %xmm14, %xmm9
-; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm9, %xmm6
-; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3],xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7]
-; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm9, %xmm7
-; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm7[0,1,0,1],zmm6[0,1,0,1]
+; AVX512BW-FCP-NEXT: vmovdqu8 %zmm3, %zmm24 {%k1}
+; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm19, %xmm3
+; AVX512BW-FCP-NEXT: vpshufb %xmm0, %xmm17, %xmm0
+; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm17[0],xmm19[0],xmm17[1],xmm19[1],xmm17[2],xmm19[2],xmm17[3],xmm19[3],xmm17[4],xmm19[4],xmm17[5],xmm19[5],xmm17[6],xmm19[6],xmm17[7],xmm19[7]
+; AVX512BW-FCP-NEXT: vpshufb %xmm30, %xmm3, %xmm3
+; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm3[0,1,0,1],zmm0[0,1,0,1]
+; AVX512BW-FCP-NEXT: vpshufb %xmm2, %xmm21, %xmm2
+; AVX512BW-FCP-NEXT: vpshufb %xmm26, %xmm20, %xmm3
+; AVX512BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm20[0],xmm21[0],xmm20[1],xmm21[1],xmm20[2],xmm21[2],xmm20[3],xmm21[3],xmm20[4],xmm21[4],xmm20[5],xmm21[5],xmm20[6],xmm21[6],xmm20[7],xmm21[7]
+; AVX512BW-FCP-NEXT: vpshufb %xmm28, %xmm3, %xmm3
+; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,0,1],zmm2[0,1,0,1]
; AVX512BW-FCP-NEXT: movabsq $871499720017774092, %rax # imm = 0xC183060C183060C
; AVX512BW-FCP-NEXT: kmovq %rax, %k1
-; AVX512BW-FCP-NEXT: vmovdqu8 %zmm2, %zmm6 {%k1}
-; AVX512BW-FCP-NEXT: vpshufb %xmm20, %xmm16, %xmm2
-; AVX512BW-FCP-NEXT: vpshufb %xmm25, %xmm17, %xmm7
-; AVX512BW-FCP-NEXT: vpor %xmm2, %xmm7, %xmm2
-; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm17[0],xmm16[0],xmm17[1],xmm16[1],xmm17[2],xmm16[2],xmm17[3],xmm16[3],xmm17[4],xmm16[4],xmm17[5],xmm16[5],xmm17[6],xmm16[6],xmm17[7],xmm16[7]
-; AVX512BW-FCP-NEXT: vpshufb %xmm4, %xmm7, %xmm4
-; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm4[0,1,0,1],zmm2[0,1,0,1]
-; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4]
-; AVX512BW-FCP-NEXT: vpermw %zmm10, %zmm4, %zmm4
+; AVX512BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm2 {%k1}
+; AVX512BW-FCP-NEXT: vpshufb %xmm1, %xmm22, %xmm0
+; AVX512BW-FCP-NEXT: vpshufb %xmm27, %xmm23, %xmm1
+; AVX512BW-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3],xmm23[4],xmm22[4],xmm23[5],xmm22[5],xmm23[6],xmm22[6],xmm23[7],xmm22[7]
+; AVX512BW-FCP-NEXT: vpshufb %xmm29, %xmm1, %xmm1
+; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,0,1],zmm0[0,1,0,1]
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4]
+; AVX512BW-FCP-NEXT: vpermw %zmm15, %zmm1, %zmm1
; AVX512BW-FCP-NEXT: movabsq $4647998506761461824, %rax # imm = 0x4081020408102040
; AVX512BW-FCP-NEXT: kmovq %rax, %k1
-; AVX512BW-FCP-NEXT: vmovdqu8 %zmm4, %zmm2 {%k1}
+; AVX512BW-FCP-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512BW-FCP-NEXT: movabsq $8133997386832558192, %rax # imm = 0x70E1C3870E1C3870
; AVX512BW-FCP-NEXT: kmovq %rax, %k1
-; AVX512BW-FCP-NEXT: vmovdqu8 %zmm2, %zmm6 {%k1}
+; AVX512BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm2 {%k1}
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,ymm11[19],zero,ymm11[21,20,21,22],zero,ymm11[20],zero,ymm11[22,23]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm13[21],zero,ymm13[19],zero,zero,zero,zero,ymm13[22],zero,ymm13[20],zero,zero
+; AVX512BW-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm13[23],zero,zero,zero,zero,ymm13[26],zero,ymm13[24],zero,zero,zero,zero,ymm13[27],zero
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm11[23],zero,zero,zero,zero,ymm11[26],zero,ymm11[24],zero,zero,zero,zero,ymm11[27],zero,ymm11[25]
+; AVX512BW-FCP-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],zmm1[2,3,2,3]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm9[18,19,20,21],zero,ymm9[19],zero,ymm9[25,26,27,22],zero,ymm9[20],zero
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm10[18],zero,zero,zero,zero,ymm10[21],zero,ymm10[19],zero,zero,zero,zero,ymm10[22],zero,ymm10[20]
+; AVX512BW-FCP-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm9[23],zero,ymm9[21,22,23,26],zero,ymm9[24],zero,ymm9[28,29,26,27]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm10[25],zero,ymm10[23],zero,zero,zero,zero,ymm10[26],zero,ymm10[24],zero,zero,zero,zero
+; AVX512BW-FCP-NEXT: vpor %ymm3, %ymm4, %ymm3
+; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[2,3,2,3],zmm3[2,3,2,3]
+; AVX512BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm1 {%k2}
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm14[18],zero,ymm14[20,21,20,21],zero,ymm14[19],zero,ymm14[19,20,21,22],zero
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm12[20],zero,ymm12[18],zero,zero,zero,zero,ymm12[21],zero,ymm12[19],zero,zero,zero,zero,ymm12[22]
+; AVX512BW-FCP-NEXT: vpor %ymm0, %ymm3, %ymm0
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm14[23],zero,ymm14[23,24,25,26],zero,ymm14[24],zero,ymm14[30,31]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm12[25],zero,ymm12[23],zero,zero,zero,zero,ymm12[26],zero,ymm12[24],zero,zero
+; AVX512BW-FCP-NEXT: vpor %ymm3, %ymm4, %ymm3
+; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],zmm3[2,3,2,3]
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12]
+; AVX512BW-FCP-NEXT: vpermw %zmm15, %zmm3, %zmm3
+; AVX512BW-FCP-NEXT: movabsq $1161999626690365456, %rax # imm = 0x1020408102040810
+; AVX512BW-FCP-NEXT: kmovq %rax, %k1
+; AVX512BW-FCP-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
+; AVX512BW-FCP-NEXT: movabsq $2033499346708139548, %rax # imm = 0x1C3870E1C3870E1C
+; AVX512BW-FCP-NEXT: kmovq %rax, %k1
+; AVX512BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 128(%rax)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, (%rax)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 320(%rax)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 256(%rax)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 192(%rax)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 384(%rax)
-; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 64(%rax)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 128(%rax)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 320(%rax)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, 256(%rax)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 192(%rax)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 384(%rax)
+; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 64(%rax)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: store_i8_stride7_vf64:
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-BW-NEXT: vmovdqa64 (%rax), %zmm0
-; AVX512DQ-BW-NEXT: vmovdqa 32(%rsi), %ymm2
-; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm14
-; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm14[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14]
-; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,3,3,6,6,7,7]
-; AVX512DQ-BW-NEXT: vpbroadcastd {{.*#+}} ymm4 = [13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14]
-; AVX512DQ-BW-NEXT: movl $338170920, %r10d # imm = 0x14281428
-; AVX512DQ-BW-NEXT: kmovd %r10d, %k2
-; AVX512DQ-BW-NEXT: vpshufb %ymm4, %ymm2, %ymm1 {%k2}
-; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128]
-; AVX512DQ-BW-NEXT: # ymm9 = mem[0,1,0,1]
-; AVX512DQ-BW-NEXT: vpshufb %ymm9, %ymm2, %ymm3
-; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25]
-; AVX512DQ-BW-NEXT: vpshufb %ymm10, %ymm14, %ymm5
-; AVX512DQ-BW-NEXT: vpor %ymm3, %ymm5, %ymm3
-; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm3[2,3,2,3],zmm1[2,3,2,3]
-; AVX512DQ-BW-NEXT: vmovdqa 32(%rdx), %ymm15
-; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27]
-; AVX512DQ-BW-NEXT: vpshufb %ymm5, %ymm15, %ymm3
-; AVX512DQ-BW-NEXT: vmovdqa64 32(%rcx), %ymm17
-; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128]
-; AVX512DQ-BW-NEXT: # ymm11 = mem[0,1,0,1]
-; AVX512DQ-BW-NEXT: vpshufb %ymm11, %ymm17, %ymm6
-; AVX512DQ-BW-NEXT: vpor %ymm3, %ymm6, %ymm3
-; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm6 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12]
-; AVX512DQ-BW-NEXT: vpshufb %ymm6, %ymm17, %ymm7
-; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm15[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,2,3,3,4,6,7,7]
+; AVX512DQ-BW-NEXT: vmovdqa64 (%rax), %zmm21
+; AVX512DQ-BW-NEXT: vmovdqa 32(%rdx), %ymm0
+; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31]
+; AVX512DQ-BW-NEXT: vpshufb %ymm9, %ymm0, %ymm2
+; AVX512DQ-BW-NEXT: vmovdqa 32(%rcx), %ymm1
+; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128]
+; AVX512DQ-BW-NEXT: vpshufb %ymm11, %ymm1, %ymm3
+; AVX512DQ-BW-NEXT: vpor %ymm2, %ymm3, %ymm2
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm0[18,19,20,21],zero,ymm0[19],zero,ymm0[25,26,27,22],zero,ymm0[20],zero
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[18],zero,zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20]
+; AVX512DQ-BW-NEXT: vpor %ymm3, %ymm4, %ymm3
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3]
+; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
+; AVX512DQ-BW-NEXT: vmovdqa 32(%rsi), %ymm3
+; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm5
+; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm5[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm6 = ymm4[0,0,1,1,4,4,5,5]
+; AVX512DQ-BW-NEXT: vpbroadcastd {{.*#+}} ymm4 = [5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6]
; AVX512DQ-BW-NEXT: movl $676341840, %r10d # imm = 0x28502850
-; AVX512DQ-BW-NEXT: kmovd %r10d, %k3
-; AVX512DQ-BW-NEXT: vmovdqu8 %ymm8, %ymm7 {%k3}
-; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[2,3,2,3],zmm7[2,3,2,3]
-; AVX512DQ-BW-NEXT: movabsq $1742999440035548184, %r10 # imm = 0x183060C183060C18
-; AVX512DQ-BW-NEXT: kmovq %r10, %k1
-; AVX512DQ-BW-NEXT: vmovdqu8 %zmm3, %zmm1 {%k1}
-; AVX512DQ-BW-NEXT: vmovdqa64 32(%r9), %ymm16
-; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128]
-; AVX512DQ-BW-NEXT: # ymm7 = mem[0,1,0,1]
-; AVX512DQ-BW-NEXT: vpshufb %ymm7, %ymm16, %ymm3
-; AVX512DQ-BW-NEXT: vmovdqa64 32(%r8), %ymm18
-; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29]
-; AVX512DQ-BW-NEXT: vpshufb %ymm8, %ymm18, %ymm12
-; AVX512DQ-BW-NEXT: vpor %ymm3, %ymm12, %ymm3
-; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31]
-; AVX512DQ-BW-NEXT: vpshufb %ymm12, %ymm18, %ymm19
-; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128]
-; AVX512DQ-BW-NEXT: # ymm13 = mem[0,1,0,1]
-; AVX512DQ-BW-NEXT: vpshufb %ymm13, %ymm16, %ymm20
-; AVX512DQ-BW-NEXT: vporq %ymm19, %ymm20, %ymm19
-; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm19[2,3,2,3],zmm3[2,3,2,3]
-; AVX512DQ-BW-NEXT: movabsq $6971997760142192736, %r10 # imm = 0x60C183060C183060
-; AVX512DQ-BW-NEXT: kmovq %r10, %k1
-; AVX512DQ-BW-NEXT: vmovdqu8 %zmm3, %zmm1 {%k1}
-; AVX512DQ-BW-NEXT: vmovdqa 32(%rax), %ymm3
-; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm19 = [11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12,14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15]
-; AVX512DQ-BW-NEXT: vpermw %zmm3, %zmm19, %zmm19
-; AVX512DQ-BW-NEXT: movabsq $-9150747060186627967, %rax # imm = 0x8102040810204081
-; AVX512DQ-BW-NEXT: kmovq %rax, %k4
-; AVX512DQ-BW-NEXT: vmovdqu8 %zmm19, %zmm1 {%k4}
-; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm19 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31]
-; AVX512DQ-BW-NEXT: vpshufb %ymm19, %ymm15, %ymm21
-; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm20 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128]
-; AVX512DQ-BW-NEXT: vpshufb %ymm20, %ymm17, %ymm22
-; AVX512DQ-BW-NEXT: vporq %ymm21, %ymm22, %ymm21
-; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm22 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128]
-; AVX512DQ-BW-NEXT: vpshufb %ymm22, %ymm15, %ymm15
-; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm25 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20]
-; AVX512DQ-BW-NEXT: # ymm25 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-NEXT: vpshufb %ymm25, %ymm17, %ymm17
-; AVX512DQ-BW-NEXT: vporq %ymm15, %ymm17, %ymm15
-; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3]
-; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm15, %zmm21, %zmm15
-; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} ymm17 = ymm14[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm17 = ymm17[0,0,1,1,4,4,5,5]
-; AVX512DQ-BW-NEXT: vpbroadcastd {{.*#+}} ymm21 = [5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6,5,4,3,6]
-; AVX512DQ-BW-NEXT: vpshufb %ymm21, %ymm2, %ymm17 {%k3}
+; AVX512DQ-BW-NEXT: kmovd %r10d, %k1
+; AVX512DQ-BW-NEXT: vpshufb %ymm4, %ymm3, %ymm6 {%k1}
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3]
+; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128]
+; AVX512DQ-BW-NEXT: vpshufb %ymm13, %ymm5, %ymm7
+; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm14 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18]
+; AVX512DQ-BW-NEXT: vpshufb %ymm14, %ymm3, %ymm10
+; AVX512DQ-BW-NEXT: vpor %ymm7, %ymm10, %ymm7
+; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm20
+; AVX512DQ-BW-NEXT: movabsq $3485998880071096368, %r10 # imm = 0x3060C183060C1830
+; AVX512DQ-BW-NEXT: kmovq %r10, %k3
+; AVX512DQ-BW-NEXT: vmovdqu8 %zmm2, %zmm20 {%k3}
+; AVX512DQ-BW-NEXT: vmovdqa 32(%r9), %ymm2
+; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm15 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128]
+; AVX512DQ-BW-NEXT: vpshufb %ymm15, %ymm2, %ymm6
+; AVX512DQ-BW-NEXT: vmovdqa 32(%r8), %ymm12
+; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128]
+; AVX512DQ-BW-NEXT: vpshufb %ymm16, %ymm12, %ymm10
+; AVX512DQ-BW-NEXT: vpor %ymm6, %ymm10, %ymm6
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm17 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm12[18],zero,ymm12[20,21,20,21],zero,ymm12[19],zero,ymm12[19,20,21,22],zero
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm18 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[20],zero,ymm2[18],zero,zero,zero,zero,ymm2[21],zero,ymm2[19],zero,zero,zero,zero,ymm2[22]
+; AVX512DQ-BW-NEXT: vporq %ymm17, %ymm18, %ymm17
; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm17 = ymm17[2,3,2,3]
-; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm23 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128]
-; AVX512DQ-BW-NEXT: vpshufb %ymm23, %ymm14, %ymm14
-; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm24 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18]
-; AVX512DQ-BW-NEXT: vpshufb %ymm24, %ymm2, %ymm2
-; AVX512DQ-BW-NEXT: vpor %ymm2, %ymm14, %ymm2
-; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm17, %zmm2, %zmm2
-; AVX512DQ-BW-NEXT: movabsq $3485998880071096368, %rax # imm = 0x3060C183060C1830
-; AVX512DQ-BW-NEXT: kmovq %rax, %k4
-; AVX512DQ-BW-NEXT: vmovdqu8 %zmm15, %zmm2 {%k4}
-; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm14 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128]
-; AVX512DQ-BW-NEXT: vpshufb %ymm14, %ymm16, %ymm17
-; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128]
-; AVX512DQ-BW-NEXT: vpshufb %ymm15, %ymm18, %ymm26
-; AVX512DQ-BW-NEXT: vporq %ymm17, %ymm26, %ymm17
-; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} ymm26 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128]
-; AVX512DQ-BW-NEXT: vpshufb %ymm26, %ymm18, %ymm18
-; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm28 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22]
-; AVX512DQ-BW-NEXT: # ymm28 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-NEXT: vpshufb %ymm28, %ymm16, %ymm16
-; AVX512DQ-BW-NEXT: vporq %ymm18, %ymm16, %ymm16
-; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm16 = ymm16[2,3,2,3]
-; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm16, %zmm17, %zmm16
-; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm29 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12]
-; AVX512DQ-BW-NEXT: vpermw %ymm3, %ymm29, %ymm17
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm18 = ymm3[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31]
-; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm17, %zmm18, %zmm17
+; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm17, %zmm6, %zmm17
+; AVX512DQ-BW-NEXT: vmovdqa 32(%rax), %ymm8
+; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12]
+; AVX512DQ-BW-NEXT: vpermw %ymm8, %ymm6, %ymm18
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm19 = ymm8[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31]
+; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm18, %zmm19, %zmm18
; AVX512DQ-BW-NEXT: movabsq $145249953336295682, %rax # imm = 0x204081020408102
-; AVX512DQ-BW-NEXT: kmovq %rax, %k5
-; AVX512DQ-BW-NEXT: vmovdqu8 %zmm17, %zmm16 {%k5}
+; AVX512DQ-BW-NEXT: kmovq %rax, %k2
+; AVX512DQ-BW-NEXT: vmovdqu8 %zmm18, %zmm17 {%k2}
; AVX512DQ-BW-NEXT: movabsq $-4357498600088870461, %rax # imm = 0xC3870E1C3870E1C3
+; AVX512DQ-BW-NEXT: kmovq %rax, %k2
+; AVX512DQ-BW-NEXT: vmovdqu8 %zmm17, %zmm20 {%k2}
+; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} ymm17 = ymm5[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14]
+; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm17 = ymm17[2,2,3,3,6,6,7,7]
+; AVX512DQ-BW-NEXT: vpbroadcastd {{.*#+}} ymm18 = [13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14,13,12,15,14]
+; AVX512DQ-BW-NEXT: movl $338170920, %eax # imm = 0x14281428
+; AVX512DQ-BW-NEXT: kmovd %eax, %k4
+; AVX512DQ-BW-NEXT: vpshufb %ymm18, %ymm3, %ymm17 {%k4}
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[9],zero,ymm3[7],zero,zero,zero,zero,ymm3[10],zero,ymm3[8],zero,zero,zero,zero,ymm3[11],zero,ymm3[25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27],zero
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 = zero,ymm5[7],zero,zero,zero,zero,ymm5[10],zero,ymm5[8],zero,zero,zero,zero,ymm5[11],zero,ymm5[9],zero,ymm5[23],zero,zero,zero,zero,ymm5[26],zero,ymm5[24],zero,zero,zero,zero,ymm5[27],zero,ymm5[25]
+; AVX512DQ-BW-NEXT: vpor %ymm3, %ymm5, %ymm3
+; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm3[2,3,2,3],zmm17[2,3,2,3]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm0[8,9],zero,ymm0[7],zero,ymm0[5,6,7,10],zero,ymm0[8],zero,ymm0[12,13,10,11,24,25],zero,ymm0[23],zero,ymm0[21,22,23,26],zero,ymm0[24],zero,ymm0[28,29,26,27]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm17 = zero,zero,ymm1[9],zero,ymm1[7],zero,zero,zero,zero,ymm1[10],zero,ymm1[8],zero,zero,zero,zero,zero,zero,ymm1[25],zero,ymm1[23],zero,zero,zero,zero,ymm1[26],zero,ymm1[24],zero,zero,zero,zero
+; AVX512DQ-BW-NEXT: vporq %ymm3, %ymm17, %ymm3
+; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm26 = [11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12,11,0,0,0,15,14,13,12]
+; AVX512DQ-BW-NEXT: vpshufb %ymm26, %ymm1, %ymm1
+; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,3,3,4,6,7,7]
+; AVX512DQ-BW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
+; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm3[2,3,2,3],zmm1[2,3,2,3]
+; AVX512DQ-BW-NEXT: movabsq $1742999440035548184, %rax # imm = 0x183060C183060C18
+; AVX512DQ-BW-NEXT: kmovq %rax, %k2
+; AVX512DQ-BW-NEXT: vmovdqu8 %zmm0, %zmm7 {%k2}
+; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm24 = [29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128]
+; AVX512DQ-BW-NEXT: # ymm24 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-NEXT: vpshufb %ymm24, %ymm2, %ymm0
+; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm25 = [128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29]
+; AVX512DQ-BW-NEXT: # ymm25 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-NEXT: vpshufb %ymm25, %ymm12, %ymm1
+; AVX512DQ-BW-NEXT: vpor %ymm0, %ymm1, %ymm0
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[8,9,8,9],zero,ymm12[7],zero,ymm12[7,8,9,10],zero,ymm12[8],zero,ymm12[14,15,24,25,24,25],zero,ymm12[23],zero,ymm12[23,24,25,26],zero,ymm12[24],zero,ymm12[30,31]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm2[9],zero,ymm2[7],zero,zero,zero,zero,ymm2[10],zero,ymm2[8],zero,zero,zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero
+; AVX512DQ-BW-NEXT: vpor %ymm1, %ymm2, %ymm1
+; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[2,3,2,3],zmm0[2,3,2,3]
+; AVX512DQ-BW-NEXT: movabsq $6971997760142192736, %rax # imm = 0x60C183060C183060
+; AVX512DQ-BW-NEXT: kmovq %rax, %k2
+; AVX512DQ-BW-NEXT: vmovdqu8 %zmm0, %zmm7 {%k2}
+; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm0 = [11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12,14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15]
+; AVX512DQ-BW-NEXT: vpermw %zmm8, %zmm0, %zmm0
+; AVX512DQ-BW-NEXT: movabsq $-9150747060186627967, %rax # imm = 0x8102040810204081
; AVX512DQ-BW-NEXT: kmovq %rax, %k5
-; AVX512DQ-BW-NEXT: vmovdqu8 %zmm16, %zmm2 {%k5}
-; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %ymm16
-; AVX512DQ-BW-NEXT: vpshufb %ymm5, %ymm16, %ymm5
-; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %ymm17
-; AVX512DQ-BW-NEXT: vpshufb %ymm11, %ymm17, %ymm11
-; AVX512DQ-BW-NEXT: vpor %ymm5, %ymm11, %ymm5
-; AVX512DQ-BW-NEXT: vpshufb %ymm22, %ymm16, %ymm11
-; AVX512DQ-BW-NEXT: vpshufb %ymm25, %ymm17, %ymm18
-; AVX512DQ-BW-NEXT: vporq %ymm11, %ymm18, %ymm11
-; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm11[2,3,2,3],zmm5[2,3,2,3]
-; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %ymm25
-; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %ymm27
-; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} ymm11 = ymm27[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
-; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[0,0,1,1,4,4,5,5]
-; AVX512DQ-BW-NEXT: vpshufb %ymm21, %ymm25, %ymm11 {%k3}
-; AVX512DQ-BW-NEXT: vpshufb %ymm9, %ymm25, %ymm9
-; AVX512DQ-BW-NEXT: vpshufb %ymm10, %ymm27, %ymm10
-; AVX512DQ-BW-NEXT: vpor %ymm9, %ymm10, %ymm9
-; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm11[2,3,2,3],zmm9[2,3,2,3]
-; AVX512DQ-BW-NEXT: vmovdqu8 %zmm9, %zmm5 {%k1}
-; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %ymm18
-; AVX512DQ-BW-NEXT: vpshufb %ymm12, %ymm18, %ymm9
-; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %ymm21
-; AVX512DQ-BW-NEXT: vpshufb %ymm13, %ymm21, %ymm10
-; AVX512DQ-BW-NEXT: vpor %ymm9, %ymm10, %ymm9
-; AVX512DQ-BW-NEXT: vpshufb %ymm26, %ymm18, %ymm10
-; AVX512DQ-BW-NEXT: vpshufb %ymm28, %ymm21, %ymm11
-; AVX512DQ-BW-NEXT: vpor %ymm10, %ymm11, %ymm10
-; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[2,3,2,3],zmm9[2,3,2,3]
-; AVX512DQ-BW-NEXT: vpermw %zmm0, %zmm29, %zmm10
-; AVX512DQ-BW-NEXT: movabsq $1161999626690365456, %rax # imm = 0x1020408102040810
-; AVX512DQ-BW-NEXT: kmovq %rax, %k5
-; AVX512DQ-BW-NEXT: vmovdqu8 %zmm10, %zmm9 {%k5}
-; AVX512DQ-BW-NEXT: movabsq $2033499346708139548, %rax # imm = 0x1C3870E1C3870E1C
-; AVX512DQ-BW-NEXT: kmovq %rax, %k5
-; AVX512DQ-BW-NEXT: vmovdqu8 %zmm9, %zmm5 {%k5}
-; AVX512DQ-BW-NEXT: vpshufb %ymm19, %ymm16, %ymm9
-; AVX512DQ-BW-NEXT: vpshufb %ymm20, %ymm17, %ymm10
-; AVX512DQ-BW-NEXT: vpor %ymm9, %ymm10, %ymm9
-; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm10
-; AVX512DQ-BW-NEXT: vmovdqa64 32(%rdx), %xmm19
-; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %xmm11
-; AVX512DQ-BW-NEXT: vmovdqa64 32(%rcx), %xmm20
-; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15]
-; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm22 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
-; AVX512DQ-BW-NEXT: vpshufb %xmm22, %xmm12, %xmm12
-; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1]
-; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm9, %zmm12, %zmm26
-; AVX512DQ-BW-NEXT: vpshufb %ymm23, %ymm27, %ymm9
-; AVX512DQ-BW-NEXT: vpshufb %ymm24, %ymm25, %ymm12
-; AVX512DQ-BW-NEXT: vpor %ymm9, %ymm12, %ymm9
-; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm12
-; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm13
-; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm24 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15]
-; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm23 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
-; AVX512DQ-BW-NEXT: vpshufb %xmm23, %xmm24, %xmm24
-; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm24 = ymm24[0,1,0,1]
-; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm9, %zmm24, %zmm9
+; AVX512DQ-BW-NEXT: vmovdqu8 %zmm0, %zmm7 {%k5}
+; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %ymm22
+; AVX512DQ-BW-NEXT: vpshufb %ymm9, %ymm22, %ymm0
+; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %ymm23
+; AVX512DQ-BW-NEXT: vpshufb %ymm11, %ymm23, %ymm1
+; AVX512DQ-BW-NEXT: vpor %ymm0, %ymm1, %ymm0
+; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm11
+; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %xmm12
+; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
+; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %ymm28
+; AVX512DQ-BW-NEXT: vpshufb %ymm13, %ymm28, %ymm1
+; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %ymm31
+; AVX512DQ-BW-NEXT: vpshufb %ymm14, %ymm31, %ymm2
+; AVX512DQ-BW-NEXT: vpor %ymm1, %ymm2, %ymm1
+; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm13
+; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm14
+; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm14[8],xmm13[8],xmm14[9],xmm13[9],xmm14[10],xmm13[10],xmm14[11],xmm13[11],xmm14[12],xmm13[12],xmm14[13],xmm13[13],xmm14[14],xmm13[14],xmm14[15],xmm13[15]
+; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm19 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
+; AVX512DQ-BW-NEXT: vpshufb %xmm19, %xmm2, %xmm2
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1]
+; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm9
; AVX512DQ-BW-NEXT: movabsq $435749860008887046, %rax # imm = 0x60C183060C18306
; AVX512DQ-BW-NEXT: kmovq %rax, %k5
-; AVX512DQ-BW-NEXT: vmovdqu8 %zmm26, %zmm9 {%k5}
-; AVX512DQ-BW-NEXT: vpshufb %ymm14, %ymm21, %ymm14
-; AVX512DQ-BW-NEXT: vpshufb %ymm15, %ymm18, %ymm15
-; AVX512DQ-BW-NEXT: vporq %ymm14, %ymm15, %ymm24
-; AVX512DQ-BW-NEXT: vmovdqa (%r9), %xmm14
-; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm15
-; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm28 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15]
-; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm26 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
-; AVX512DQ-BW-NEXT: vpshufb %xmm26, %xmm28, %xmm28
-; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm28 = ymm28[0,1,0,1]
-; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm24, %zmm28, %zmm24
-; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm28 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6,6,1,6,0,7,1,0,7,14,9,14,8,15,9,8,15]
-; AVX512DQ-BW-NEXT: vpermw %zmm0, %zmm28, %zmm28
+; AVX512DQ-BW-NEXT: vmovdqu8 %zmm0, %zmm9 {%k5}
+; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %ymm29
+; AVX512DQ-BW-NEXT: vpshufb %ymm15, %ymm29, %ymm0
+; AVX512DQ-BW-NEXT: vmovdqa (%r8), %ymm6
+; AVX512DQ-BW-NEXT: vpshufb %ymm16, %ymm6, %ymm1
+; AVX512DQ-BW-NEXT: vpor %ymm0, %ymm1, %ymm0
+; AVX512DQ-BW-NEXT: vmovdqa (%r9), %xmm15
+; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %xmm16
+; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm16[8],xmm15[8],xmm16[9],xmm15[9],xmm16[10],xmm15[10],xmm16[11],xmm15[11],xmm16[12],xmm15[12],xmm16[13],xmm15[13],xmm16[14],xmm15[14],xmm16[15],xmm15[15]
+; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm10 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
+; AVX512DQ-BW-NEXT: vpshufb %xmm10, %xmm1, %xmm1
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
+; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6,6,1,6,0,7,1,0,7,14,9,14,8,15,9,8,15]
+; AVX512DQ-BW-NEXT: vpermw %zmm21, %zmm1, %zmm1
; AVX512DQ-BW-NEXT: movabsq $2323999253380730912, %rax # imm = 0x2040810204081020
; AVX512DQ-BW-NEXT: kmovq %rax, %k5
-; AVX512DQ-BW-NEXT: vmovdqu8 %zmm28, %zmm24 {%k5}
-; AVX512DQ-BW-NEXT: vmovdqa64 32(%rdi), %xmm28
+; AVX512DQ-BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k5}
+; AVX512DQ-BW-NEXT: vmovdqa64 32(%rdi), %xmm27
; AVX512DQ-BW-NEXT: movabsq $4066998693416279096, %rax # imm = 0x3870E1C3870E1C38
; AVX512DQ-BW-NEXT: kmovq %rax, %k5
-; AVX512DQ-BW-NEXT: vmovdqu8 %zmm24, %zmm9 {%k5}
-; AVX512DQ-BW-NEXT: vmovdqa64 32(%rsi), %xmm29
-; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} ymm24 = ymm27[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14]
-; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm27 = ymm24[2,2,3,3,6,6,7,7]
-; AVX512DQ-BW-NEXT: vpshufb %ymm4, %ymm25, %ymm27 {%k2}
-; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm28[0],xmm29[0],xmm28[1],xmm29[1],xmm28[2],xmm29[2],xmm28[3],xmm29[3],xmm28[4],xmm29[4],xmm28[5],xmm29[5],xmm28[6],xmm29[6],xmm28[7],xmm29[7]
-; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm24 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
-; AVX512DQ-BW-NEXT: vpshufb %xmm24, %xmm4, %xmm4
-; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm27 = zmm27[2,3,2,3],zmm4[0,1,0,1]
+; AVX512DQ-BW-NEXT: vmovdqu8 %zmm0, %zmm9 {%k5}
+; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm28[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14]
+; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7]
+; AVX512DQ-BW-NEXT: vpshufb %ymm18, %ymm31, %ymm0 {%k4}
+; AVX512DQ-BW-NEXT: vmovdqa64 32(%rsi), %xmm30
+; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm27[0],xmm30[0],xmm27[1],xmm30[1],xmm27[2],xmm30[2],xmm27[3],xmm30[3],xmm27[4],xmm30[4],xmm27[5],xmm30[5],xmm27[6],xmm30[6],xmm27[7],xmm30[7]
+; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm17 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
+; AVX512DQ-BW-NEXT: vpshufb %xmm17, %xmm1, %xmm1
+; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm0[2,3,2,3],zmm1[0,1,0,1]
+; AVX512DQ-BW-NEXT: vpshufb %ymm26, %ymm23, %ymm18
+; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm22[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
+; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,3,3,4,6,7,7]
+; AVX512DQ-BW-NEXT: vmovdqu8 %ymm0, %ymm18 {%k1}
+; AVX512DQ-BW-NEXT: vmovdqa 32(%rdx), %xmm5
+; AVX512DQ-BW-NEXT: vmovdqa 32(%rcx), %xmm2
+; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
+; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm26 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
+; AVX512DQ-BW-NEXT: vpshufb %xmm26, %xmm0, %xmm0
+; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm18[2,3,2,3],zmm0[0,1,0,1]
+; AVX512DQ-BW-NEXT: vmovdqu8 %zmm3, %zmm18 {%k3}
+; AVX512DQ-BW-NEXT: vpshufb %ymm24, %ymm29, %ymm0
+; AVX512DQ-BW-NEXT: vpshufb %ymm25, %ymm6, %ymm3
+; AVX512DQ-BW-NEXT: vpor %ymm0, %ymm3, %ymm1
; AVX512DQ-BW-NEXT: vmovdqa64 32(%r9), %xmm25
-; AVX512DQ-BW-NEXT: vpshufb %ymm6, %ymm17, %ymm4
-; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm19[0],xmm20[0],xmm19[1],xmm20[1],xmm19[2],xmm20[2],xmm19[3],xmm20[3],xmm19[4],xmm20[4],xmm19[5],xmm20[5],xmm19[6],xmm20[6],xmm19[7],xmm20[7]
-; AVX512DQ-BW-NEXT: vpshufhw {{.*#+}} ymm16 = ymm16[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15]
-; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm16 = ymm16[0,2,3,3,4,6,7,7]
-; AVX512DQ-BW-NEXT: vmovdqu8 %ymm16, %ymm4 {%k3}
-; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm16 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
-; AVX512DQ-BW-NEXT: vpshufb %xmm16, %xmm6, %xmm6
-; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[2,3,2,3],zmm6[0,1,0,1]
-; AVX512DQ-BW-NEXT: vmovdqa64 32(%r8), %xmm17
-; AVX512DQ-BW-NEXT: vmovdqu8 %zmm27, %zmm4 {%k4}
-; AVX512DQ-BW-NEXT: vpshufb %ymm7, %ymm21, %ymm6
-; AVX512DQ-BW-NEXT: vpshufb %ymm8, %ymm18, %ymm7
-; AVX512DQ-BW-NEXT: vpor %ymm6, %ymm7, %ymm7
-; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm17[0],xmm25[0],xmm17[1],xmm25[1],xmm17[2],xmm25[2],xmm17[3],xmm25[3],xmm17[4],xmm25[4],xmm17[5],xmm25[5],xmm17[6],xmm25[6],xmm17[7],xmm25[7]
-; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
-; AVX512DQ-BW-NEXT: vpshufb %xmm6, %xmm8, %xmm8
-; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm7[2,3,2,3],zmm8[0,1,0,1]
-; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15,17,17,16,16,17,17,16,16,20,21,17,17,17,17,16,16]
-; AVX512DQ-BW-NEXT: vpermw %zmm0, %zmm8, %zmm8
+; AVX512DQ-BW-NEXT: vmovdqa 32(%r8), %xmm3
+; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm25[0],xmm3[1],xmm25[1],xmm3[2],xmm25[2],xmm3[3],xmm25[3],xmm3[4],xmm25[4],xmm3[5],xmm25[5],xmm3[6],xmm25[6],xmm3[7],xmm25[7]
+; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm24 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
+; AVX512DQ-BW-NEXT: vpshufb %xmm24, %xmm0, %xmm0
+; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[2,3,2,3],zmm0[0,1,0,1]
+; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15,17,17,16,16,17,17,16,16,20,21,17,17,17,17,16,16]
+; AVX512DQ-BW-NEXT: vpermw %zmm21, %zmm1, %zmm1
; AVX512DQ-BW-NEXT: movabsq $580999813345182728, %rax # imm = 0x810204081020408
-; AVX512DQ-BW-NEXT: kmovq %rax, %k2
-; AVX512DQ-BW-NEXT: vmovdqu8 %zmm8, %zmm7 {%k2}
+; AVX512DQ-BW-NEXT: kmovq %rax, %k3
+; AVX512DQ-BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k3}
; AVX512DQ-BW-NEXT: movabsq $1016749673354069774, %rax # imm = 0xE1C3870E1C3870E
-; AVX512DQ-BW-NEXT: kmovq %rax, %k2
-; AVX512DQ-BW-NEXT: vmovdqu8 %zmm7, %zmm4 {%k2}
-; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u]
-; AVX512DQ-BW-NEXT: vpshufb %xmm7, %xmm20, %xmm8
-; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm18 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u]
-; AVX512DQ-BW-NEXT: vpshufb %xmm18, %xmm19, %xmm21
-; AVX512DQ-BW-NEXT: vporq %xmm8, %xmm21, %xmm8
-; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm20[8],xmm19[8],xmm20[9],xmm19[9],xmm20[10],xmm19[10],xmm20[11],xmm19[11],xmm20[12],xmm19[12],xmm20[13],xmm19[13],xmm20[14],xmm19[14],xmm20[15],xmm19[15]
-; AVX512DQ-BW-NEXT: vpshufb %xmm22, %xmm19, %xmm19
-; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,0,1],zmm19[0,1,0,1]
-; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm19 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128]
-; AVX512DQ-BW-NEXT: vpshufb %xmm19, %xmm29, %xmm20
-; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm21 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9]
-; AVX512DQ-BW-NEXT: vpshufb %xmm21, %xmm28, %xmm22
-; AVX512DQ-BW-NEXT: vporq %xmm20, %xmm22, %xmm20
-; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm29[8],xmm28[8],xmm29[9],xmm28[9],xmm29[10],xmm28[10],xmm29[11],xmm28[11],xmm29[12],xmm28[12],xmm29[13],xmm28[13],xmm29[14],xmm28[14],xmm29[15],xmm28[15]
-; AVX512DQ-BW-NEXT: vpshufb %xmm23, %xmm22, %xmm22
-; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm20[0,1,0,1],zmm22[0,1,0,1]
-; AVX512DQ-BW-NEXT: vmovdqu8 %zmm8, %zmm20 {%k1}
-; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm8 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6]
-; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm25, %xmm22
+; AVX512DQ-BW-NEXT: kmovq %rax, %k3
+; AVX512DQ-BW-NEXT: vmovdqu8 %zmm0, %zmm18 {%k3}
+; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm28[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
+; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
+; AVX512DQ-BW-NEXT: vpshufb %ymm4, %ymm31, %ymm0 {%k1}
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm31[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm31[23],zero,zero,zero,zero,ymm31[26],zero,ymm31[24],zero,zero,zero,zero,ymm31[27],zero
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm28[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm28[23],zero,zero,zero,zero,ymm28[26],zero,ymm28[24],zero,zero,zero,zero,ymm28[27],zero,ymm28[25]
+; AVX512DQ-BW-NEXT: vpor %ymm1, %ymm4, %ymm1
+; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],zmm1[2,3,2,3]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm22[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm22[18,19,20,21],zero,ymm22[19],zero,ymm22[25,26,27,22],zero,ymm22[20],zero
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm23[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm23[18],zero,zero,zero,zero,ymm23[21],zero,ymm23[19],zero,zero,zero,zero,ymm23[22],zero,ymm23[20]
+; AVX512DQ-BW-NEXT: vpor %ymm1, %ymm4, %ymm1
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm22[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm22[23],zero,ymm22[21,22,23,26],zero,ymm22[24],zero,ymm22[28,29,26,27]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm22 = ymm23[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm23[25],zero,ymm23[23],zero,zero,zero,zero,ymm23[26],zero,ymm23[24],zero,zero,zero,zero
+; AVX512DQ-BW-NEXT: vporq %ymm4, %ymm22, %ymm4
+; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm1[2,3,2,3],zmm4[2,3,2,3]
+; AVX512DQ-BW-NEXT: vmovdqu8 %zmm0, %zmm4 {%k2}
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm6[18],zero,ymm6[20,21,20,21],zero,ymm6[19],zero,ymm6[19,20,21,22],zero
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm29[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm29[20],zero,ymm29[18],zero,zero,zero,zero,ymm29[21],zero,ymm29[19],zero,zero,zero,zero,ymm29[22]
+; AVX512DQ-BW-NEXT: vpor %ymm0, %ymm1, %ymm0
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm6[23],zero,ymm6[23,24,25,26],zero,ymm6[24],zero,ymm6[30,31]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm6 = ymm29[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm29[25],zero,ymm29[23],zero,zero,zero,zero,ymm29[26],zero,ymm29[24],zero,zero
+; AVX512DQ-BW-NEXT: vpor %ymm1, %ymm6, %ymm1
+; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],zmm1[2,3,2,3]
+; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12]
+; AVX512DQ-BW-NEXT: vpermw %zmm21, %zmm1, %zmm1
+; AVX512DQ-BW-NEXT: movabsq $1161999626690365456, %rax # imm = 0x1020408102040810
+; AVX512DQ-BW-NEXT: kmovq %rax, %k1
+; AVX512DQ-BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
+; AVX512DQ-BW-NEXT: movabsq $2033499346708139548, %rax # imm = 0x1C3870E1C3870E1C
+; AVX512DQ-BW-NEXT: kmovq %rax, %k1
+; AVX512DQ-BW-NEXT: vmovdqu8 %zmm0, %zmm4 {%k1}
+; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u]
+; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm2, %xmm1
+; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u]
+; AVX512DQ-BW-NEXT: vpshufb %xmm6, %xmm5, %xmm22
+; AVX512DQ-BW-NEXT: vporq %xmm1, %xmm22, %xmm1
+; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
+; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,0,1],zmm2[0,1,0,1]
+; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm2 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128]
+; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm30, %xmm5
+; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm22 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9]
+; AVX512DQ-BW-NEXT: vpshufb %xmm22, %xmm27, %xmm23
+; AVX512DQ-BW-NEXT: vporq %xmm5, %xmm23, %xmm5
+; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm23 = xmm30[8],xmm27[8],xmm30[9],xmm27[9],xmm30[10],xmm27[10],xmm30[11],xmm27[11],xmm30[12],xmm27[12],xmm30[13],xmm27[13],xmm30[14],xmm27[14],xmm30[15],xmm27[15]
+; AVX512DQ-BW-NEXT: vpshufb %xmm19, %xmm23, %xmm19
+; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,0,1],zmm19[0,1,0,1]
+; AVX512DQ-BW-NEXT: vmovdqu8 %zmm1, %zmm5 {%k2}
+; AVX512DQ-BW-NEXT: vmovdqa {{.*#+}} xmm1 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6]
+; AVX512DQ-BW-NEXT: vpshufb %xmm1, %xmm25, %xmm19
; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} xmm23 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128]
-; AVX512DQ-BW-NEXT: vpshufb %xmm23, %xmm17, %xmm27
-; AVX512DQ-BW-NEXT: vporq %xmm22, %xmm27, %xmm22
-; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm17[8],xmm25[8],xmm17[9],xmm25[9],xmm17[10],xmm25[10],xmm17[11],xmm25[11],xmm17[12],xmm25[12],xmm17[13],xmm25[13],xmm17[14],xmm25[14],xmm17[15],xmm25[15]
-; AVX512DQ-BW-NEXT: vpshufb %xmm26, %xmm17, %xmm17
-; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm22[0,1,0,1],zmm17[0,1,0,1]
-; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm22 = [2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4,4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6]
-; AVX512DQ-BW-NEXT: vpermw %zmm3, %zmm22, %zmm3
+; AVX512DQ-BW-NEXT: vpshufb %xmm23, %xmm3, %xmm27
+; AVX512DQ-BW-NEXT: vporq %xmm19, %xmm27, %xmm19
+; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm25[8],xmm3[9],xmm25[9],xmm3[10],xmm25[10],xmm3[11],xmm25[11],xmm3[12],xmm25[12],xmm3[13],xmm25[13],xmm3[14],xmm25[14],xmm3[15],xmm25[15]
+; AVX512DQ-BW-NEXT: vpshufb %xmm10, %xmm3, %xmm3
+; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm19[0,1,0,1],zmm3[0,1,0,1]
+; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm10 = [2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4,4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6]
+; AVX512DQ-BW-NEXT: vpermw %zmm8, %zmm10, %zmm8
; AVX512DQ-BW-NEXT: movabsq $290499906672591364, %rax # imm = 0x408102040810204
; AVX512DQ-BW-NEXT: kmovq %rax, %k1
-; AVX512DQ-BW-NEXT: vmovdqu8 %zmm3, %zmm17 {%k1}
+; AVX512DQ-BW-NEXT: vmovdqu8 %zmm8, %zmm3 {%k1}
; AVX512DQ-BW-NEXT: movabsq $-8714997200177740921, %rax # imm = 0x870E1C3870E1C387
; AVX512DQ-BW-NEXT: kmovq %rax, %k1
-; AVX512DQ-BW-NEXT: vmovdqu8 %zmm17, %zmm20 {%k1}
-; AVX512DQ-BW-NEXT: vpshufb %xmm7, %xmm11, %xmm3
-; AVX512DQ-BW-NEXT: vpshufb %xmm18, %xmm10, %xmm7
-; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm7, %xmm3
-; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3],xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7]
-; AVX512DQ-BW-NEXT: vpshufb %xmm16, %xmm7, %xmm7
-; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm7[0,1,0,1],zmm3[0,1,0,1]
-; AVX512DQ-BW-NEXT: vpshufb %xmm19, %xmm13, %xmm7
-; AVX512DQ-BW-NEXT: vpshufb %xmm21, %xmm12, %xmm10
-; AVX512DQ-BW-NEXT: vpor %xmm7, %xmm10, %xmm7
-; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3],xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
-; AVX512DQ-BW-NEXT: vpshufb %xmm24, %xmm10, %xmm10
-; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm10[0,1,0,1],zmm7[0,1,0,1]
+; AVX512DQ-BW-NEXT: vmovdqu8 %zmm3, %zmm5 {%k1}
+; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm12, %xmm0
+; AVX512DQ-BW-NEXT: vpshufb %xmm6, %xmm11, %xmm3
+; AVX512DQ-BW-NEXT: vpor %xmm0, %xmm3, %xmm0
+; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
+; AVX512DQ-BW-NEXT: vpshufb %xmm26, %xmm3, %xmm3
+; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm3[0,1,0,1],zmm0[0,1,0,1]
+; AVX512DQ-BW-NEXT: vpshufb %xmm2, %xmm14, %xmm2
+; AVX512DQ-BW-NEXT: vpshufb %xmm22, %xmm13, %xmm3
+; AVX512DQ-BW-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3],xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7]
+; AVX512DQ-BW-NEXT: vpshufb %xmm17, %xmm3, %xmm3
+; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,0,1],zmm2[0,1,0,1]
; AVX512DQ-BW-NEXT: movabsq $871499720017774092, %rax # imm = 0xC183060C183060C
; AVX512DQ-BW-NEXT: kmovq %rax, %k1
-; AVX512DQ-BW-NEXT: vmovdqu8 %zmm3, %zmm7 {%k1}
-; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm14, %xmm3
-; AVX512DQ-BW-NEXT: vpshufb %xmm23, %xmm15, %xmm8
-; AVX512DQ-BW-NEXT: vpor %xmm3, %xmm8, %xmm3
-; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7]
-; AVX512DQ-BW-NEXT: vpshufb %xmm6, %xmm8, %xmm6
-; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm6[0,1,0,1],zmm3[0,1,0,1]
-; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4]
-; AVX512DQ-BW-NEXT: vpermw %zmm0, %zmm6, %zmm0
+; AVX512DQ-BW-NEXT: vmovdqu8 %zmm0, %zmm2 {%k1}
+; AVX512DQ-BW-NEXT: vpshufb %xmm1, %xmm15, %xmm0
+; AVX512DQ-BW-NEXT: vpshufb %xmm23, %xmm16, %xmm1
+; AVX512DQ-BW-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3],xmm16[4],xmm15[4],xmm16[5],xmm15[5],xmm16[6],xmm15[6],xmm16[7],xmm15[7]
+; AVX512DQ-BW-NEXT: vpshufb %xmm24, %xmm1, %xmm1
+; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,0,1],zmm0[0,1,0,1]
+; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4]
+; AVX512DQ-BW-NEXT: vpermw %zmm21, %zmm1, %zmm1
; AVX512DQ-BW-NEXT: movabsq $4647998506761461824, %rax # imm = 0x4081020408102040
; AVX512DQ-BW-NEXT: kmovq %rax, %k1
-; AVX512DQ-BW-NEXT: vmovdqu8 %zmm0, %zmm3 {%k1}
+; AVX512DQ-BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512DQ-BW-NEXT: movabsq $8133997386832558192, %rax # imm = 0x70E1C3870E1C3870
; AVX512DQ-BW-NEXT: kmovq %rax, %k1
-; AVX512DQ-BW-NEXT: vmovdqu8 %zmm3, %zmm7 {%k1}
+; AVX512DQ-BW-NEXT: vmovdqu8 %zmm0, %zmm2 {%k1}
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, (%rax)
-; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, 256(%rax)
-; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 128(%rax)
-; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 320(%rax)
-; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 192(%rax)
+; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rax)
+; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 256(%rax)
+; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 128(%rax)
+; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, 320(%rax)
+; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 192(%rax)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 64(%rax)
-; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 384(%rax)
+; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 384(%rax)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: store_i8_stride7_vf64:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rcx), %ymm1
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128]
-; AVX512DQ-BW-FCP-NEXT: # ymm3 = mem[0,1,0,1]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdx), %ymm11
-; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm11, %ymm2
-; AVX512DQ-BW-FCP-NEXT: vpor %ymm0, %ymm2, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,128,23,128,21,22,23,26,128,24,128,28,29,26,27]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm8, %ymm11, %ymm2
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128]
-; AVX512DQ-BW-FCP-NEXT: # ymm10 = mem[0,1,0,1]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm5
-; AVX512DQ-BW-FCP-NEXT: vpor %ymm2, %ymm5, %ymm2
-; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[2,3,2,3],zmm0[2,3,2,3]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rdi), %ymm16
-; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm16, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rsi), %ymm17
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128]
-; AVX512DQ-BW-FCP-NEXT: # ymm9 = mem[0,1,0,1]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm17, %ymm5
-; AVX512DQ-BW-FCP-NEXT: vpor %ymm0, %ymm5, %ymm0
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128]
-; AVX512DQ-BW-FCP-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm17, %ymm5
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm20 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm20, %ymm16, %ymm6
-; AVX512DQ-BW-FCP-NEXT: vpor %ymm5, %ymm6, %ymm5
-; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm5[2,3,2,3],zmm0[2,3,2,3]
-; AVX512DQ-BW-FCP-NEXT: movabsq $1742999440035548184, %r10 # imm = 0x183060C183060C18
-; AVX512DQ-BW-FCP-NEXT: kmovq %r10, %k1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%r9), %ymm18
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128]
-; AVX512DQ-BW-FCP-NEXT: # ymm5 = mem[0,1,0,1]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm5, %ymm18, %ymm2
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%r8), %ymm22
-; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm6, %ymm22, %ymm12
-; AVX512DQ-BW-FCP-NEXT: vpor %ymm2, %ymm12, %ymm2
-; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,24,25,24,25,128,23,128,23,24,25,26,128,24,128,30,31]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm12, %ymm22, %ymm14
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128]
-; AVX512DQ-BW-FCP-NEXT: # ymm13 = mem[0,1,0,1]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm13, %ymm18, %ymm15
-; AVX512DQ-BW-FCP-NEXT: vpor %ymm14, %ymm15, %ymm14
-; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm14[2,3,2,3],zmm2[2,3,2,3]
-; AVX512DQ-BW-FCP-NEXT: movabsq $6971997760142192736, %r10 # imm = 0x60C183060C183060
-; AVX512DQ-BW-FCP-NEXT: kmovq %r10, %k1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rax), %ymm2
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm14 = [11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12,14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm14, %zmm14
-; AVX512DQ-BW-FCP-NEXT: movabsq $-9150747060186627967, %r10 # imm = 0x8102040810204081
-; AVX512DQ-BW-FCP-NEXT: kmovq %r10, %k2
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm0 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm14, %ymm11, %ymm15
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm21 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm21, %ymm1, %ymm23
-; AVX512DQ-BW-FCP-NEXT: vporq %ymm15, %ymm23, %ymm15
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm24 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,128,18,19,20,21,128,19,128,25,26,27,22,128,20,128]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm24, %ymm11, %ymm11
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm25 = [128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20]
-; AVX512DQ-BW-FCP-NEXT: # ymm25 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm25, %ymm1, %ymm1
-; AVX512DQ-BW-FCP-NEXT: vpor %ymm1, %ymm11, %ymm1
-; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm15, %zmm23
-; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm11, %ymm16, %ymm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm15, %ymm17, %ymm26
-; AVX512DQ-BW-FCP-NEXT: vporq %ymm1, %ymm26, %ymm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm26 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm26, %ymm16, %ymm16
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm27 = [128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128]
-; AVX512DQ-BW-FCP-NEXT: # ymm27 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm27, %ymm17, %ymm17
-; AVX512DQ-BW-FCP-NEXT: vporq %ymm16, %ymm17, %ymm16
-; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm16 = ymm16[2,3,2,3]
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm1, %zmm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rdx), %ymm18
+; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,0,1,14,128,14,15,0,1,14,15,128,13,14,15,16,17,16,128,30,31,30,31,16,17,128,31,28,29,30,31]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm10, %ymm18, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rcx), %ymm26
+; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm11, %ymm26, %ymm1
+; AVX512DQ-BW-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm18[18,19,20,21],zero,ymm18[19],zero,ymm18[25,26,27,22],zero,ymm18[20],zero
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm26[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm26[18],zero,zero,zero,zero,ymm26[21],zero,ymm26[19],zero,zero,zero,zero,ymm26[22],zero,ymm26[20]
+; AVX512DQ-BW-FCP-NEXT: vpor %ymm3, %ymm4, %ymm3
+; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3]
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm3
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rdi), %ymm28
+; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,14,128,12,13,0,1,14,15,128,3,12,13,2,3,16,128,30,31,28,29,16,17,128,31,18,19,28,29,18,128]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm8, %ymm28, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rsi), %ymm29
+; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm12, %ymm29, %ymm4
+; AVX512DQ-BW-FCP-NEXT: vpor %ymm0, %ymm4, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm28[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,ymm28[19],zero,ymm28[21,20,21,22],zero,ymm28[20],zero,ymm28[22,23]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm29[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm29[21],zero,ymm29[19],zero,zero,zero,zero,ymm29[22],zero,ymm29[20],zero,zero
+; AVX512DQ-BW-FCP-NEXT: vpor %ymm4, %ymm6, %ymm4
+; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3]
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm25
; AVX512DQ-BW-FCP-NEXT: movabsq $3485998880071096368, %r10 # imm = 0x3060C183060C1830
-; AVX512DQ-BW-FCP-NEXT: kmovq %r10, %k2
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm23, %zmm1 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm16 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm16, %ymm18, %ymm23
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm17 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm17, %ymm22, %ymm28
-; AVX512DQ-BW-FCP-NEXT: vporq %ymm23, %ymm28, %ymm23
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} ymm28 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,20,128,18,128,20,21,20,21,128,19,128,19,20,21,22,128]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm28, %ymm22, %ymm22
-; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm29 = [128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22]
-; AVX512DQ-BW-FCP-NEXT: # ymm29 = mem[0,1,2,3,0,1,2,3]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm29, %ymm18, %ymm18
-; AVX512DQ-BW-FCP-NEXT: vporq %ymm22, %ymm18, %ymm18
-; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm18 = ymm18[2,3,2,3]
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm18, %zmm23, %zmm18
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm30 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12]
-; AVX512DQ-BW-FCP-NEXT: vpermw %ymm2, %ymm30, %ymm22
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm23 = ymm2[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31]
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm22, %zmm23, %zmm22
+; AVX512DQ-BW-FCP-NEXT: kmovq %r10, %k1
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm3, %zmm25 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%r9), %ymm30
+; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm14, %ymm30, %ymm3
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%r8), %ymm31
+; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm15, %ymm31, %ymm4
+; AVX512DQ-BW-FCP-NEXT: vpor %ymm3, %ymm4, %ymm3
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm31[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm31[18],zero,ymm31[20,21,20,21],zero,ymm31[19],zero,ymm31[19,20,21,22],zero
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm30[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm30[20],zero,ymm30[18],zero,zero,zero,zero,ymm30[21],zero,ymm30[19],zero,zero,zero,zero,ymm30[22]
+; AVX512DQ-BW-FCP-NEXT: vpor %ymm9, %ymm13, %ymm9
+; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3]
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm3, %zmm9
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rax), %ymm16
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12]
+; AVX512DQ-BW-FCP-NEXT: vpermw %ymm16, %ymm0, %ymm13
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm17 = ymm16[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31]
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm13, %zmm17, %zmm13
; AVX512DQ-BW-FCP-NEXT: movabsq $145249953336295682, %r10 # imm = 0x204081020408102
-; AVX512DQ-BW-FCP-NEXT: kmovq %r10, %k3
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm22, %zmm18 {%k3}
+; AVX512DQ-BW-FCP-NEXT: kmovq %r10, %k2
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm13, %zmm9 {%k2}
; AVX512DQ-BW-FCP-NEXT: movabsq $-4357498600088870461, %r10 # imm = 0xC3870E1C3870E1C3
-; AVX512DQ-BW-FCP-NEXT: kmovq %r10, %k3
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm18, %zmm1 {%k3}
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %ymm18
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm18, %ymm19
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %ymm22
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm20, %ymm22, %ymm20
-; AVX512DQ-BW-FCP-NEXT: vporq %ymm19, %ymm20, %ymm19
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm26, %ymm22, %ymm20
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm27, %ymm18, %ymm23
-; AVX512DQ-BW-FCP-NEXT: vporq %ymm20, %ymm23, %ymm20
-; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm20[2,3,2,3],zmm19[2,3,2,3]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %ymm19
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm8, %ymm19, %ymm8
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %ymm23
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm10, %ymm23, %ymm10
-; AVX512DQ-BW-FCP-NEXT: vpor %ymm8, %ymm10, %ymm8
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rax), %zmm10
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm24, %ymm19, %ymm24
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm25, %ymm23, %ymm25
-; AVX512DQ-BW-FCP-NEXT: vporq %ymm24, %ymm25, %ymm24
-; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm24[2,3,2,3],zmm8[2,3,2,3]
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm20, %zmm8 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %ymm20
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm12, %ymm20, %ymm12
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %ymm24
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm13, %ymm24, %ymm13
-; AVX512DQ-BW-FCP-NEXT: vpor %ymm12, %ymm13, %ymm12
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm28, %ymm20, %ymm13
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm29, %ymm24, %ymm25
-; AVX512DQ-BW-FCP-NEXT: vporq %ymm13, %ymm25, %ymm13
-; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm13[2,3,2,3],zmm12[2,3,2,3]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm10, %zmm30, %zmm13
-; AVX512DQ-BW-FCP-NEXT: movabsq $1161999626690365456, %rax # imm = 0x1020408102040810
-; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k3
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm13, %zmm12 {%k3}
-; AVX512DQ-BW-FCP-NEXT: movabsq $2033499346708139548, %rax # imm = 0x1C3870E1C3870E1C
-; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k3
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm12, %zmm8 {%k3}
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm14, %ymm19, %ymm12
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm21, %ymm23, %ymm13
-; AVX512DQ-BW-FCP-NEXT: vpor %ymm12, %ymm13, %ymm14
-; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm12
-; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %xmm13
-; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm25 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm21 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm21, %xmm25, %xmm25
-; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm25 = ymm25[0,1,0,1]
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm25, %zmm26
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm11, %ymm22, %ymm11
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm15, %ymm18, %ymm14
-; AVX512DQ-BW-FCP-NEXT: vpor %ymm11, %ymm14, %ymm11
-; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm14
-; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %xmm15
-; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm27 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm25 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm25, %xmm27, %xmm27
-; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm27 = ymm27[0,1,0,1]
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm27, %zmm11
-; AVX512DQ-BW-FCP-NEXT: movabsq $435749860008887046, %rax # imm = 0x60C183060C18306
-; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k3
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm26, %zmm11 {%k3}
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm16, %ymm24, %ymm16
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm17, %ymm20, %ymm17
-; AVX512DQ-BW-FCP-NEXT: vporq %ymm16, %ymm17, %ymm27
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %xmm16
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %xmm17
-; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm28 = xmm17[8],xmm16[8],xmm17[9],xmm16[9],xmm17[10],xmm16[10],xmm17[11],xmm16[11],xmm17[12],xmm16[12],xmm17[13],xmm16[13],xmm17[14],xmm16[14],xmm17[15],xmm16[15]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm26 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm26, %xmm28, %xmm28
-; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm28 = ymm28[0,1,0,1]
-; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm27, %zmm28, %zmm27
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm28 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6,6,1,6,0,7,1,0,7,14,9,14,8,15,9,8,15]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm10, %zmm28, %zmm28
+; AVX512DQ-BW-FCP-NEXT: kmovq %r10, %k2
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm9, %zmm25 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %ymm9
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm13
+; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %ymm10
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm11
+; AVX512DQ-BW-FCP-NEXT: vpor %ymm13, %ymm11, %ymm11
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %xmm17
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %xmm19
+; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm19[8],xmm17[8],xmm19[9],xmm17[9],xmm19[10],xmm17[10],xmm19[11],xmm17[11],xmm19[12],xmm17[12],xmm19[13],xmm17[13],xmm19[14],xmm17[14],xmm19[15],xmm17[15]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm27 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm27, %xmm13, %xmm13
+; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,0,1]
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm13, %zmm22
+; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm11
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm8, %ymm11, %ymm8
+; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %ymm13
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm12, %ymm13, %ymm12
+; AVX512DQ-BW-FCP-NEXT: vpor %ymm8, %ymm12, %ymm8
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %xmm20
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %xmm21
+; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm21[8],xmm20[8],xmm21[9],xmm20[9],xmm21[10],xmm20[10],xmm21[11],xmm20[11],xmm21[12],xmm20[12],xmm21[13],xmm20[13],xmm21[14],xmm20[14],xmm21[15],xmm20[15]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
+; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1]
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm12, %zmm8
+; AVX512DQ-BW-FCP-NEXT: movabsq $435749860008887046, %r10 # imm = 0x60C183060C18306
+; AVX512DQ-BW-FCP-NEXT: kmovq %r10, %k2
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm22, %zmm8 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r9), %ymm12
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm14, %ymm12, %ymm22
+; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %ymm14
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm15, %ymm14, %ymm15
+; AVX512DQ-BW-FCP-NEXT: vporq %ymm22, %ymm15, %ymm15
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %xmm22
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %xmm23
+; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm23[8],xmm22[8],xmm23[9],xmm22[9],xmm23[10],xmm22[10],xmm23[11],xmm22[11],xmm23[12],xmm22[12],xmm23[13],xmm22[13],xmm23[14],xmm22[14],xmm23[15],xmm22[15]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
+; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rax), %zmm15
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6,6,1,6,0,7,1,0,7,14,9,14,8,15,9,8,15]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm15, %zmm2, %zmm2
; AVX512DQ-BW-FCP-NEXT: movabsq $2323999253380730912, %rax # imm = 0x2040810204081020
-; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k3
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm28, %zmm27 {%k3}
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rdx), %xmm28
+; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k2
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm2, %zmm0 {%k2}
; AVX512DQ-BW-FCP-NEXT: movabsq $4066998693416279096, %rax # imm = 0x3870E1C3870E1C38
+; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k2
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm8 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128]
+; AVX512DQ-BW-FCP-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm1, %ymm26, %ymm2
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128]
+; AVX512DQ-BW-FCP-NEXT: # ymm0 = mem[0,1,0,1]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm0, %ymm18, %ymm3
+; AVX512DQ-BW-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm18[8,9],zero,ymm18[7],zero,ymm18[5,6,7,10],zero,ymm18[8],zero,ymm18[12,13,10,11,24,25],zero,ymm18[23],zero,ymm18[21,22,23,26],zero,ymm18[24],zero,ymm18[28,29,26,27]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm18 = zero,zero,ymm26[9],zero,ymm26[7],zero,zero,zero,zero,ymm26[10],zero,ymm26[8],zero,zero,zero,zero,zero,zero,ymm26[25],zero,ymm26[23],zero,zero,zero,zero,ymm26[26],zero,ymm26[24],zero,zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT: vporq %ymm3, %ymm18, %ymm3
+; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[2,3,2,3],zmm2[2,3,2,3]
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29]
+; AVX512DQ-BW-FCP-NEXT: # ymm3 = mem[0,1,0,1]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm3, %ymm28, %ymm18
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm26 = [128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,128,128]
+; AVX512DQ-BW-FCP-NEXT: # ymm26 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm26, %ymm29, %ymm4
+; AVX512DQ-BW-FCP-NEXT: vporq %ymm18, %ymm4, %ymm4
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm18 = ymm29[9],zero,ymm29[7],zero,zero,zero,zero,ymm29[10],zero,ymm29[8],zero,zero,zero,zero,ymm29[11],zero,ymm29[25],zero,ymm29[23],zero,zero,zero,zero,ymm29[26],zero,ymm29[24],zero,zero,zero,zero,ymm29[27],zero
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm28 = zero,ymm28[7],zero,zero,zero,zero,ymm28[10],zero,ymm28[8],zero,zero,zero,zero,ymm28[11],zero,ymm28[9],zero,ymm28[23],zero,zero,zero,zero,ymm28[26],zero,ymm28[24],zero,zero,zero,zero,ymm28[27],zero,ymm28[25]
+; AVX512DQ-BW-FCP-NEXT: vporq %ymm18, %ymm28, %ymm18
+; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm18[2,3,2,3],zmm4[2,3,2,3]
+; AVX512DQ-BW-FCP-NEXT: movabsq $1742999440035548184, %rax # imm = 0x183060C183060C18
+; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k2
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm2, %zmm18 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128]
+; AVX512DQ-BW-FCP-NEXT: # ymm5 = mem[0,1,0,1]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm5, %ymm30, %ymm4
+; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm29 = [128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29,128,27,128,128,128,128,30,128,28,128,128,128,128,31,128,29]
+; AVX512DQ-BW-FCP-NEXT: # ymm29 = mem[0,1,2,3,0,1,2,3]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm29, %ymm31, %ymm28
+; AVX512DQ-BW-FCP-NEXT: vporq %ymm4, %ymm28, %ymm4
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm28 = ymm31[8,9,8,9],zero,ymm31[7],zero,ymm31[7,8,9,10],zero,ymm31[8],zero,ymm31[14,15,24,25,24,25],zero,ymm31[23],zero,ymm31[23,24,25,26],zero,ymm31[24],zero,ymm31[30,31]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm30 = zero,zero,zero,zero,ymm30[9],zero,ymm30[7],zero,zero,zero,zero,ymm30[10],zero,ymm30[8],zero,zero,zero,zero,zero,zero,ymm30[25],zero,ymm30[23],zero,zero,zero,zero,ymm30[26],zero,ymm30[24],zero,zero
+; AVX512DQ-BW-FCP-NEXT: vporq %ymm28, %ymm30, %ymm28
+; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm28[2,3,2,3],zmm4[2,3,2,3]
+; AVX512DQ-BW-FCP-NEXT: movabsq $6971997760142192736, %rax # imm = 0x60C183060C183060
+; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k2
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm4, %zmm18 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12,14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm16, %zmm4, %zmm4
+; AVX512DQ-BW-FCP-NEXT: movabsq $-9150747060186627967, %rax # imm = 0x8102040810204081
; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k3
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm27, %zmm11 {%k3}
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rdi), %xmm27
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm22, %ymm7
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rsi), %xmm22
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm18, %ymm9
-; AVX512DQ-BW-FCP-NEXT: vpor %ymm7, %ymm9, %ymm9
-; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm18 = xmm27[0],xmm22[0],xmm27[1],xmm22[1],xmm27[2],xmm22[2],xmm27[3],xmm22[3],xmm27[4],xmm22[4],xmm27[5],xmm22[5],xmm27[6],xmm22[6],xmm27[7],xmm22[7]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm18, %xmm18
-; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm18 = zmm9[2,3,2,3],zmm18[0,1,0,1]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rcx), %xmm29
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm3, %ymm23, %ymm3
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%r9), %xmm23
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm19, %ymm4
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm4, %zmm18 {%k3}
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm3, %ymm11, %ymm3
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm26, %ymm13, %ymm4
; AVX512DQ-BW-FCP-NEXT: vpor %ymm3, %ymm4, %ymm3
-; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm28[0],xmm29[0],xmm28[1],xmm29[1],xmm28[2],xmm29[2],xmm28[3],xmm29[3],xmm28[4],xmm29[4],xmm28[5],xmm29[5],xmm28[6],xmm29[6],xmm28[7],xmm29[7]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm9, %xmm4, %xmm4
-; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[2,3,2,3],zmm4[0,1,0,1]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%r8), %xmm19
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm18, %zmm3 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm5, %ymm24, %ymm4
-; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm6, %ymm20, %ymm5
-; AVX512DQ-BW-FCP-NEXT: vpor %ymm4, %ymm5, %ymm5
-; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm19[0],xmm23[0],xmm19[1],xmm23[1],xmm19[2],xmm23[2],xmm19[3],xmm23[3],xmm19[4],xmm23[4],xmm19[5],xmm23[5],xmm19[6],xmm23[6],xmm19[7],xmm23[7]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm4, %xmm6, %xmm6
-; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[2,3,2,3],zmm6[0,1,0,1]
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15,17,17,16,16,17,17,16,16,20,21,17,17,17,17,16,16]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm10, %zmm6, %zmm6
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm4
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rsi), %xmm31
+; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm26 = xmm4[0],xmm31[0],xmm4[1],xmm31[1],xmm4[2],xmm31[2],xmm4[3],xmm31[3],xmm4[4],xmm31[4],xmm4[5],xmm31[5],xmm4[6],xmm31[6],xmm4[7],xmm31[7]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm28 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm28, %xmm26, %xmm26
+; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[2,3,2,3],zmm26[0,1,0,1]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm1, %ymm10, %ymm1
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm0, %ymm9, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vpor %ymm1, %ymm0, %ymm2
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdx), %xmm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rcx), %xmm26
+; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm26[0],xmm1[1],xmm26[1],xmm1[2],xmm26[2],xmm1[3],xmm26[3],xmm1[4],xmm26[4],xmm1[5],xmm26[5],xmm1[6],xmm26[6],xmm1[7],xmm26[7]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm30 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm30, %xmm0, %xmm0
+; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm2[2,3,2,3],zmm0[0,1,0,1]
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm3, %zmm6 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm5, %ymm12, %ymm2
+; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm29, %ymm14, %ymm3
+; AVX512DQ-BW-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%r9), %xmm3
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%r8), %xmm5
+; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm29 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm29, %xmm0, %xmm0
+; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[2,3,2,3],zmm0[0,1,0,1]
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [14,13,14,15,15,14,14,15,14,13,14,15,15,14,14,15,17,17,16,16,17,17,16,16,20,21,17,17,17,17,16,16]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm15, %zmm2, %zmm2
; AVX512DQ-BW-FCP-NEXT: movabsq $580999813345182728, %rax # imm = 0x810204081020408
-; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k2
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm5 {%k2}
+; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1}
; AVX512DQ-BW-FCP-NEXT: movabsq $1016749673354069774, %rax # imm = 0xE1C3870E1C3870E
-; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k2
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm5, %zmm3 {%k2}
-; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm6, %xmm29, %xmm5
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm18 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm18, %xmm28, %xmm20
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm5, %xmm20, %xmm5
-; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm29[8],xmm28[8],xmm29[9],xmm28[9],xmm29[10],xmm28[10],xmm29[11],xmm28[11],xmm29[12],xmm28[12],xmm29[13],xmm28[13],xmm29[14],xmm28[14],xmm29[15],xmm28[15]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm21, %xmm20, %xmm20
-; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm5[0,1,0,1],zmm20[0,1,0,1]
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm21 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm21, %xmm22, %xmm5
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm24 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm24, %xmm27, %xmm28
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm5, %xmm28, %xmm5
-; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm22[8],xmm27[8],xmm22[9],xmm27[9],xmm22[10],xmm27[10],xmm22[11],xmm27[11],xmm22[12],xmm27[12],xmm22[13],xmm27[13],xmm22[14],xmm27[14],xmm22[15],xmm27[15]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm25, %xmm22, %xmm22
-; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,0,1],zmm22[0,1,0,1]
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm20, %zmm5 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm20 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm20, %xmm23, %xmm22
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm25 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm25, %xmm19, %xmm27
-; AVX512DQ-BW-FCP-NEXT: vporq %xmm22, %xmm27, %xmm22
-; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm19[8],xmm23[8],xmm19[9],xmm23[9],xmm19[10],xmm23[10],xmm19[11],xmm23[11],xmm19[12],xmm23[12],xmm19[13],xmm23[13],xmm19[14],xmm23[14],xmm19[15],xmm23[15]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm26, %xmm19, %xmm19
-; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm22[0,1,0,1],zmm19[0,1,0,1]
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm22 = [2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4,4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm22, %zmm2
+; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm6 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm26, %xmm2
+; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm24
+; AVX512DQ-BW-FCP-NEXT: vporq %xmm2, %xmm24, %xmm2
+; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm26[8],xmm1[8],xmm26[9],xmm1[9],xmm26[10],xmm1[10],xmm26[11],xmm1[11],xmm26[12],xmm1[12],xmm26[13],xmm1[13],xmm26[14],xmm1[14],xmm26[15],xmm1[15]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm27, %xmm1, %xmm1
+; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,0,1],zmm1[0,1,0,1]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm2, %xmm31, %xmm24
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm26 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm26, %xmm4, %xmm27
+; AVX512DQ-BW-FCP-NEXT: vporq %xmm24, %xmm27, %xmm24
+; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm31[8],xmm4[8],xmm31[9],xmm4[9],xmm31[10],xmm4[10],xmm31[11],xmm4[11],xmm31[12],xmm4[12],xmm31[13],xmm4[13],xmm31[14],xmm4[14],xmm31[15],xmm4[15]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
+; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm24[0,1,0,1],zmm4[0,1,0,1]
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm1, %zmm24 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm1, %xmm3, %xmm4
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm27 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm27, %xmm5, %xmm31
+; AVX512DQ-BW-FCP-NEXT: vporq %xmm4, %xmm31, %xmm4
+; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
+; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm4[0,1,0,1],zmm3[0,1,0,1]
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4,4,5,4,5,5,6,5,6,4,5,4,5,5,6,5,6]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm16, %zmm4, %zmm4
; AVX512DQ-BW-FCP-NEXT: movabsq $290499906672591364, %rax # imm = 0x408102040810204
; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm2, %zmm19 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm4, %zmm3 {%k1}
; AVX512DQ-BW-FCP-NEXT: movabsq $-8714997200177740921, %rax # imm = 0x870E1C3870E1C387
; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm19, %zmm5 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm6, %xmm13, %xmm2
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm18, %xmm12, %xmm6
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm2, %xmm6, %xmm2
-; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3],xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm9, %xmm6, %xmm6
-; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm6[0,1,0,1],zmm2[0,1,0,1]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm21, %xmm15, %xmm6
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm24, %xmm14, %xmm9
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm9, %xmm6
-; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3],xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm9, %xmm7
-; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm7[0,1,0,1],zmm6[0,1,0,1]
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm3, %zmm24 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm19, %xmm3
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm0, %xmm17, %xmm0
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm17[0],xmm19[0],xmm17[1],xmm19[1],xmm17[2],xmm19[2],xmm17[3],xmm19[3],xmm17[4],xmm19[4],xmm17[5],xmm19[5],xmm17[6],xmm19[6],xmm17[7],xmm19[7]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm30, %xmm3, %xmm3
+; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm3[0,1,0,1],zmm0[0,1,0,1]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm2, %xmm21, %xmm2
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm26, %xmm20, %xmm3
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm20[0],xmm21[0],xmm20[1],xmm21[1],xmm20[2],xmm21[2],xmm20[3],xmm21[3],xmm20[4],xmm21[4],xmm20[5],xmm21[5],xmm20[6],xmm21[6],xmm20[7],xmm21[7]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm28, %xmm3, %xmm3
+; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,0,1],zmm2[0,1,0,1]
; AVX512DQ-BW-FCP-NEXT: movabsq $871499720017774092, %rax # imm = 0xC183060C183060C
; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm2, %zmm6 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm20, %xmm16, %xmm2
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm25, %xmm17, %xmm7
-; AVX512DQ-BW-FCP-NEXT: vpor %xmm2, %xmm7, %xmm2
-; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm17[0],xmm16[0],xmm17[1],xmm16[1],xmm17[2],xmm16[2],xmm17[3],xmm16[3],xmm17[4],xmm16[4],xmm17[5],xmm16[5],xmm17[6],xmm16[6],xmm17[7],xmm16[7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm4, %xmm7, %xmm4
-; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm4[0,1,0,1],zmm2[0,1,0,1]
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4]
-; AVX512DQ-BW-FCP-NEXT: vpermw %zmm10, %zmm4, %zmm4
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm2 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm1, %xmm22, %xmm0
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm27, %xmm23, %xmm1
+; AVX512DQ-BW-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3],xmm23[4],xmm22[4],xmm23[5],xmm22[5],xmm23[6],xmm22[6],xmm23[7],xmm22[7]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm29, %xmm1, %xmm1
+; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,0,1],zmm0[0,1,0,1]
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [1,1,0,0,1,1,0,0,4,5,1,1,1,1,0,0,2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm15, %zmm1, %zmm1
; AVX512DQ-BW-FCP-NEXT: movabsq $4647998506761461824, %rax # imm = 0x4081020408102040
; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm4, %zmm2 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512DQ-BW-FCP-NEXT: movabsq $8133997386832558192, %rax # imm = 0x70E1C3870E1C3870
; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm2, %zmm6 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm2 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21],zero,ymm11[19],zero,ymm11[21,20,21,22],zero,ymm11[20],zero,ymm11[22,23]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm13[21],zero,ymm13[19],zero,zero,zero,zero,ymm13[22],zero,ymm13[20],zero,zero
+; AVX512DQ-BW-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm13[23],zero,zero,zero,zero,ymm13[26],zero,ymm13[24],zero,zero,zero,zero,ymm13[27],zero
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm11[23],zero,zero,zero,zero,ymm11[26],zero,ymm11[24],zero,zero,zero,zero,ymm11[27],zero,ymm11[25]
+; AVX512DQ-BW-FCP-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],zmm1[2,3,2,3]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,ymm9[18,19,20,21],zero,ymm9[19],zero,ymm9[25,26,27,22],zero,ymm9[20],zero
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm10[18],zero,zero,zero,zero,ymm10[21],zero,ymm10[19],zero,zero,zero,zero,ymm10[22],zero,ymm10[20]
+; AVX512DQ-BW-FCP-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25],zero,ymm9[23],zero,ymm9[21,22,23,26],zero,ymm9[24],zero,ymm9[28,29,26,27]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm10[25],zero,ymm10[23],zero,zero,zero,zero,ymm10[26],zero,ymm10[24],zero,zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT: vpor %ymm3, %ymm4, %ymm3
+; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[2,3,2,3],zmm3[2,3,2,3]
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm1 {%k2}
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm14[18],zero,ymm14[20,21,20,21],zero,ymm14[19],zero,ymm14[19,20,21,22],zero
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm12[20],zero,ymm12[18],zero,zero,zero,zero,ymm12[21],zero,ymm12[19],zero,zero,zero,zero,ymm12[22]
+; AVX512DQ-BW-FCP-NEXT: vpor %ymm0, %ymm3, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25],zero,ymm14[23],zero,ymm14[23,24,25,26],zero,ymm14[24],zero,ymm14[30,31]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm12[25],zero,ymm12[23],zero,zero,zero,zero,ymm12[26],zero,ymm12[24],zero,zero
+; AVX512DQ-BW-FCP-NEXT: vpor %ymm3, %ymm4, %ymm3
+; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],zmm3[2,3,2,3]
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [10,9,9,10,10,9,9,10,9,10,14,15,10,9,9,10,11,13,12,11,12,13,13,12,11,13,12,11,12,13,13,12]
+; AVX512DQ-BW-FCP-NEXT: vpermw %zmm15, %zmm3, %zmm3
+; AVX512DQ-BW-FCP-NEXT: movabsq $1161999626690365456, %rax # imm = 0x1020408102040810
+; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1}
+; AVX512DQ-BW-FCP-NEXT: movabsq $2033499346708139548, %rax # imm = 0x1C3870E1C3870E1C
+; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 128(%rax)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, (%rax)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 320(%rax)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 256(%rax)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 192(%rax)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 384(%rax)
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 64(%rax)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 128(%rax)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 320(%rax)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, 256(%rax)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 192(%rax)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 384(%rax)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 64(%rax)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%in.vec0 = load <64 x i8>, ptr %in.vecptr0, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
index a03b03e120e88..7e3c534849b5b 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll
@@ -2946,182 +2946,189 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
;
; AVX2-LABEL: store_i8_stride8_vf32:
; AVX2: # %bb.0:
-; AVX2-NEXT: subq $40, %rsp
+; AVX2-NEXT: subq $72, %rsp
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-NEXT: vmovdqa (%r10), %xmm3
; AVX2-NEXT: vmovdqa (%rax), %xmm4
-; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[0,0,2,1,4,5,6,7]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,2,2,3,4,5,6,7]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
+; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7]
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX2-NEXT: vmovdqa (%r9), %xmm6
-; AVX2-NEXT: vmovdqa (%r8), %xmm0
-; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[0,1,1,3,4,5,6,7]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm11[2,1,3,3,4,5,6,7]
+; AVX2-NEXT: vmovdqa (%r8), %xmm8
+; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,1,1,3,4,5,6,7]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[2,1,3,3,4,5,6,7]
; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm5, %ymm5
-; AVX2-NEXT: vpblendw {{.*#+}} ymm12 = ymm5[0],ymm1[1],ymm5[2],ymm1[3],ymm5[4,5,6,7,8],ymm1[9],ymm5[10],ymm1[11],ymm5[12,13,14,15]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[2],ymm1[3],ymm5[4,5,6,7,8],ymm1[9],ymm5[10],ymm1[11],ymm5[12,13,14,15]
; AVX2-NEXT: vmovdqa (%rsi), %xmm5
-; AVX2-NEXT: vmovdqa (%rdi), %xmm1
-; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
-; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm9 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero,xmm13[2],zero,zero,zero,xmm13[3],zero,zero,zero
-; AVX2-NEXT: vmovdqa (%rcx), %xmm8
-; AVX2-NEXT: vmovdqa (%rdx), %xmm10
-; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[0,1,0,1,4,5,2,3,0,1,4,5,4,5,6,7]
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm15 = xmm15[0],zero,xmm15[1],zero,xmm15[2],zero,xmm15[3],zero
-; AVX2-NEXT: vpblendw {{.*#+}} ymm15 = ymm9[0],ymm15[1],ymm9[2,3,4],ymm15[5],ymm9[6,7,8],ymm15[9],ymm9[10,11,12],ymm15[13],ymm9[14,15]
-; AVX2-NEXT: vmovaps 16(%r10), %xmm7
-; AVX2-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[0,0,2,1,4,4,6,5]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0],ymm12[1],ymm15[2],ymm12[3],ymm15[4],ymm12[5],ymm15[6],ymm12[7]
-; AVX2-NEXT: vmovdqu %ymm7, (%rsp) # 32-byte Spill
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm12 = xmm2[0,1,2,3,4,4,6,5]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm12, %ymm2
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm12 = xmm11[0,1,2,3,4,5,5,7]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,6,5,7,7]
-; AVX2-NEXT: vinserti128 $1, %xmm11, %ymm12, %ymm11
-; AVX2-NEXT: vmovdqa 16(%rax), %xmm12
-; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm11[0,1,2,3,4],ymm2[5],ymm11[6],ymm2[7],ymm11[8,9,10,11,12],ymm2[13],ymm11[14],ymm2[15]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm11 = xmm14[0,1,2,3,4,4,6,5]
+; AVX2-NEXT: vmovdqa (%rdi), %xmm7
+; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
+; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm9 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero,xmm12[2],zero,zero,zero,xmm12[3],zero,zero,zero
+; AVX2-NEXT: vmovdqa (%rcx), %xmm10
+; AVX2-NEXT: vmovdqa (%rdx), %xmm11
+; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm13 = xmm14[0,1,0,1,4,5,2,3,0,1,4,5,4,5,6,7]
+; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm13 = xmm13[0],zero,xmm13[1],zero,xmm13[2],zero,xmm13[3],zero
+; AVX2-NEXT: vpblendw {{.*#+}} ymm13 = ymm9[0],ymm13[1],ymm9[2,3,4],ymm13[5],ymm9[6,7,8],ymm13[9],ymm9[10,11,12],ymm13[13],ymm9[14,15]
+; AVX2-NEXT: vmovdqa 16(%r10), %xmm15
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0],ymm1[1],ymm13[2],ymm1[3],ymm13[4],ymm1[5],ymm13[6],ymm1[7]
+; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm13 = xmm0[0,1,2,3,4,4,6,5]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm13, %ymm0
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm13 = xmm2[0,1,2,3,4,5,5,7]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,7,7]
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm13, %ymm2
+; AVX2-NEXT: vmovdqa 16(%rax), %xmm1
+; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm0[5],ymm2[6],ymm0[7],ymm2[8,9,10,11,12],ymm0[13],ymm2[14],ymm0[15]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,4,6,5]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,6,6,7]
-; AVX2-NEXT: vinserti128 $1, %xmm14, %ymm11, %ymm11
-; AVX2-NEXT: vmovdqa 16(%r9), %xmm14
-; AVX2-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[2,1,3,3,6,5,7,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[2,3,2,3]
-; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm13 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero,xmm13[2],zero,zero,zero,xmm13[3],zero,zero,zero
-; AVX2-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0],ymm11[1],ymm13[2,3,4],ymm11[5],ymm13[6,7,8],ymm11[9],ymm13[10,11,12],ymm11[13],ymm13[14,15]
-; AVX2-NEXT: vmovdqa 16(%r8), %xmm15
+; AVX2-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm0
+; AVX2-NEXT: vmovaps 16(%r9), %xmm9
+; AVX2-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,3,2,3]
+; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero,xmm12[2],zero,zero,zero,xmm12[3],zero,zero,zero
+; AVX2-NEXT: vpblendw {{.*#+}} ymm12 = ymm12[0],ymm0[1],ymm12[2,3,4],ymm0[5],ymm12[6,7,8],ymm0[9],ymm12[10,11,12],ymm0[13],ymm12[14,15]
+; AVX2-NEXT: vmovdqa 16(%r8), %xmm14
; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm2[1],ymm11[2],ymm2[3],ymm11[4],ymm2[5],ymm11[6],ymm2[7]
-; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
-; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,4,6,5]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm11 = xmm2[0,1,2,3,4,6,6,7]
-; AVX2-NEXT: vinserti128 $1, %xmm11, %ymm4, %ymm11
-; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm9[0,1,2,3,4,5,5,7]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 = xmm9[0,1,2,3,6,5,7,7]
-; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5],ymm0[6],ymm11[7],ymm0[8,9,10,11,12],ymm11[13],ymm0[14],ymm11[15]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15]
-; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm2[0,1,2,3,4,4,6,5]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm2[0,1,2,3,4,6,6,7]
-; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm8, %ymm8
-; AVX2-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[2,1,3,3,6,5,7,7]
-; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15]
-; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm2[1],ymm12[2],ymm2[3],ymm12[4],ymm2[5],ymm12[6],ymm2[7]
+; AVX2-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
+; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
+; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,4,6,5]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm12 = xmm0[0,1,2,3,4,6,6,7]
+; AVX2-NEXT: vinserti128 $1, %xmm12, %ymm4, %ymm12
+; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 = xmm13[0,1,2,3,4,5,5,7]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm13[0,1,2,3,6,5,7,7]
+; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm6, %ymm6
+; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm12[5],ymm6[6],ymm12[7],ymm6[8,9,10,11,12],ymm12[13],ymm6[14],ymm12[15]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm8 = ymm6[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15]
+; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm0[0,1,2,3,4,4,6,5]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm11 = xmm0[0,1,2,3,4,6,6,7]
+; AVX2-NEXT: vinserti128 $1, %xmm11, %ymm10, %ymm10
+; AVX2-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[2,1,3,3,6,5,7,7]
+; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15]
+; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,3,2,3]
; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
-; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm8[1],ymm5[2,3,4],ymm8[5],ymm5[6,7,8],ymm8[9],ymm5[10,11,12],ymm8[13],ymm5[14,15]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2],ymm0[3],ymm5[4],ymm0[5],ymm5[6],ymm0[7]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm10[1],ymm5[2,3,4],ymm10[5],ymm5[6,7,8],ymm10[9],ymm5[10,11,12],ymm10[13],ymm5[14,15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm8[1],ymm5[2],ymm8[3],ymm5[4],ymm8[5],ymm5[6],ymm8[7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7]
+; AVX2-NEXT: vmovdqa %xmm1, %xmm9
+; AVX2-NEXT: vmovdqa %xmm15, %xmm7
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm10[0,1,2,3,4,4,6,5]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm11 = xmm10[0,1,2,3,4,6,6,7]
+; AVX2-NEXT: vinserti128 $1, %xmm11, %ymm8, %ymm8
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm12[0],xmm3[0],xmm12[1],xmm3[1],xmm12[2],xmm3[2],xmm12[3],xmm3[3],xmm12[4],xmm3[4],xmm12[5],xmm3[5],xmm12[6],xmm3[6],xmm12[7],xmm3[7]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,4,6,5]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm10[0,1,2,3,4,6,6,7]
-; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm0
-; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm11[0,1,2,3,4,5,5,7]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm13 = xmm11[0,1,2,3,6,5,7,7]
-; AVX2-NEXT: vinserti128 $1, %xmm13, %ymm8, %ymm8
-; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm0[5],ymm8[6],ymm0[7],ymm8[8,9,10,11,12],ymm0[13],ymm8[14],ymm0[15]
-; AVX2-NEXT: vmovdqa 16(%rcx), %xmm7
+; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm14[0],xmm3[0],xmm14[1],xmm3[1],xmm14[2],xmm3[2],xmm14[3],xmm3[3],xmm14[4],xmm3[4],xmm14[5],xmm3[5],xmm14[6],xmm3[6],xmm14[7],xmm3[7]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm12 = xmm11[0,1,2,3,4,5,5,7]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm15 = xmm11[0,1,2,3,6,5,7,7]
+; AVX2-NEXT: vinserti128 $1, %xmm15, %ymm12, %ymm12
+; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm12[0,1,2,3,4],ymm8[5],ymm12[6],ymm8[7],ymm12[8,9,10,11,12],ymm8[13],ymm12[14],ymm8[15]
+; AVX2-NEXT: vmovdqa 16(%rcx), %xmm6
; AVX2-NEXT: vmovdqa 16(%rdx), %xmm4
-; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm13[0,1,2,3,4,4,6,5]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm13[0,1,2,3,4,6,6,7]
-; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2
-; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[2,1,3,3,6,5,7,7]
+; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm15 = xmm12[0,1,2,3,4,4,6,5]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,4,6,6,7]
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm15, %ymm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7]
; AVX2-NEXT: vmovdqa 16(%rsi), %xmm1
; AVX2-NEXT: vmovdqa 16(%rdi), %xmm0
-; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[2,3,2,3]
-; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
-; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7,8],ymm5[9],ymm6[10,11,12],ymm5[13],ymm6[14,15]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm8[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7]
-; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm12[8],xmm3[8],xmm12[9],xmm3[9],xmm12[10],xmm3[10],xmm12[11],xmm3[11],xmm12[12],xmm3[12],xmm12[13],xmm3[13],xmm12[14],xmm3[14],xmm12[15],xmm3[15]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,4,4,6,5]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm12 = xmm5[0,1,2,3,4,6,6,7]
-; AVX2-NEXT: vinserti128 $1, %xmm12, %ymm6, %ymm6
-; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm14 = xmm12[0,1,2,3,4,5,5,7]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm15 = xmm12[0,1,2,3,6,5,7,7]
-; AVX2-NEXT: vinserti128 $1, %xmm15, %ymm14, %ymm14
-; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm14[0,1,2,3,4],ymm6[5],ymm14[6],ymm6[7],ymm14[8,9,10,11,12],ymm6[13],ymm14[14],ymm6[15]
-; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15]
+; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm15[2,3,2,3]
+; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
+; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[2,3,4],ymm2[5],ymm5[6,7,8],ymm2[9],ymm5[10,11,12],ymm2[13],ymm5[14,15]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm8[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2],ymm5[3],ymm2[4],ymm5[5],ymm2[6],ymm5[7]
+; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm9[8],xmm7[8],xmm9[9],xmm7[9],xmm9[10],xmm7[10],xmm9[11],xmm7[11],xmm9[12],xmm7[12],xmm9[13],xmm7[13],xmm9[14],xmm7[14],xmm9[15],xmm7[15]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,4,6,5]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm2[0,1,2,3,4,6,6,7]
+; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm5, %ymm5
+; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm14[8],xmm3[8],xmm14[9],xmm3[9],xmm14[10],xmm3[10],xmm14[11],xmm3[11],xmm14[12],xmm3[12],xmm14[13],xmm3[13],xmm14[14],xmm3[14],xmm14[15],xmm3[15]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm14 = xmm8[0,1,2,3,4,5,5,7]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm7 = xmm8[0,1,2,3,6,5,7,7]
+; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm14, %ymm7
+; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5],ymm7[6],ymm5[7],ymm7[8,9,10,11,12],ymm5[13],ymm7[14],ymm5[15]
+; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm14 = xmm3[0,1,2,3,4,6,6,7]
-; AVX2-NEXT: vinserti128 $1, %xmm14, %ymm4, %ymm4
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm7 = xmm3[0,1,2,3,4,6,6,7]
+; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm4, %ymm4
; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm4[2,1,3,3,6,5,7,7]
; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2,3,4],ymm0[5],ymm4[6,7,8],ymm0[9],ymm4[10,11,12],ymm0[13],ymm4[14,15]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2],ymm4[3],ymm0[4],ymm4[5],ymm0[6],ymm4[7]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[0,0,2,1,4,5,6,7]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,0,2,1,4,5,6,7]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm8[0,1,1,3,4,5,6,7]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm8[2,1,3,3,4,5,6,7]
; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm12[0,1,1,3,4,5,6,7]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm12[2,1,3,3,4,5,6,7]
-; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
-; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5,6,7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13,14,15]
-; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,0,1,4,5,2,3,0,1,4,5,4,5,6,7]
-; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3
+; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4,5,6,7,8],ymm2[9],ymm4[10],ymm2[11],ymm4[12,13,14,15]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,2,2,3,4,5,6,7]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,2,1,4,5,6,7]
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
+; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4],ymm3[5],ymm1[6,7,8],ymm3[9],ymm1[10,11,12],ymm3[13],ymm1[14,15]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[0,0,2,1,4,4,6,5]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4],ymm3[5],ymm1[6],ymm3[7]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[0,0,2,1,4,5,6,7]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[0,0,2,1,4,5,6,7]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[0,2,2,3,4,5,6,7]
-; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,1,1,3,4,5,6,7]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[2,1,3,3,4,5,6,7]
; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
-; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4,5,6,7,8],ymm1[9],ymm3[10],ymm1[11],ymm3[12,13,14,15]
-; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
-; AVX2-NEXT: vpshufb %xmm5, %xmm13, %xmm3
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
-; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7,8],ymm3[9],ymm2[10,11,12],ymm3[13],ymm2[14,15]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
-; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,0,2,1,4,5,6,7]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
-; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[0,1,1,3,4,5,6,7]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[2,1,3,3,4,5,6,7]
-; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5,6,7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13,14,15]
-; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3
-; AVX2-NEXT: vpmovzxwq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 16-byte Folded Reload
-; AVX2-NEXT: # ymm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[0,2,2,3,4,5,6,7]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[0,0,2,1,4,5,6,7]
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
+; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm4 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero,xmm15[2],zero,zero,zero,xmm15[3],zero,zero,zero
; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7,8],ymm3[9],ymm4[10,11,12],ymm3[13],ymm4[14,15]
; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5]
; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7]
+; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[0,0,2,1,4,5,6,7]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[0,1,1,3,4,5,6,7]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm13[2,1,3,3,4,5,6,7]
+; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4
+; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4,5,6,7,8],ymm3[9],ymm4[10],ymm3[11],ymm4[12,13,14,15]
+; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,0,1,4,5,2,3,0,1,4,5,4,5,6,7]
+; AVX2-NEXT: vpmovzxwq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload
+; AVX2-NEXT: # ymm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
+; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
+; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7,8],ymm4[9],ymm5[10,11,12],ymm4[13],ymm5[14,15]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7]
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: vmovdqa %ymm2, 64(%rax)
-; AVX2-NEXT: vmovdqa %ymm1, 128(%rax)
-; AVX2-NEXT: vmovdqa %ymm12, 192(%rax)
+; AVX2-NEXT: vmovdqa %ymm3, 64(%rax)
+; AVX2-NEXT: vmovdqa %ymm2, 128(%rax)
+; AVX2-NEXT: vmovdqa %ymm1, 192(%rax)
; AVX2-NEXT: vmovdqa %ymm0, 224(%rax)
-; AVX2-NEXT: vmovdqa %ymm8, 160(%rax)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, 96(%rax)
+; AVX2-NEXT: vmovaps %ymm0, 160(%rax)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, 32(%rax)
+; AVX2-NEXT: vmovaps %ymm0, 96(%rax)
; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX2-NEXT: vmovaps %ymm0, 32(%rax)
+; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, (%rax)
-; AVX2-NEXT: addq $40, %rsp
+; AVX2-NEXT: addq $72, %rsp
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -3434,7 +3441,7 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-NEXT: vmovdqa (%rax), %xmm5
; AVX512-NEXT: vmovdqa 16(%rax), %xmm12
; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15]
-; AVX512-NEXT: vmovdqa64 %xmm1, %xmm22
+; AVX512-NEXT: vmovdqa64 %xmm1, %xmm24
; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5]
; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7]
; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
@@ -3462,8 +3469,8 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-NEXT: vmovdqa (%rsi), %xmm1
; AVX512-NEXT: vmovdqa (%rdi), %xmm2
; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; AVX512-NEXT: vmovdqa64 %xmm2, %xmm20
-; AVX512-NEXT: vmovdqa64 %xmm1, %xmm21
+; AVX512-NEXT: vmovdqa64 %xmm2, %xmm22
+; AVX512-NEXT: vmovdqa64 %xmm1, %xmm23
; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX512-NEXT: vmovdqa (%rcx), %xmm8
; AVX512-NEXT: vmovdqa (%rdx), %xmm9
@@ -3506,18 +3513,20 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5]
; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,6,6,7]
; AVX512-NEXT: vinserti32x4 $1, %xmm3, %ymm1, %ymm19
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,2,2,3,4,5,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm21 = xmm0[0],xmm3[0]
; AVX512-NEXT: vmovdqa 16(%rsi), %xmm3
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm13
; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm13[0],xmm3[0],xmm13[1],xmm3[1],xmm13[2],xmm3[2],xmm13[3],xmm3[3],xmm13[4],xmm3[4],xmm13[5],xmm3[5],xmm13[6],xmm3[6],xmm13[7],xmm3[7]
-; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm4 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; AVX512-NEXT: vpshufd {{.*#+}} xmm20 = xmm1[2,3,2,3]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm19[2,1,3,3,6,5,7,7]
+; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm4 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero,xmm20[2],zero,zero,zero,xmm20[3],zero,zero,zero
; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2,3,4],ymm0[5],ymm4[6,7,8],ymm0[9],ymm4[10,11,12],ymm0[13],ymm4[14,15]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm19[2,1,3,3,6,5,7,7]
; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm21[0],zero,xmm21[1],zero,xmm21[2],zero,xmm21[3],zero
; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3,4],ymm4[5],ymm1[6,7,8],ymm4[9],ymm1[10,11,12],ymm4[13],ymm1[14,15]
-; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm19
+; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm19
; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm12[8],xmm11[8],xmm12[9],xmm11[9],xmm12[10],xmm11[10],xmm12[11],xmm11[11],xmm12[12],xmm11[12],xmm12[13],xmm11[13],xmm12[14],xmm11[14],xmm12[15],xmm11[15]
; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5]
; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm0[0,1,2,3,4,6,6,7]
@@ -3543,17 +3552,19 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5]
; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7]
; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7]
; AVX512-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm13[8],xmm3[8],xmm13[9],xmm3[9],xmm13[10],xmm3[10],xmm13[11],xmm3[11],xmm13[12],xmm3[12],xmm13[13],xmm3[13],xmm13[14],xmm3[14],xmm13[15],xmm3[15]
-; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
-; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
+; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3,4],ymm1[5],ymm3[6,7,8],ymm1[9],ymm3[10,11,12],ymm1[13],ymm3[14,15]
; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3,4],ymm0[5],ymm3[6,7,8],ymm0[9],ymm3[10,11,12],ymm0[13],ymm3[14,15]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
-; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15]
+; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7,8],ymm0[9],ymm2[10,11,12],ymm0[13],ymm2[14,15]
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-NEXT: vmovdqa64 %xmm22, %xmm1
+; AVX512-NEXT: vmovdqa64 %xmm24, %xmm1
; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,6,5]
; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,6,6,7]
@@ -3579,8 +3590,8 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,4,6,5]
; AVX512-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,6,6,7]
; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
-; AVX512-NEXT: vmovdqa64 %xmm20, %xmm4
-; AVX512-NEXT: vmovdqa64 %xmm21, %xmm5
+; AVX512-NEXT: vmovdqa64 %xmm22, %xmm4
+; AVX512-NEXT: vmovdqa64 %xmm23, %xmm5
; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
@@ -3778,8 +3789,8 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-NEXT: vmovdqa (%rax), %xmm2
; AVX512DQ-NEXT: vmovdqa 16(%rax), %xmm10
; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
-; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm22
-; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm23
+; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm23
+; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm24
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,4,6,5]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,6,6,7]
; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0
@@ -3811,8 +3822,8 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm14 = xmm12[0,1,2,3,4,6,6,7]
; AVX512DQ-NEXT: vinserti128 $1, %xmm14, %ymm13, %ymm13
; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm24
-; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm25
+; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm25
+; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm26
; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm15 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero,xmm14[2],zero,zero,zero,xmm14[3],zero,zero,zero
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0,0,1,1,2,2,3,3]
; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} ymm12 = xmm12[0],zero,xmm12[1],zero,xmm12[2],zero,xmm12[3],zero
@@ -3842,21 +3853,23 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[0,1,1,3,4,5,6,7]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,1,3,3,4,5,6,7]
; AVX512DQ-NEXT: vinserti32x4 $1, %xmm5, %ymm0, %ymm20
-; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3],xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,4,4,6,5]
-; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm12 = xmm5[0,1,2,3,4,6,6,7]
-; AVX512DQ-NEXT: vinserti32x4 $1, %xmm12, %ymm0, %ymm21
+; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3],xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,4,6,5]
+; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm12 = xmm0[0,1,2,3,4,6,6,7]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm12, %ymm5, %ymm12
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm0[0,2,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
+; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm21 = xmm0[0],xmm5[0]
; AVX512DQ-NEXT: vmovdqa 16(%rsi), %xmm1
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3]
-; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm5
-; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
-; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero,xmm12[2],zero,zero,zero,xmm12[3],zero,zero,zero
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7,8],ymm0[9],ymm2[10,11,12],ymm0[13],ymm2[14,15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[2,3,2,3]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm21[2,1,3,3,6,5,7,7]
-; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm22 = xmm0[2,3,2,3]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,1,3,3,6,5,7,7]
+; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm22[0],zero,zero,zero,xmm22[1],zero,zero,zero,xmm22[2],zero,zero,zero,xmm22[3],zero,zero,zero
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm12[1],ymm2[2,3,4],ymm12[5],ymm2[6,7,8],ymm12[9],ymm2[10,11,12],ymm12[13],ymm2[14,15]
+; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} ymm12 = xmm21[0],zero,xmm21[1],zero,xmm21[2],zero,xmm21[3],zero
+; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7,8],ymm12[9],ymm0[10,11,12],ymm12[13],ymm0[14,15]
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm12
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm17[0,2,2,3,4,6,6,7]
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm18[0,2,2,3,4,6,6,7]
@@ -3883,25 +3896,27 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm11 = xmm10[0,1,2,3,4,4,6,5]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm13 = xmm10[0,1,2,3,4,6,6,7]
; AVX512DQ-NEXT: vinserti128 $1, %xmm13, %ymm11, %ymm11
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm13 = xmm10[0,2,2,3,4,5,6,7]
+; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,0,2,1,4,5,6,7]
+; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm10 = xmm10[0],xmm13[0]
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7]
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3],ymm9[4,5,6],ymm2[7],ymm9[8,9,10],ymm2[11],ymm9[12,13,14],ymm2[15]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm11[2,1,3,3,6,5,7,7]
; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15]
-; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm5 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0,0,1,1,2,2,3,3]
-; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} ymm9 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
+; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0],ymm9[1],ymm5[2,3,4],ymm9[5],ymm5[6,7,8],ymm9[9],ymm5[10,11,12],ymm9[13],ymm5[14,15]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm11[2,1,3,3,6,5,7,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX512DQ-NEXT: vpmovzxdq {{.*#+}} ymm9 = xmm10[0],zero,xmm10[1],zero,xmm10[2],zero,xmm10[3],zero
; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3,4],ymm9[5],ymm1[6,7,8],ymm9[9],ymm1[10,11,12],ymm9[13],ymm1[14,15]
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm5, %zmm1
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5]
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5]
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6],ymm0[7],ymm3[8,9,10],ymm0[11],ymm3[12,13,14],ymm0[15]
; AVX512DQ-NEXT: vinserti32x8 $1, %ymm2, %zmm0, %zmm1 {%k1}
-; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm0
-; AVX512DQ-NEXT: vmovdqa64 %xmm23, %xmm2
+; AVX512DQ-NEXT: vmovdqa64 %xmm23, %xmm0
+; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm2
; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,6,6,7]
@@ -3923,8 +3938,8 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7]
; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7],ymm4[8,9,10],ymm2[11],ymm4[12,13,14],ymm2[15]
-; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm4
-; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm7
+; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm4
+; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm7
; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7]
; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} ymm7 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3]
@@ -5679,48 +5694,48 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
;
; AVX2-LABEL: store_i8_stride8_vf64:
; AVX2: # %bb.0:
-; AVX2-NEXT: subq $328, %rsp # imm = 0x148
+; AVX2-NEXT: subq $296, %rsp # imm = 0x128
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX2-NEXT: vmovdqa (%r10), %xmm0
-; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vmovdqa 48(%r10), %xmm11
-; AVX2-NEXT: vmovdqa (%rax), %xmm1
+; AVX2-NEXT: vmovdqa (%r10), %xmm1
; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX2-NEXT: vmovdqa 48(%r10), %xmm0
+; AVX2-NEXT: vmovdqa (%rax), %xmm2
+; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,4,6,5]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,6,6,7]
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
-; AVX2-NEXT: vmovdqa (%r9), %xmm0
-; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT: vmovdqa (%r9), %xmm4
+; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovdqa (%r8), %xmm3
; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5,5,7]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,6,5,7,7]
; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm4, %ymm4
; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0,1,2,3,4],ymm1[5],ymm4[6],ymm1[7],ymm4[8,9,10,11,12],ymm1[13],ymm4[14],ymm1[15]
-; AVX2-NEXT: vmovdqa (%rcx), %xmm0
-; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vmovdqa (%rdx), %xmm1
+; AVX2-NEXT: vmovdqa (%rcx), %xmm1
; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX2-NEXT: vmovdqa (%rdx), %xmm4
+; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,4,4,6,5]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,4,6,6,7]
; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm1, %ymm1
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7]
-; AVX2-NEXT: vmovdqa (%rsi), %xmm0
-; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT: vmovdqa (%rsi), %xmm7
+; AVX2-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovdqa (%rdi), %xmm6
; AVX2-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
+; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[2,3,2,3]
; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm1[1],ymm7[2,3,4],ymm1[5],ymm7[6,7,8],ymm1[9],ymm7[10,11,12],ymm1[13],ymm7[14,15]
; AVX2-NEXT: vmovdqa 48(%rax), %xmm1
; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0],ymm5[1],ymm7[2],ymm5[3],ymm7[4],ymm5[5],ymm7[6],ymm5[7]
-; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0],ymm5[1],ymm7[2],ymm5[3],ymm7[4],ymm5[5],ymm7[6],ymm5[7]
+; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[0,0,2,1,4,5,6,7]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm5, %ymm2
@@ -5730,35 +5745,37 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5,6,7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13,14,15]
; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5]
; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,0,1,4,5,2,3,0,1,4,5,4,5,6,7]
-; AVX2-NEXT: vpshufb %xmm14, %xmm4, %xmm4
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,0,1,4,5,2,3,0,1,4,5,4,5,6,7]
+; AVX2-NEXT: vpshufb %xmm13, %xmm4, %xmm4
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7,8],ymm4[9],ymm3[10,11,12],ymm4[13],ymm3[14,15]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7]
-; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm1[8],xmm11[8],xmm1[9],xmm11[9],xmm1[10],xmm11[10],xmm1[11],xmm11[11],xmm1[12],xmm11[12],xmm1[13],xmm11[13],xmm1[14],xmm11[14],xmm1[15],xmm11[15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7]
+; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm8[0,0,2,1,4,5,6,7]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm8[0,2,2,3,4,5,6,7]
-; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-NEXT: vmovdqa 48(%r9), %xmm3
-; AVX2-NEXT: vmovdqa 48(%r8), %xmm5
-; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[0,1,1,3,4,5,6,7]
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm4
+; AVX2-NEXT: vmovdqa 48(%r9), %xmm2
+; AVX2-NEXT: vmovdqa 48(%r8), %xmm3
+; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[0,1,1,3,4,5,6,7]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm9[2,1,3,3,4,5,6,7]
-; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm4, %ymm4
-; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4,5,6,7,8],ymm2[9],ymm4[10],ymm2[11],ymm4[12,13,14,15]
-; AVX2-NEXT: vmovdqa 48(%rcx), %xmm2
-; AVX2-NEXT: vmovdqa 48(%rdx), %xmm4
-; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
-; AVX2-NEXT: vpshufb %xmm14, %xmm12, %xmm6
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm13 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
+; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm5, %ymm5
+; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5,6,7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13,14,15]
+; AVX2-NEXT: vmovdqa 48(%rcx), %xmm4
+; AVX2-NEXT: vmovdqa 48(%rdx), %xmm5
+; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm11[0,2,2,3,4,5,6,7]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm11[0,0,2,1,4,5,6,7]
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm7[0],xmm6[0]
+; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm12 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
; AVX2-NEXT: vmovdqa 48(%rsi), %xmm6
; AVX2-NEXT: vmovdqa 48(%rdi), %xmm7
-; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15]
-; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm15 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX2-NEXT: vpblendw {{.*#+}} ymm13 = ymm15[0],ymm13[1],ymm15[2,3,4],ymm13[5],ymm15[6,7,8],ymm13[9],ymm15[10,11,12],ymm13[13],ymm15[14,15]
+; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15]
+; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm14 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero,xmm15[2],zero,zero,zero,xmm15[3],zero,zero,zero
+; AVX2-NEXT: vpblendw {{.*#+}} ymm12 = ymm14[0],ymm12[1],ymm14[2,3,4],ymm12[5],ymm14[6,7,8],ymm12[9],ymm14[10,11,12],ymm12[13],ymm14[14,15]
; AVX2-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,0,2,1,4,4,6,5]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0],ymm10[1],ymm13[2],ymm10[3],ymm13[4],ymm10[5],ymm13[6],ymm10[7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7]
; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm8[0,1,2,3,4,4,6,5]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7]
@@ -5767,28 +5784,30 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,7,7]
; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm10, %ymm9
; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6],ymm8[7],ymm9[8,9,10,11,12],ymm8[13],ymm9[14],ymm8[15]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm9 = xmm12[0,1,2,3,4,4,6,5]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm12[0,1,2,3,4,6,6,7]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm9 = xmm11[0,1,2,3,4,4,6,5]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm11[0,1,2,3,4,6,6,7]
; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm9, %ymm9
; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,1,3,3,6,5,7,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3,4],ymm9[5],ymm0[6,7,8],ymm9[9],ymm0[10,11,12],ymm9[13],ymm0[14,15]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm15[2,3,2,3]
+; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero,xmm10[2],zero,zero,zero,xmm10[3],zero,zero,zero
+; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3,4],ymm9[5],ymm10[6,7,8],ymm9[9],ymm10[10,11,12],ymm9[13],ymm10[14,15]
; AVX2-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2],ymm8[3],ymm0[4],ymm8[5],ymm0[6],ymm8[7]
-; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3],xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4],ymm8[5],ymm9[6],ymm8[7]
+; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[0,2,2,3,4,5,6,7]
; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm1, %ymm1
-; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,1,1,3,4,5,6,7]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm3[2,1,3,3,4,5,6,7]
-; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm5, %ymm5
-; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[2],ymm1[3],ymm5[4,5,6,7,8],ymm1[9],ymm5[10],ymm1[11],ymm5[12,13,14,15]
+; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,1,1,3,4,5,6,7]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm2[2,1,3,3,4,5,6,7]
+; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm3, %ymm3
+; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4,5,6,7,8],ymm1[9],ymm3[10],ymm1[11],ymm3[12,13,14,15]
+; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,2,2,3,4,5,6,7]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,0,2,1,4,5,6,7]
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm5[0],xmm4[0]
; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
-; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
-; AVX2-NEXT: vpshufb %xmm14, %xmm2, %xmm4
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm6 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7,8],ymm4[9],ymm6[10,11,12],ymm4[13],ymm6[14,15]
@@ -5798,12 +5817,12 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5,5,7]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,7,7]
-; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,5,7]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,7,7]
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6],ymm0[7],ymm1[8,9,10,11,12],ymm0[13],ymm1[14],ymm0[15]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,4,6,5]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,4,6,5]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,6,6,7]
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7]
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[2,3,2,3]
@@ -5828,12 +5847,12 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-NEXT: vmovdqa 32(%rcx), %xmm2
; AVX2-NEXT: vmovdqa 32(%rdx), %xmm4
; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
-; AVX2-NEXT: vpshufb %xmm14, %xmm11, %xmm6
+; AVX2-NEXT: vpshufb %xmm13, %xmm11, %xmm6
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm12 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
; AVX2-NEXT: vmovdqa 32(%rsi), %xmm6
; AVX2-NEXT: vmovdqa 32(%rdi), %xmm7
-; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15]
-; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm15 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero,xmm13[2],zero,zero,zero,xmm13[3],zero,zero,zero
+; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15]
+; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm15 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero,xmm14[2],zero,zero,zero,xmm14[3],zero,zero,zero
; AVX2-NEXT: vpblendw {{.*#+}} ymm12 = ymm15[0],ymm12[1],ymm15[2,3,4],ymm12[5],ymm15[6,7,8],ymm12[9],ymm15[10,11,12],ymm12[13],ymm15[14,15]
; AVX2-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[0,0,2,1,4,4,6,5]
; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0],ymm10[1],ymm12[2],ymm10[3],ymm12[4],ymm10[5],ymm12[6],ymm10[7]
@@ -5849,7 +5868,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-NEXT: vpshufhw {{.*#+}} xmm10 = xmm11[0,1,2,3,4,6,6,7]
; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm9, %ymm9
; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,1,3,3,6,5,7,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[2,3,2,3]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm14[2,3,2,3]
; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero,xmm10[2],zero,zero,zero,xmm10[3],zero,zero,zero
; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2,3,4],ymm9[5],ymm10[6,7,8],ymm9[9],ymm10[10,11,12],ymm9[13],ymm10[14,15]
; AVX2-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7]
@@ -5866,13 +5885,13 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[2],ymm1[3],ymm5[4,5,6,7,8],ymm1[9],ymm5[10],ymm1[11],ymm5[12,13,14,15]
; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
-; AVX2-NEXT: vpshufb %xmm14, %xmm2, %xmm4
+; AVX2-NEXT: vpshufb %xmm13, %xmm2, %xmm4
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm6 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7,8],ymm4[9],ymm6[10,11,12],ymm4[13],ymm6[14,15]
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5]
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2],ymm1[3],ymm4[4],ymm1[5],ymm4[6],ymm1[7]
-; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill
; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
@@ -5888,80 +5907,83 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7,8],ymm1[9],ymm2[10,11,12],ymm1[13],ymm2[14,15]
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
-; AVX2-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
+; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
; AVX2-NEXT: vmovdqa 16(%r10), %xmm12
-; AVX2-NEXT: vmovdqa 16(%rax), %xmm10
-; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm10[8],xmm12[8],xmm10[9],xmm12[9],xmm10[10],xmm12[10],xmm10[11],xmm12[11],xmm10[12],xmm12[12],xmm10[13],xmm12[13],xmm10[14],xmm12[14],xmm10[15],xmm12[15]
+; AVX2-NEXT: vmovdqa 16(%rax), %xmm11
+; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[0,0,2,1,4,5,6,7]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[0,2,2,3,4,5,6,7]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa 16(%r9), %xmm5
-; AVX2-NEXT: vmovdqa 16(%r8), %xmm4
-; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm11[0,1,1,3,4,5,6,7]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[2,1,3,3,4,5,6,7]
-; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: vmovdqa 16(%r9), %xmm10
+; AVX2-NEXT: vmovdqa 16(%r8), %xmm6
+; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm6[8],xmm10[8],xmm6[9],xmm10[9],xmm6[10],xmm10[10],xmm6[11],xmm10[11],xmm6[12],xmm10[12],xmm6[13],xmm10[13],xmm6[14],xmm10[14],xmm6[15],xmm10[15]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[0,1,1,3,4,5,6,7]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[2,1,3,3,4,5,6,7]
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5,6,7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13,14,15]
; AVX2-NEXT: vmovdqa 16(%rcx), %xmm7
-; AVX2-NEXT: vmovdqa 16(%rdx), %xmm6
-; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15]
-; AVX2-NEXT: vpshufb %xmm14, %xmm15, %xmm0
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX2-NEXT: vmovdqa 16(%rsi), %xmm3
-; AVX2-NEXT: vmovdqa 16(%rdi), %xmm2
-; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
-; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm13 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0],ymm0[1],ymm13[2,3,4],ymm0[5],ymm13[6,7,8],ymm0[9],ymm13[10,11,12],ymm0[13],ymm13[14,15]
+; AVX2-NEXT: vmovdqa 16(%rdx), %xmm5
+; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,2,1,4,5,6,7]
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX2-NEXT: vmovdqa 16(%rsi), %xmm4
+; AVX2-NEXT: vmovdqa 16(%rdi), %xmm3
+; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
+; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm15 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0],ymm1[1],ymm15[2,3,4],ymm1[5],ymm15[6,7,8],ymm1[9],ymm15[10,11,12],ymm1[13],ymm15[14,15]
; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,0,2,1,4,4,6,5]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0],ymm9[1],ymm0[2],ymm9[3],ymm0[4],ymm9[5],ymm0[6],ymm9[7]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,4,4,6,5]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0],ymm9[1],ymm1[2],ymm9[3],ymm1[4],ymm9[5],ymm1[6],ymm9[7]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,4,4,6,5]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,6,6,7]
-; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm0
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm11[0,1,2,3,4,5,5,7]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,6,5,7,7]
-; AVX2-NEXT: vinserti128 $1, %xmm11, %ymm8, %ymm8
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm0[5],ymm8[6],ymm0[7],ymm8[8,9,10,11,12],ymm0[13],ymm8[14],ymm0[15]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm15[0,1,2,3,4,4,6,5]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm11 = xmm15[0,1,2,3,4,6,6,7]
-; AVX2-NEXT: vinserti128 $1, %xmm11, %ymm8, %ymm8
-; AVX2-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[2,1,3,3,6,5,7,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3,4],ymm8[5],ymm1[6,7,8],ymm8[9],ymm1[10,11,12],ymm8[13],ymm1[14,15]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
-; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3],xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7]
+; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm1, %ymm1
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm14[0,1,2,3,4,5,5,7]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,6,5,7,7]
+; AVX2-NEXT: vinserti128 $1, %xmm14, %ymm8, %ymm8
+; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0,1,2,3,4],ymm1[5],ymm8[6],ymm1[7],ymm8[8,9,10,11,12],ymm1[13],ymm8[14],ymm1[15]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,4,6,5]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
+; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm8, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
+; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3,4],ymm0[5],ymm2[6,7,8],ymm0[9],ymm2[10,11,12],ymm0[13],ymm2[14,15]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm10 = xmm1[0,2,2,3,4,5,6,7]
-; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm0
-; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,1,1,3,4,5,6,7]
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm10 = xmm4[2,1,3,3,4,5,6,7]
-; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm5, %ymm5
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2],ymm0[3],ymm5[4,5,6,7,8],ymm0[9],ymm5[10],ymm0[11],ymm5[12,13,14,15]
-; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
-; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
-; AVX2-NEXT: vpshufb %xmm14, %xmm3, %xmm5
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero
-; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm6 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
-; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7,8],ymm5[9],ymm6[10,11,12],ymm5[13],ymm6[14,15]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,2,2,3,4,5,6,7]
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3],xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm2[0,1,1,3,4,5,6,7]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm10 = xmm2[2,1,3,3,4,5,6,7]
+; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm6, %ymm6
+; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm0[1],ymm6[2],ymm0[3],ymm6[4,5,6,7,8],ymm0[9],ymm6[10],ymm0[11],ymm6[12,13,14,15]
+; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,2,2,3,4,5,6,7]
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[0,0,2,1,4,5,6,7]
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm7[0],xmm6[0]
+; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm4 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero
+; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm6 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7,8],ymm4[9],ymm6[10,11,12],ymm4[13],ymm6[14,15]
; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2],ymm0[3],ymm5[4],ymm0[5],ymm5[6],ymm0[7]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm1[0,1,2,3,4,4,6,5]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4],ymm0[5],ymm4[6],ymm0[7]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,4,6,5]
; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm5, %ymm1
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,4,5,5,7]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,7,7]
-; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm4
-; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5],ymm4[6],ymm1[7],ymm4[8,9,10,11,12],ymm1[13],ymm4[14],ymm1[15]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5]
-; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
-; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3
-; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7]
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
-; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
-; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7,8],ymm3[9],ymm2[10,11,12],ymm3[13],ymm2[14,15]
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm4, %ymm1
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,5,5,7]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,7,7]
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
+; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6],ymm1[7],ymm2[8,9,10,11,12],ymm1[13],ymm2[14],ymm1[15]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,4,6,5]
+; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,6,6,7]
+; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
+; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7,8],ymm2[9],ymm3[10,11,12],ymm2[13],ymm3[14,15]
; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
@@ -5983,7 +6005,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX2-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
; AVX2-NEXT: # xmm6 = xmm6[8],mem[8],xmm6[9],mem[9],xmm6[10],mem[10],xmm6[11],mem[11],xmm6[12],mem[12],xmm6[13],mem[13],xmm6[14],mem[14],xmm6[15],mem[15]
-; AVX2-NEXT: vpshufb %xmm14, %xmm6, %xmm7
+; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[0,1,0,1,4,5,2,3,0,1,4,5,4,5,6,7]
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero
; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm10 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero
; AVX2-NEXT: vpblendw {{.*#+}} ymm7 = ymm10[0],ymm7[1],ymm10[2,3,4],ymm7[5],ymm10[6,7,8],ymm7[9],ymm10[10,11,12],ymm7[13],ymm10[14,15]
@@ -6012,9 +6034,8 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-NEXT: vmovdqa %ymm0, 128(%rax)
; AVX2-NEXT: vmovdqa %ymm8, 224(%rax)
; AVX2-NEXT: vmovdqa %ymm9, 192(%rax)
+; AVX2-NEXT: vmovdqa %ymm13, 288(%rax)
; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
-; AVX2-NEXT: vmovaps %ymm0, 288(%rax)
-; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 256(%rax)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 352(%rax)
@@ -6032,7 +6053,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-NEXT: vmovaps %ymm0, (%rax)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 32(%rax)
-; AVX2-NEXT: addq $328, %rsp # imm = 0x148
+; AVX2-NEXT: addq $296, %rsp # imm = 0x128
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-llrint.ll b/llvm/test/CodeGen/X86/vector-llrint.ll
index 7017eb60df41d..49b139fb6a9f1 100644
--- a/llvm/test/CodeGen/X86/vector-llrint.ll
+++ b/llvm/test/CodeGen/X86/vector-llrint.ll
@@ -543,11 +543,11 @@ define <4 x i64> @llrint_v4i64_v4f64(<4 x double> %x) {
; AVX512: # %bb.0:
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT: vcvtsd2si %xmm1, %rax
-; AVX512-NEXT: vmovq %rax, %xmm2
-; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX512-NEXT: vcvtsd2si %xmm1, %rax
; AVX512-NEXT: vmovq %rax, %xmm1
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[3,2,2,3]
+; AVX512-NEXT: vcvtsd2si %xmm2, %rax
+; AVX512-NEXT: vmovq %rax, %xmm2
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX512-NEXT: vcvtsd2si %xmm0, %rax
; AVX512-NEXT: vmovq %rax, %xmm2
; AVX512-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
@@ -648,12 +648,12 @@ define <8 x i64> @llrint_v8i64_v8f64(<8 x double> %x) {
; AVX512-NEXT: vmovq %rax, %xmm2
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX512-NEXT: vcvtsd2si %xmm2, %rax
-; AVX512-NEXT: vmovq %rax, %xmm3
-; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0]
+; AVX512-NEXT: vpermpd {{.*#+}} zmm2 = zmm0[3,2,2,3,7,6,6,7]
; AVX512-NEXT: vcvtsd2si %xmm2, %rax
; AVX512-NEXT: vmovq %rax, %xmm2
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX512-NEXT: vcvtsd2si %xmm3, %rax
+; AVX512-NEXT: vmovq %rax, %xmm3
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX512-NEXT: vcvtsd2si %xmm0, %rax
; AVX512-NEXT: vmovq %rax, %xmm3
diff --git a/llvm/test/CodeGen/X86/vector-lrint.ll b/llvm/test/CodeGen/X86/vector-lrint.ll
index b1c8d46f497f3..cef83c78ce7f8 100644
--- a/llvm/test/CodeGen/X86/vector-lrint.ll
+++ b/llvm/test/CodeGen/X86/vector-lrint.ll
@@ -353,11 +353,11 @@ define <4 x iXLen> @lrint_v4f64(<4 x double> %x) {
; AVX512-i64: # %bb.0:
; AVX512-i64-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX512-i64-NEXT: vcvtsd2si %xmm1, %rax
-; AVX512-i64-NEXT: vmovq %rax, %xmm2
-; AVX512-i64-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX512-i64-NEXT: vcvtsd2si %xmm1, %rax
; AVX512-i64-NEXT: vmovq %rax, %xmm1
-; AVX512-i64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX512-i64-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[3,2,2,3]
+; AVX512-i64-NEXT: vcvtsd2si %xmm2, %rax
+; AVX512-i64-NEXT: vmovq %rax, %xmm2
+; AVX512-i64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX512-i64-NEXT: vcvtsd2si %xmm0, %rax
; AVX512-i64-NEXT: vmovq %rax, %xmm2
; AVX512-i64-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
@@ -465,12 +465,12 @@ define <8 x iXLen> @lrint_v8f64(<8 x double> %x) {
; AVX512-i64-NEXT: vmovq %rax, %xmm2
; AVX512-i64-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX512-i64-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512-i64-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX512-i64-NEXT: vcvtsd2si %xmm2, %rax
-; AVX512-i64-NEXT: vmovq %rax, %xmm3
-; AVX512-i64-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0]
+; AVX512-i64-NEXT: vpermpd {{.*#+}} zmm2 = zmm0[3,2,2,3,7,6,6,7]
; AVX512-i64-NEXT: vcvtsd2si %xmm2, %rax
; AVX512-i64-NEXT: vmovq %rax, %xmm2
+; AVX512-i64-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX512-i64-NEXT: vcvtsd2si %xmm3, %rax
+; AVX512-i64-NEXT: vmovq %rax, %xmm3
; AVX512-i64-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX512-i64-NEXT: vcvtsd2si %xmm0, %rax
; AVX512-i64-NEXT: vmovq %rax, %xmm3
diff --git a/llvm/test/CodeGen/X86/vector-reduce-fadd.ll b/llvm/test/CodeGen/X86/vector-reduce-fadd.ll
index 606beeaff750e..fd1a6ad7c4abc 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-fadd.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-fadd.ll
@@ -1122,26 +1122,50 @@ define double @test_v4f64(double %a0, <4 x double> %a1) {
; SSE-NEXT: addsd %xmm2, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: test_v4f64:
-; AVX: # %bb.0:
-; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX-NEXT: vaddsd %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; AVX1-SLOW-LABEL: test_v4f64:
+; AVX1-SLOW: # %bb.0:
+; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX1-SLOW-NEXT: vaddsd %xmm2, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vzeroupper
+; AVX1-SLOW-NEXT: retq
+;
+; AVX1-FAST-LABEL: test_v4f64:
+; AVX1-FAST: # %bb.0:
+; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX1-FAST-NEXT: vaddsd %xmm2, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vzeroupper
+; AVX1-FAST-NEXT: retq
+;
+; AVX2-LABEL: test_v4f64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX2-NEXT: vaddsd %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vaddsd %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,2,2,3]
+; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v4f64:
; AVX512: # %bb.0:
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,2,2,3]
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -1166,24 +1190,62 @@ define double @test_v8f64(double %a0, <8 x double> %a1) {
; SSE-NEXT: addsd %xmm4, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: test_v8f64:
-; AVX: # %bb.0:
-; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0]
-; AVX-NEXT: vaddsd %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vaddsd %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm2[1,0]
-; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1
-; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; AVX1-SLOW-LABEL: test_v8f64:
+; AVX1-SLOW: # %bb.0:
+; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0]
+; AVX1-SLOW-NEXT: vaddsd %xmm3, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vaddsd %xmm2, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm2[1,0]
+; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm1
+; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vzeroupper
+; AVX1-SLOW-NEXT: retq
+;
+; AVX1-FAST-LABEL: test_v8f64:
+; AVX1-FAST: # %bb.0:
+; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0]
+; AVX1-FAST-NEXT: vaddsd %xmm3, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vaddsd %xmm2, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm2[1,0]
+; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vextractf128 $1, %ymm2, %xmm1
+; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vzeroupper
+; AVX1-FAST-NEXT: retq
+;
+; AVX2-LABEL: test_v8f64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0]
+; AVX2-NEXT: vaddsd %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX2-NEXT: vaddsd %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,2,2,3]
+; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vaddsd %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm2[1,0]
+; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm1
+; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[3,2,2,3]
+; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v8f64:
; AVX512: # %bb.0:
@@ -1192,7 +1254,7 @@ define double @test_v8f64(double %a0, <8 x double> %a1) {
; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0]
+; AVX512-NEXT: vpermpd {{.*#+}} zmm2 = zmm1[3,2,2,3,7,6,6,7]
; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm2
; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0
@@ -1265,38 +1327,104 @@ define double @test_v16f64(double %a0, <16 x double> %a1) {
; SSE41-NEXT: addsd {{[0-9]+}}(%rsp), %xmm0
; SSE41-NEXT: retq
;
-; AVX-LABEL: test_v16f64:
-; AVX: # %bb.0:
-; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vshufpd {{.*#+}} xmm5 = xmm1[1,0]
-; AVX-NEXT: vaddsd %xmm5, %xmm0, %xmm0
-; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vaddsd %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm2[1,0]
-; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1
-; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vaddsd %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm3[1,0]
-; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vextractf128 $1, %ymm3, %xmm1
-; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vaddsd %xmm4, %xmm0, %xmm0
-; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm4[1,0]
-; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vextractf128 $1, %ymm4, %xmm1
-; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; AVX1-SLOW-LABEL: test_v16f64:
+; AVX1-SLOW: # %bb.0:
+; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm5 = xmm1[1,0]
+; AVX1-SLOW-NEXT: vaddsd %xmm5, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vaddsd %xmm2, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm2[1,0]
+; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm1
+; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vaddsd %xmm3, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm3[1,0]
+; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm1
+; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vaddsd %xmm4, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm4[1,0]
+; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vextractf128 $1, %ymm4, %xmm1
+; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vzeroupper
+; AVX1-SLOW-NEXT: retq
+;
+; AVX1-FAST-LABEL: test_v16f64:
+; AVX1-FAST: # %bb.0:
+; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm5 = xmm1[1,0]
+; AVX1-FAST-NEXT: vaddsd %xmm5, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vaddsd %xmm2, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm2[1,0]
+; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vextractf128 $1, %ymm2, %xmm1
+; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vaddsd %xmm3, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm3[1,0]
+; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vextractf128 $1, %ymm3, %xmm1
+; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vaddsd %xmm4, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm4[1,0]
+; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vextractf128 $1, %ymm4, %xmm1
+; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vzeroupper
+; AVX1-FAST-NEXT: retq
+;
+; AVX2-LABEL: test_v16f64:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vshufpd {{.*#+}} xmm5 = xmm1[1,0]
+; AVX2-NEXT: vaddsd %xmm5, %xmm0, %xmm0
+; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX2-NEXT: vaddsd %xmm5, %xmm0, %xmm0
+; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,2,2,3]
+; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vaddsd %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm2[1,0]
+; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm1
+; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[3,2,2,3]
+; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vaddsd %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm3[1,0]
+; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm1
+; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm3[3,2,2,3]
+; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vaddsd %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm4[1,0]
+; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vextractf128 $1, %ymm4, %xmm1
+; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm4[3,2,2,3]
+; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v16f64:
; AVX512: # %bb.0:
@@ -1305,7 +1433,7 @@ define double @test_v16f64(double %a0, <16 x double> %a1) {
; AVX512-NEXT: vaddsd %xmm3, %xmm0, %xmm0
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX512-NEXT: vaddsd %xmm3, %xmm0, %xmm0
-; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm3[1,0]
+; AVX512-NEXT: vpermpd {{.*#+}} zmm3 = zmm1[3,2,2,3,7,6,6,7]
; AVX512-NEXT: vaddsd %xmm3, %xmm0, %xmm0
; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm3
; AVX512-NEXT: vaddsd %xmm3, %xmm0, %xmm0
@@ -1320,7 +1448,7 @@ define double @test_v16f64(double %a0, <16 x double> %a1) {
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vextractf128 $1, %ymm2, %xmm1
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX512-NEXT: vpermpd {{.*#+}} zmm1 = zmm2[3,2,2,3,7,6,6,7]
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vextractf32x4 $2, %zmm2, %xmm1
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm0
@@ -1410,9 +1538,9 @@ define double @test_v4f64_zero(<4 x double> %a0) {
; AVX2: # %bb.0:
; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vaddsd %xmm0, %xmm1, %xmm1
-; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vaddsd %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3]
; AVX2-NEXT: vaddsd %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -1421,9 +1549,9 @@ define double @test_v4f64_zero(<4 x double> %a0) {
; AVX512: # %bb.0:
; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm1
-; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm1
-; AVX512-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1
+; AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3]
; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -1487,16 +1615,16 @@ define double @test_v8f64_zero(<8 x double> %a0) {
; AVX2: # %bb.0:
; AVX2-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX2-NEXT: vaddsd %xmm2, %xmm0, %xmm2
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vaddsd %xmm0, %xmm2, %xmm2
-; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX2-NEXT: vaddsd %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3]
; AVX2-NEXT: vaddsd %xmm0, %xmm2, %xmm0
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX2-NEXT: vaddsd %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vaddsd %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,2,2,3]
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -1507,7 +1635,7 @@ define double @test_v8f64_zero(<8 x double> %a0) {
; AVX512-NEXT: vaddsd %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1
-; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0]
+; AVX512-NEXT: vpermpd {{.*#+}} zmm2 = zmm0[3,2,2,3,7,6,6,7]
; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm2
; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1
@@ -1619,30 +1747,30 @@ define double @test_v16f64_zero(<16 x double> %a0) {
; AVX2: # %bb.0:
; AVX2-NEXT: vshufpd {{.*#+}} xmm4 = xmm0[1,0]
; AVX2-NEXT: vaddsd %xmm4, %xmm0, %xmm4
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vaddsd %xmm0, %xmm4, %xmm4
-; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX2-NEXT: vaddsd %xmm5, %xmm4, %xmm4
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3]
; AVX2-NEXT: vaddsd %xmm0, %xmm4, %xmm0
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vshufpd {{.*#+}} xmm4 = xmm1[1,0]
; AVX2-NEXT: vaddsd %xmm4, %xmm0, %xmm0
-; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX2-NEXT: vaddsd %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,2,2,3]
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vaddsd %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm1
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[3,2,2,3]
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vaddsd %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm3[1,0]
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm1
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm3[3,2,2,3]
; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
@@ -1653,7 +1781,7 @@ define double @test_v16f64_zero(<16 x double> %a0) {
; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm2
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX512-NEXT: vaddsd %xmm3, %xmm2, %xmm2
-; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm3[1,0]
+; AVX512-NEXT: vpermpd {{.*#+}} zmm3 = zmm0[3,2,2,3,7,6,6,7]
; AVX512-NEXT: vaddsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm3
; AVX512-NEXT: vaddsd %xmm3, %xmm2, %xmm2
@@ -1668,7 +1796,7 @@ define double @test_v16f64_zero(<16 x double> %a0) {
; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0]
+; AVX512-NEXT: vpermpd {{.*#+}} zmm2 = zmm1[3,2,2,3,7,6,6,7]
; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm2
; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0
@@ -1720,24 +1848,46 @@ define double @test_v4f64_undef(<4 x double> %a0) {
; SSE-NEXT: addsd %xmm1, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: test_v4f64_undef:
-; AVX: # %bb.0:
-; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT: vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT: vaddsd %xmm0, %xmm1, %xmm1
-; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX-NEXT: vaddsd %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; AVX1-SLOW-LABEL: test_v4f64_undef:
+; AVX1-SLOW: # %bb.0:
+; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX1-SLOW-NEXT: vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-SLOW-NEXT: vaddsd %xmm0, %xmm1, %xmm1
+; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-SLOW-NEXT: vaddsd %xmm0, %xmm1, %xmm0
+; AVX1-SLOW-NEXT: vzeroupper
+; AVX1-SLOW-NEXT: retq
+;
+; AVX1-FAST-LABEL: test_v4f64_undef:
+; AVX1-FAST: # %bb.0:
+; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX1-FAST-NEXT: vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-FAST-NEXT: vaddsd %xmm0, %xmm1, %xmm1
+; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-FAST-NEXT: vaddsd %xmm0, %xmm1, %xmm0
+; AVX1-FAST-NEXT: vzeroupper
+; AVX1-FAST-NEXT: retq
+;
+; AVX2-LABEL: test_v4f64_undef:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX2-NEXT: vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX2-NEXT: vaddsd %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3]
+; AVX2-NEXT: vaddsd %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v4f64_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm1
-; AVX512-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1
+; AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3]
; AVX512-NEXT: vaddsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -1761,23 +1911,59 @@ define double @test_v8f64_undef(<8 x double> %a0) {
; SSE-NEXT: addsd %xmm3, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: test_v8f64_undef:
-; AVX: # %bb.0:
-; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
-; AVX-NEXT: vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT: vaddsd %xmm0, %xmm2, %xmm2
-; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX-NEXT: vaddsd %xmm0, %xmm2, %xmm0
-; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX-NEXT: vaddsd %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; AVX1-SLOW-LABEL: test_v8f64_undef:
+; AVX1-SLOW: # %bb.0:
+; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX1-SLOW-NEXT: vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-SLOW-NEXT: vaddsd %xmm0, %xmm2, %xmm2
+; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-SLOW-NEXT: vaddsd %xmm0, %xmm2, %xmm0
+; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX1-SLOW-NEXT: vaddsd %xmm2, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vzeroupper
+; AVX1-SLOW-NEXT: retq
+;
+; AVX1-FAST-LABEL: test_v8f64_undef:
+; AVX1-FAST: # %bb.0:
+; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX1-FAST-NEXT: vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-FAST-NEXT: vaddsd %xmm0, %xmm2, %xmm2
+; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-FAST-NEXT: vaddsd %xmm0, %xmm2, %xmm0
+; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX1-FAST-NEXT: vaddsd %xmm2, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vzeroupper
+; AVX1-FAST-NEXT: retq
+;
+; AVX2-LABEL: test_v8f64_undef:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX2-NEXT: vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX2-NEXT: vaddsd %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3]
+; AVX2-NEXT: vaddsd %xmm0, %xmm2, %xmm0
+; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX2-NEXT: vaddsd %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vaddsd %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,2,2,3]
+; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v8f64_undef:
; AVX512: # %bb.0:
@@ -1785,7 +1971,7 @@ define double @test_v8f64_undef(<8 x double> %a0) {
; AVX512-NEXT: vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1
-; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0]
+; AVX512-NEXT: vpermpd {{.*#+}} zmm2 = zmm0[3,2,2,3,7,6,6,7]
; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm2
; AVX512-NEXT: vaddsd %xmm2, %xmm1, %xmm1
@@ -1829,37 +2015,101 @@ define double @test_v16f64_undef(<16 x double> %a0) {
; SSE-NEXT: addsd %xmm7, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: test_v16f64_undef:
-; AVX: # %bb.0:
-; AVX-NEXT: vshufpd {{.*#+}} xmm4 = xmm0[1,0]
-; AVX-NEXT: vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT: vaddsd %xmm0, %xmm4, %xmm4
-; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX-NEXT: vaddsd %xmm0, %xmm4, %xmm0
-; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vshufpd {{.*#+}} xmm4 = xmm1[1,0]
-; AVX-NEXT: vaddsd %xmm4, %xmm0, %xmm0
-; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vaddsd %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm2[1,0]
-; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1
-; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vaddsd %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm3[1,0]
-; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vextractf128 $1, %ymm3, %xmm1
-; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; AVX1-SLOW-LABEL: test_v16f64_undef:
+; AVX1-SLOW: # %bb.0:
+; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm4 = xmm0[1,0]
+; AVX1-SLOW-NEXT: vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
+; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-SLOW-NEXT: vaddsd %xmm0, %xmm4, %xmm4
+; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-SLOW-NEXT: vaddsd %xmm0, %xmm4, %xmm0
+; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm4 = xmm1[1,0]
+; AVX1-SLOW-NEXT: vaddsd %xmm4, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vaddsd %xmm2, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm2[1,0]
+; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm1
+; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vaddsd %xmm3, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm3[1,0]
+; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm1
+; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX1-SLOW-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vzeroupper
+; AVX1-SLOW-NEXT: retq
+;
+; AVX1-FAST-LABEL: test_v16f64_undef:
+; AVX1-FAST: # %bb.0:
+; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm4 = xmm0[1,0]
+; AVX1-FAST-NEXT: vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
+; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-FAST-NEXT: vaddsd %xmm0, %xmm4, %xmm4
+; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-FAST-NEXT: vaddsd %xmm0, %xmm4, %xmm0
+; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm4 = xmm1[1,0]
+; AVX1-FAST-NEXT: vaddsd %xmm4, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vaddsd %xmm2, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm2[1,0]
+; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vextractf128 $1, %ymm2, %xmm1
+; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vaddsd %xmm3, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm3[1,0]
+; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vextractf128 $1, %ymm3, %xmm1
+; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX1-FAST-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vzeroupper
+; AVX1-FAST-NEXT: retq
+;
+; AVX2-LABEL: test_v16f64_undef:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vshufpd {{.*#+}} xmm4 = xmm0[1,0]
+; AVX2-NEXT: vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
+; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX2-NEXT: vaddsd %xmm5, %xmm4, %xmm4
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3]
+; AVX2-NEXT: vaddsd %xmm0, %xmm4, %xmm0
+; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vshufpd {{.*#+}} xmm4 = xmm1[1,0]
+; AVX2-NEXT: vaddsd %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX2-NEXT: vaddsd %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,2,2,3]
+; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vaddsd %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm2[1,0]
+; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm1
+; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[3,2,2,3]
+; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vaddsd %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vshufpd {{.*#+}} xmm1 = xmm3[1,0]
+; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm1
+; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm3[3,2,2,3]
+; AVX2-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
;
; AVX512-LABEL: test_v16f64_undef:
; AVX512: # %bb.0:
@@ -1867,7 +2117,7 @@ define double @test_v16f64_undef(<16 x double> %a0) {
; AVX512-NEXT: vaddsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX512-NEXT: vaddsd %xmm3, %xmm2, %xmm2
-; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm3[1,0]
+; AVX512-NEXT: vpermpd {{.*#+}} zmm3 = zmm0[3,2,2,3,7,6,6,7]
; AVX512-NEXT: vaddsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm3
; AVX512-NEXT: vaddsd %xmm3, %xmm2, %xmm2
@@ -1882,7 +2132,7 @@ define double @test_v16f64_undef(<16 x double> %a0) {
; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0]
+; AVX512-NEXT: vpermpd {{.*#+}} zmm2 = zmm1[3,2,2,3,7,6,6,7]
; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm2
; AVX512-NEXT: vaddsd %xmm2, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmax.ll b/llvm/test/CodeGen/X86/vector-reduce-fmax.ll
index 7048b98227620..caecdfb225739 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-fmax.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-fmax.ll
@@ -725,37 +725,20 @@ define double @test_v4f64(<4 x double> %a0) {
; SSE41-NEXT: movapd %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX-LABEL: test_v4f64:
-; AVX: # %bb.0:
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0]
-; AVX-NEXT: vmaxsd %xmm0, %xmm3, %xmm4
-; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vblendvpd %xmm0, %xmm3, %xmm4, %xmm0
-; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm3
-; AVX-NEXT: vmaxsd %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vblendvpd %xmm3, %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm1
-; AVX-NEXT: vmaxsd %xmm0, %xmm2, %xmm0
-; AVX-NEXT: vblendvpd %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
-;
; AVX512-LABEL: test_v4f64:
; AVX512: # %bb.0:
-; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX512-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[3,2,2,3]
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX512-NEXT: vmaxsd %xmm0, %xmm3, %xmm4
; AVX512-NEXT: vcmpunordsd %xmm0, %xmm0, %k1
; AVX512-NEXT: vmovsd %xmm3, %xmm4, %xmm4 {%k1}
; AVX512-NEXT: vcmpunordsd %xmm4, %xmm4, %k1
-; AVX512-NEXT: vmaxsd %xmm4, %xmm1, %xmm0
-; AVX512-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1}
-; AVX512-NEXT: vcmpunordsd %xmm0, %xmm0, %k1
-; AVX512-NEXT: vmaxsd %xmm0, %xmm2, %xmm0
+; AVX512-NEXT: vmaxsd %xmm4, %xmm2, %xmm0
; AVX512-NEXT: vmovsd %xmm2, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT: vcmpunordsd %xmm0, %xmm0, %k1
+; AVX512-NEXT: vmaxsd %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1}
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> %a0)
@@ -820,45 +803,25 @@ define double @test_v8f64(<8 x double> %a0) {
; SSE41-NEXT: movapd %xmm2, %xmm0
; SSE41-NEXT: retq
;
-; AVX-LABEL: test_v8f64:
-; AVX: # %bb.0:
-; AVX-NEXT: vmaxpd %ymm0, %ymm1, %ymm2
-; AVX-NEXT: vcmpunordpd %ymm0, %ymm0, %ymm0
-; AVX-NEXT: vblendvpd %ymm0, %ymm1, %ymm2, %ymm0
-; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT: vmaxsd %xmm0, %xmm1, %xmm2
-; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm3
-; AVX-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm1
-; AVX-NEXT: vcmpunordsd %xmm1, %xmm1, %xmm2
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm1
-; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1
-; AVX-NEXT: vcmpunordsd %xmm1, %xmm1, %xmm2
-; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm1
-; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
-;
; AVX512BW-LABEL: test_v8f64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vextractf32x4 $3, %zmm0, %xmm2
; AVX512BW-NEXT: vshufpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX512BW-NEXT: vextractf32x4 $2, %zmm0, %xmm3
; AVX512BW-NEXT: vshufpd {{.*#+}} xmm4 = xmm3[1,0]
-; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX512BW-NEXT: vshufpd {{.*#+}} xmm6 = xmm5[1,0]
+; AVX512BW-NEXT: vpermpd {{.*#+}} zmm5 = zmm0[3,2,2,3,7,6,6,7]
+; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm6
; AVX512BW-NEXT: vshufpd {{.*#+}} xmm7 = xmm0[1,0]
; AVX512BW-NEXT: vmaxsd %xmm0, %xmm7, %xmm8
; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1
; AVX512BW-NEXT: vmovsd %xmm7, %xmm8, %xmm8 {%k1}
; AVX512BW-NEXT: vcmpunordsd %xmm8, %xmm8, %k1
-; AVX512BW-NEXT: vmaxsd %xmm8, %xmm5, %xmm0
-; AVX512BW-NEXT: vmovsd %xmm5, %xmm0, %xmm0 {%k1}
-; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1
-; AVX512BW-NEXT: vmaxsd %xmm0, %xmm6, %xmm0
+; AVX512BW-NEXT: vmaxsd %xmm8, %xmm6, %xmm0
; AVX512BW-NEXT: vmovsd %xmm6, %xmm0, %xmm0 {%k1}
; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1
+; AVX512BW-NEXT: vmaxsd %xmm0, %xmm5, %xmm0
+; AVX512BW-NEXT: vmovsd %xmm5, %xmm0, %xmm0 {%k1}
+; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1
; AVX512BW-NEXT: vmaxsd %xmm0, %xmm3, %xmm0
; AVX512BW-NEXT: vmovsd %xmm3, %xmm0, %xmm0 {%k1}
; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1
@@ -879,19 +842,19 @@ define double @test_v8f64(<8 x double> %a0) {
; AVX512VL-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512VL-NEXT: vextractf32x4 $2, %zmm0, %xmm3
; AVX512VL-NEXT: vshufpd {{.*#+}} xmm4 = xmm3[1,0]
-; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX512VL-NEXT: vshufpd {{.*#+}} xmm6 = xmm5[1,0]
+; AVX512VL-NEXT: vpermpd {{.*#+}} zmm5 = zmm0[3,2,2,3,7,6,6,7]
+; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm6
; AVX512VL-NEXT: vshufpd {{.*#+}} xmm7 = xmm0[1,0]
; AVX512VL-NEXT: vmaxsd %xmm0, %xmm7, %xmm8
; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1
; AVX512VL-NEXT: vmovsd %xmm7, %xmm8, %xmm8 {%k1}
; AVX512VL-NEXT: vcmpunordsd %xmm8, %xmm8, %k1
-; AVX512VL-NEXT: vmaxsd %xmm8, %xmm5, %xmm0
-; AVX512VL-NEXT: vmovsd %xmm5, %xmm0, %xmm0 {%k1}
-; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1
-; AVX512VL-NEXT: vmaxsd %xmm0, %xmm6, %xmm0
+; AVX512VL-NEXT: vmaxsd %xmm8, %xmm6, %xmm0
; AVX512VL-NEXT: vmovsd %xmm6, %xmm0, %xmm0 {%k1}
; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1
+; AVX512VL-NEXT: vmaxsd %xmm0, %xmm5, %xmm0
+; AVX512VL-NEXT: vmovsd %xmm5, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1
; AVX512VL-NEXT: vmaxsd %xmm0, %xmm3, %xmm0
; AVX512VL-NEXT: vmovsd %xmm3, %xmm0, %xmm0 {%k1}
; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1
@@ -1011,32 +974,6 @@ define double @test_v16f64(<16 x double> %a0) {
; SSE41-NEXT: movapd %xmm2, %xmm0
; SSE41-NEXT: retq
;
-; AVX-LABEL: test_v16f64:
-; AVX: # %bb.0:
-; AVX-NEXT: vmaxpd %ymm0, %ymm2, %ymm4
-; AVX-NEXT: vcmpunordpd %ymm0, %ymm0, %ymm0
-; AVX-NEXT: vblendvpd %ymm0, %ymm2, %ymm4, %ymm0
-; AVX-NEXT: vmaxpd %ymm1, %ymm3, %ymm2
-; AVX-NEXT: vcmpunordpd %ymm1, %ymm1, %ymm1
-; AVX-NEXT: vblendvpd %ymm1, %ymm3, %ymm2, %ymm1
-; AVX-NEXT: vmaxpd %ymm0, %ymm1, %ymm2
-; AVX-NEXT: vcmpunordpd %ymm0, %ymm0, %ymm0
-; AVX-NEXT: vblendvpd %ymm0, %ymm1, %ymm2, %ymm0
-; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT: vmaxsd %xmm0, %xmm1, %xmm2
-; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm3
-; AVX-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm1
-; AVX-NEXT: vcmpunordsd %xmm1, %xmm1, %xmm2
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm1
-; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1
-; AVX-NEXT: vcmpunordsd %xmm1, %xmm1, %xmm2
-; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX-NEXT: vmaxsd %xmm1, %xmm0, %xmm1
-; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
-;
; AVX512-LABEL: test_v16f64:
; AVX512: # %bb.0:
; AVX512-NEXT: vmaxpd %zmm0, %zmm1, %zmm2
@@ -1051,7 +988,7 @@ define double @test_v16f64(<16 x double> %a0) {
; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1}
; AVX512-NEXT: vcmpunordsd %xmm1, %xmm1, %k1
-; AVX512-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: vpermpd {{.*#+}} zmm0 = zmm2[3,2,2,3,7,6,6,7]
; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1}
; AVX512-NEXT: vcmpunordsd %xmm1, %xmm1, %k1
diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmin.ll b/llvm/test/CodeGen/X86/vector-reduce-fmin.ll
index 727af12217c67..c0d44118587eb 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-fmin.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-fmin.ll
@@ -729,37 +729,20 @@ define double @test_v4f64(<4 x double> %a0) {
; SSE41-NEXT: movapd %xmm1, %xmm0
; SSE41-NEXT: retq
;
-; AVX-LABEL: test_v4f64:
-; AVX: # %bb.0:
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0]
-; AVX-NEXT: vminsd %xmm0, %xmm3, %xmm4
-; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vblendvpd %xmm0, %xmm3, %xmm4, %xmm0
-; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm3
-; AVX-NEXT: vminsd %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vblendvpd %xmm3, %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm1
-; AVX-NEXT: vminsd %xmm0, %xmm2, %xmm0
-; AVX-NEXT: vblendvpd %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
-;
; AVX512-LABEL: test_v4f64:
; AVX512: # %bb.0:
-; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
+; AVX512-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[3,2,2,3]
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX512-NEXT: vminsd %xmm0, %xmm3, %xmm4
; AVX512-NEXT: vcmpunordsd %xmm0, %xmm0, %k1
; AVX512-NEXT: vmovsd %xmm3, %xmm4, %xmm4 {%k1}
; AVX512-NEXT: vcmpunordsd %xmm4, %xmm4, %k1
-; AVX512-NEXT: vminsd %xmm4, %xmm1, %xmm0
-; AVX512-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1}
-; AVX512-NEXT: vcmpunordsd %xmm0, %xmm0, %k1
-; AVX512-NEXT: vminsd %xmm0, %xmm2, %xmm0
+; AVX512-NEXT: vminsd %xmm4, %xmm2, %xmm0
; AVX512-NEXT: vmovsd %xmm2, %xmm0, %xmm0 {%k1}
+; AVX512-NEXT: vcmpunordsd %xmm0, %xmm0, %k1
+; AVX512-NEXT: vminsd %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1}
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> %a0)
@@ -824,45 +807,25 @@ define double @test_v8f64(<8 x double> %a0) {
; SSE41-NEXT: movapd %xmm2, %xmm0
; SSE41-NEXT: retq
;
-; AVX-LABEL: test_v8f64:
-; AVX: # %bb.0:
-; AVX-NEXT: vminpd %ymm0, %ymm1, %ymm2
-; AVX-NEXT: vcmpunordpd %ymm0, %ymm0, %ymm0
-; AVX-NEXT: vblendvpd %ymm0, %ymm1, %ymm2, %ymm0
-; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT: vminsd %xmm0, %xmm1, %xmm2
-; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm3
-; AVX-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm1
-; AVX-NEXT: vcmpunordsd %xmm1, %xmm1, %xmm2
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm1
-; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1
-; AVX-NEXT: vcmpunordsd %xmm1, %xmm1, %xmm2
-; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm1
-; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
-;
; AVX512BW-LABEL: test_v8f64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vextractf32x4 $3, %zmm0, %xmm2
; AVX512BW-NEXT: vshufpd {{.*#+}} xmm1 = xmm2[1,0]
; AVX512BW-NEXT: vextractf32x4 $2, %zmm0, %xmm3
; AVX512BW-NEXT: vshufpd {{.*#+}} xmm4 = xmm3[1,0]
-; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX512BW-NEXT: vshufpd {{.*#+}} xmm6 = xmm5[1,0]
+; AVX512BW-NEXT: vpermpd {{.*#+}} zmm5 = zmm0[3,2,2,3,7,6,6,7]
+; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm6
; AVX512BW-NEXT: vshufpd {{.*#+}} xmm7 = xmm0[1,0]
; AVX512BW-NEXT: vminsd %xmm0, %xmm7, %xmm8
; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1
; AVX512BW-NEXT: vmovsd %xmm7, %xmm8, %xmm8 {%k1}
; AVX512BW-NEXT: vcmpunordsd %xmm8, %xmm8, %k1
-; AVX512BW-NEXT: vminsd %xmm8, %xmm5, %xmm0
-; AVX512BW-NEXT: vmovsd %xmm5, %xmm0, %xmm0 {%k1}
-; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1
-; AVX512BW-NEXT: vminsd %xmm0, %xmm6, %xmm0
+; AVX512BW-NEXT: vminsd %xmm8, %xmm6, %xmm0
; AVX512BW-NEXT: vmovsd %xmm6, %xmm0, %xmm0 {%k1}
; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1
+; AVX512BW-NEXT: vminsd %xmm0, %xmm5, %xmm0
+; AVX512BW-NEXT: vmovsd %xmm5, %xmm0, %xmm0 {%k1}
+; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1
; AVX512BW-NEXT: vminsd %xmm0, %xmm3, %xmm0
; AVX512BW-NEXT: vmovsd %xmm3, %xmm0, %xmm0 {%k1}
; AVX512BW-NEXT: vcmpunordsd %xmm0, %xmm0, %k1
@@ -883,19 +846,19 @@ define double @test_v8f64(<8 x double> %a0) {
; AVX512VL-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512VL-NEXT: vextractf32x4 $2, %zmm0, %xmm3
; AVX512VL-NEXT: vshufpd {{.*#+}} xmm4 = xmm3[1,0]
-; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX512VL-NEXT: vshufpd {{.*#+}} xmm6 = xmm5[1,0]
+; AVX512VL-NEXT: vpermpd {{.*#+}} zmm5 = zmm0[3,2,2,3,7,6,6,7]
+; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm6
; AVX512VL-NEXT: vshufpd {{.*#+}} xmm7 = xmm0[1,0]
; AVX512VL-NEXT: vminsd %xmm0, %xmm7, %xmm8
; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1
; AVX512VL-NEXT: vmovsd %xmm7, %xmm8, %xmm8 {%k1}
; AVX512VL-NEXT: vcmpunordsd %xmm8, %xmm8, %k1
-; AVX512VL-NEXT: vminsd %xmm8, %xmm5, %xmm0
-; AVX512VL-NEXT: vmovsd %xmm5, %xmm0, %xmm0 {%k1}
-; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1
-; AVX512VL-NEXT: vminsd %xmm0, %xmm6, %xmm0
+; AVX512VL-NEXT: vminsd %xmm8, %xmm6, %xmm0
; AVX512VL-NEXT: vmovsd %xmm6, %xmm0, %xmm0 {%k1}
; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1
+; AVX512VL-NEXT: vminsd %xmm0, %xmm5, %xmm0
+; AVX512VL-NEXT: vmovsd %xmm5, %xmm0, %xmm0 {%k1}
+; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1
; AVX512VL-NEXT: vminsd %xmm0, %xmm3, %xmm0
; AVX512VL-NEXT: vmovsd %xmm3, %xmm0, %xmm0 {%k1}
; AVX512VL-NEXT: vcmpunordsd %xmm0, %xmm0, %k1
@@ -1015,32 +978,6 @@ define double @test_v16f64(<16 x double> %a0) {
; SSE41-NEXT: movapd %xmm2, %xmm0
; SSE41-NEXT: retq
;
-; AVX-LABEL: test_v16f64:
-; AVX: # %bb.0:
-; AVX-NEXT: vminpd %ymm0, %ymm2, %ymm4
-; AVX-NEXT: vcmpunordpd %ymm0, %ymm0, %ymm0
-; AVX-NEXT: vblendvpd %ymm0, %ymm2, %ymm4, %ymm0
-; AVX-NEXT: vminpd %ymm1, %ymm3, %ymm2
-; AVX-NEXT: vcmpunordpd %ymm1, %ymm1, %ymm1
-; AVX-NEXT: vblendvpd %ymm1, %ymm3, %ymm2, %ymm1
-; AVX-NEXT: vminpd %ymm0, %ymm1, %ymm2
-; AVX-NEXT: vcmpunordpd %ymm0, %ymm0, %ymm0
-; AVX-NEXT: vblendvpd %ymm0, %ymm1, %ymm2, %ymm0
-; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT: vminsd %xmm0, %xmm1, %xmm2
-; AVX-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm3
-; AVX-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm1
-; AVX-NEXT: vcmpunordsd %xmm1, %xmm1, %xmm2
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm1
-; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm1
-; AVX-NEXT: vcmpunordsd %xmm1, %xmm1, %xmm2
-; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX-NEXT: vminsd %xmm1, %xmm0, %xmm1
-; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
-;
; AVX512-LABEL: test_v16f64:
; AVX512: # %bb.0:
; AVX512-NEXT: vminpd %zmm0, %zmm1, %zmm2
@@ -1055,7 +992,7 @@ define double @test_v16f64(<16 x double> %a0) {
; AVX512-NEXT: vminsd %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1}
; AVX512-NEXT: vcmpunordsd %xmm1, %xmm1, %k1
-; AVX512-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: vpermpd {{.*#+}} zmm0 = zmm2[3,2,2,3,7,6,6,7]
; AVX512-NEXT: vminsd %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1}
; AVX512-NEXT: vcmpunordsd %xmm1, %xmm1, %k1
diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmul.ll b/llvm/test/CodeGen/X86/vector-reduce-fmul.ll
index 17fb2ed5d13a8..a8644e368c382 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-fmul.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-fmul.ll
@@ -987,26 +987,14 @@ define double @test_v4f64(double %a0, <4 x double> %a1) {
; SSE-NEXT: mulsd %xmm2, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: test_v4f64:
-; AVX: # %bb.0:
-; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX-NEXT: vmulsd %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
-;
; AVX512-LABEL: test_v4f64:
; AVX512: # %bb.0:
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0
+; AVX512-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,2,2,3]
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -1031,25 +1019,6 @@ define double @test_v8f64(double %a0, <8 x double> %a1) {
; SSE-NEXT: mulsd %xmm4, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: test_v8f64:
-; AVX: # %bb.0:
-; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vshufpd {{.*#+}} xmm3 = xmm1[1,0]
-; AVX-NEXT: vmulsd %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmulsd %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm2[1,0]
-; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1
-; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
-;
; AVX512-LABEL: test_v8f64:
; AVX512: # %bb.0:
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
@@ -1057,7 +1026,7 @@ define double @test_v8f64(double %a0, <8 x double> %a1) {
; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0]
+; AVX512-NEXT: vpermpd {{.*#+}} zmm2 = zmm1[3,2,2,3,7,6,6,7]
; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm2
; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0
@@ -1130,39 +1099,6 @@ define double @test_v16f64(double %a0, <16 x double> %a1) {
; SSE41-NEXT: mulsd {{[0-9]+}}(%rsp), %xmm0
; SSE41-NEXT: retq
;
-; AVX-LABEL: test_v16f64:
-; AVX: # %bb.0:
-; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vshufpd {{.*#+}} xmm5 = xmm1[1,0]
-; AVX-NEXT: vmulsd %xmm5, %xmm0, %xmm0
-; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmulsd %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm2[1,0]
-; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1
-; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmulsd %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm3[1,0]
-; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vextractf128 $1, %ymm3, %xmm1
-; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmulsd %xmm4, %xmm0, %xmm0
-; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm4[1,0]
-; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vextractf128 $1, %ymm4, %xmm1
-; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
-;
; AVX512-LABEL: test_v16f64:
; AVX512: # %bb.0:
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
@@ -1170,7 +1106,7 @@ define double @test_v16f64(double %a0, <16 x double> %a1) {
; AVX512-NEXT: vmulsd %xmm3, %xmm0, %xmm0
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX512-NEXT: vmulsd %xmm3, %xmm0, %xmm0
-; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm3[1,0]
+; AVX512-NEXT: vpermpd {{.*#+}} zmm3 = zmm1[3,2,2,3,7,6,6,7]
; AVX512-NEXT: vmulsd %xmm3, %xmm0, %xmm0
; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm3
; AVX512-NEXT: vmulsd %xmm3, %xmm0, %xmm0
@@ -1185,7 +1121,7 @@ define double @test_v16f64(double %a0, <16 x double> %a1) {
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vextractf128 $1, %ymm2, %xmm1
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX512-NEXT: vpermpd {{.*#+}} zmm1 = zmm2[3,2,2,3,7,6,6,7]
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vextractf32x4 $2, %zmm2, %xmm1
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm0
@@ -1239,24 +1175,13 @@ define double @test_v4f64_one(<4 x double> %a0) {
; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: test_v4f64_one:
-; AVX: # %bb.0:
-; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm1
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT: vmulsd %xmm0, %xmm1, %xmm1
-; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX-NEXT: vmulsd %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
-;
; AVX512-LABEL: test_v4f64_one:
; AVX512: # %bb.0:
; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm1
-; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm1
-; AVX512-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX512-NEXT: vmulsd %xmm2, %xmm1, %xmm1
+; AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3]
; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -1281,31 +1206,13 @@ define double @test_v8f64_one(<8 x double> %a0) {
; SSE-NEXT: mulsd %xmm3, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: test_v8f64_one:
-; AVX: # %bb.0:
-; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
-; AVX-NEXT: vmulsd %xmm2, %xmm0, %xmm2
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT: vmulsd %xmm0, %xmm2, %xmm2
-; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX-NEXT: vmulsd %xmm0, %xmm2, %xmm0
-; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX-NEXT: vmulsd %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
-;
; AVX512-LABEL: test_v8f64_one:
; AVX512: # %bb.0:
; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulsd %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX512-NEXT: vmulsd %xmm2, %xmm1, %xmm1
-; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0]
+; AVX512-NEXT: vpermpd {{.*#+}} zmm2 = zmm0[3,2,2,3,7,6,6,7]
; AVX512-NEXT: vmulsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm2
; AVX512-NEXT: vmulsd %xmm2, %xmm1, %xmm1
@@ -1350,45 +1257,13 @@ define double @test_v16f64_one(<16 x double> %a0) {
; SSE-NEXT: mulsd %xmm7, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: test_v16f64_one:
-; AVX: # %bb.0:
-; AVX-NEXT: vshufpd {{.*#+}} xmm4 = xmm0[1,0]
-; AVX-NEXT: vmulsd %xmm4, %xmm0, %xmm4
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT: vmulsd %xmm0, %xmm4, %xmm4
-; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX-NEXT: vmulsd %xmm0, %xmm4, %xmm0
-; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vshufpd {{.*#+}} xmm4 = xmm1[1,0]
-; AVX-NEXT: vmulsd %xmm4, %xmm0, %xmm0
-; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmulsd %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm2[1,0]
-; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1
-; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmulsd %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm3[1,0]
-; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vextractf128 $1, %ymm3, %xmm1
-; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
-;
; AVX512-LABEL: test_v16f64_one:
; AVX512: # %bb.0:
; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm2
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX512-NEXT: vmulsd %xmm3, %xmm2, %xmm2
-; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm3[1,0]
+; AVX512-NEXT: vpermpd {{.*#+}} zmm3 = zmm0[3,2,2,3,7,6,6,7]
; AVX512-NEXT: vmulsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm3
; AVX512-NEXT: vmulsd %xmm3, %xmm2, %xmm2
@@ -1403,7 +1278,7 @@ define double @test_v16f64_one(<16 x double> %a0) {
; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0]
+; AVX512-NEXT: vpermpd {{.*#+}} zmm2 = zmm1[3,2,2,3,7,6,6,7]
; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm2
; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0
@@ -1455,24 +1330,13 @@ define double @test_v4f64_undef(<4 x double> %a0) {
; SSE-NEXT: mulsd %xmm1, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: test_v4f64_undef:
-; AVX: # %bb.0:
-; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT: vmulsd %xmm0, %xmm1, %xmm1
-; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX-NEXT: vmulsd %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
-;
; AVX512-LABEL: test_v4f64_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm1
-; AVX512-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX512-NEXT: vmulsd %xmm2, %xmm1, %xmm1
+; AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,2,2,3]
; AVX512-NEXT: vmulsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -1496,31 +1360,13 @@ define double @test_v8f64_undef(<8 x double> %a0) {
; SSE-NEXT: mulsd %xmm3, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: test_v8f64_undef:
-; AVX: # %bb.0:
-; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
-; AVX-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT: vmulsd %xmm0, %xmm2, %xmm2
-; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX-NEXT: vmulsd %xmm0, %xmm2, %xmm0
-; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0]
-; AVX-NEXT: vmulsd %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
-;
; AVX512-LABEL: test_v8f64_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX512-NEXT: vmulsd %xmm2, %xmm1, %xmm1
-; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0]
+; AVX512-NEXT: vpermpd {{.*#+}} zmm2 = zmm0[3,2,2,3,7,6,6,7]
; AVX512-NEXT: vmulsd %xmm2, %xmm1, %xmm1
; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm2
; AVX512-NEXT: vmulsd %xmm2, %xmm1, %xmm1
@@ -1564,45 +1410,13 @@ define double @test_v16f64_undef(<16 x double> %a0) {
; SSE-NEXT: mulsd %xmm7, %xmm0
; SSE-NEXT: retq
;
-; AVX-LABEL: test_v16f64_undef:
-; AVX: # %bb.0:
-; AVX-NEXT: vshufpd {{.*#+}} xmm4 = xmm0[1,0]
-; AVX-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX-NEXT: vmulsd %xmm0, %xmm4, %xmm4
-; AVX-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX-NEXT: vmulsd %xmm0, %xmm4, %xmm0
-; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vshufpd {{.*#+}} xmm4 = xmm1[1,0]
-; AVX-NEXT: vmulsd %xmm4, %xmm0, %xmm0
-; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmulsd %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm2[1,0]
-; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1
-; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmulsd %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm3[1,0]
-; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vextractf128 $1, %ymm3, %xmm1
-; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
-;
; AVX512-LABEL: test_v16f64_undef:
; AVX512: # %bb.0:
; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX512-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX512-NEXT: vmulsd %xmm3, %xmm2, %xmm2
-; AVX512-NEXT: vshufpd {{.*#+}} xmm3 = xmm3[1,0]
+; AVX512-NEXT: vpermpd {{.*#+}} zmm3 = zmm0[3,2,2,3,7,6,6,7]
; AVX512-NEXT: vmulsd %xmm3, %xmm2, %xmm2
; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm3
; AVX512-NEXT: vmulsd %xmm3, %xmm2, %xmm2
@@ -1617,7 +1431,7 @@ define double @test_v16f64_undef(<16 x double> %a0) {
; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vshufpd {{.*#+}} xmm2 = xmm2[1,0]
+; AVX512-NEXT: vpermpd {{.*#+}} zmm2 = zmm1[3,2,2,3,7,6,6,7]
; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm2
; AVX512-NEXT: vmulsd %xmm2, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-unpck.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-unpck.ll
index 16493c0448848..ca108ca732bbb 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-unpck.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-unpck.ll
@@ -50,12 +50,12 @@ define <2 x double> @unpckh_unary_extracted_v8f64(<4 x double> %x) {
; vpermps requires a constant load for the index op. It's unlikely to be profitable.
define <4 x i32> @unpckh_unary_extracted_v8i32(<8 x i32> %x) {
-; ALL-LABEL: unpckh_unary_extracted_v8i32:
-; ALL: # %bb.0:
-; ALL-NEXT: vextractf128 $1, %ymm0, %xmm1
-; ALL-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; ALL-NEXT: vzeroupper
-; ALL-NEXT: retq
+; AVX1-LABEL: unpckh_unary_extracted_v8i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
%extrl = shufflevector <8 x i32> %x, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extrh = shufflevector <8 x i32> %x, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%r = shufflevector <4 x i32> %extrl, <4 x i32> %extrh, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -63,12 +63,12 @@ define <4 x i32> @unpckh_unary_extracted_v8i32(<8 x i32> %x) {
}
define <4 x float> @unpckh_unary_extracted_v8f32(<8 x float> %x) {
-; ALL-LABEL: unpckh_unary_extracted_v8f32:
-; ALL: # %bb.0:
-; ALL-NEXT: vextractf128 $1, %ymm0, %xmm1
-; ALL-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; ALL-NEXT: vzeroupper
-; ALL-NEXT: retq
+; AVX1-LABEL: unpckh_unary_extracted_v8f32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
%extrl = shufflevector <8 x float> %x, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extrh = shufflevector <8 x float> %x, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%r = shufflevector <4 x float> %extrl, <4 x float> %extrh, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@@ -158,12 +158,12 @@ define <2 x double> @unpckl_unary_extracted_v8f64(<4 x double> %x) {
; vpermps requires a constant load for the index op. It's unlikely to be profitable.
define <4 x i32> @unpckl_unary_extracted_v8i32(<8 x i32> %x) {
-; ALL-LABEL: unpckl_unary_extracted_v8i32:
-; ALL: # %bb.0:
-; ALL-NEXT: vextractf128 $1, %ymm0, %xmm1
-; ALL-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; ALL-NEXT: vzeroupper
-; ALL-NEXT: retq
+; AVX1-LABEL: unpckl_unary_extracted_v8i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
%extrl = shufflevector <8 x i32> %x, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extrh = shufflevector <8 x i32> %x, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%r = shufflevector <4 x i32> %extrl, <4 x i32> %extrh, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -171,12 +171,12 @@ define <4 x i32> @unpckl_unary_extracted_v8i32(<8 x i32> %x) {
}
define <4 x float> @unpckl_unary_extracted_v8f32(<8 x float> %x) {
-; ALL-LABEL: unpckl_unary_extracted_v8f32:
-; ALL: # %bb.0:
-; ALL-NEXT: vextractf128 $1, %ymm0, %xmm1
-; ALL-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; ALL-NEXT: vzeroupper
-; ALL-NEXT: retq
+; AVX1-LABEL: unpckl_unary_extracted_v8f32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
%extrl = shufflevector <8 x float> %x, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%extrh = shufflevector <8 x float> %x, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%r = shufflevector <4 x float> %extrl, <4 x float> %extrh, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@@ -226,12 +226,14 @@ define <16 x i8> @unpckl_unary_extracted_v32i8(<32 x i8> %x) {
; This would infinite loop because we did not recognize the unpack shuffle mask in commuted form.
define <8 x i32> @extract_unpckl_v8i32(<8 x i32> %a) {
-; ALL-LABEL: extract_unpckl_v8i32:
-; ALL: # %bb.0:
-; ALL-NEXT: vextractf128 $1, %ymm0, %xmm1
-; ALL-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; ALL-NEXT: retq
+; AVX1-LABEL: extract_unpckl_v8i32:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX1-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 4, i32 undef, i32 5, i32 1, i32 undef, i32 undef, i32 undef, i32 undef>
ret <8 x i32> %shuffle
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; ALL: {{.*}}
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
index dbbfaab9ea26a..aefbad0fb296b 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -1603,9 +1603,9 @@ define <16 x i16> @shuffle_v16i16_16_16_16_16_04_05_06_07_24_24_24_24_12_13_14_1
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,0,1,0,1,0,1,24,25,26,27,28,29,30,31]
-; XOPAVX1-NEXT: vpperm %xmm4, %xmm2, %xmm3, %xmm2
-; XOPAVX1-NEXT: vpperm %xmm4, %xmm0, %xmm1, %xmm0
+; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7]
+; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; XOPAVX1-NEXT: vpperm {{.*#+}} xmm0 = xmm1[0,1,0,1,0,1,0,1],xmm0[8,9,10,11,12,13,14,15]
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
@@ -1655,10 +1655,11 @@ define <16 x i16> @shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_1
; XOPAVX1-LABEL: shuffle_v16i16_19_18_17_16_07_06_05_04_27_26_25_24_15_14_13_12:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; XOPAVX1-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [6,7,4,5,2,3,0,1,30,31,28,29,26,27,24,25]
-; XOPAVX1-NEXT: vpperm %xmm4, %xmm2, %xmm3, %xmm2
-; XOPAVX1-NEXT: vpperm %xmm4, %xmm0, %xmm1, %xmm0
+; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
+; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; XOPAVX1-NEXT: vpperm {{.*#+}} xmm0 = xmm1[6,7,4,5,2,3,0,1],xmm0[14,15,12,13,10,11,8,9]
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
@@ -5383,8 +5384,10 @@ define <16 x i16> @shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_3
; XOPAVX1-LABEL: shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_31:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; XOPAVX1-NEXT: vpperm {{.*#+}} xmm3 = xmm3[0,1],xmm2[0,1],xmm3[2,3],xmm2[2,3],xmm3[12,13],xmm2[12,13],xmm3[14,15],xmm2[14,15]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,3,2,3]
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3]
+; XOPAVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[0,1,2,3,12,13],xmm2[14,15],xmm1[u,u,u,u,u,u,u,u]
; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
; XOPAVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -5454,8 +5457,10 @@ define <16 x i16> @shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_2
; XOPAVX1-LABEL: shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_25:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; XOPAVX1-NEXT: vpperm {{.*#+}} xmm3 = xmm3[0,1],xmm2[8,9],xmm3[2,3],xmm2[10,11],xmm3[12,13],xmm2[0,1],xmm3[14,15],xmm2[2,3]
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,0,2,3]
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3]
+; XOPAVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[8,9,10,11,0,1],xmm2[2,3],xmm1[u,u,u,u,u,u,u,u]
; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
; XOPAVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -5671,8 +5676,10 @@ define <16 x i16> @shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_3
; XOPAVX1-LABEL: shuffle_v16i16_00_02_01_03_20_22_21_31_08_10_09_11_28_30_29_31:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; XOPAVX1-NEXT: vpperm {{.*#+}} xmm3 = xmm3[0,1,4,5,2,3,6,7],xmm2[8,9,12,13,10,11,14,15]
+; XOPAVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,6,5,7]
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,1,3,4,5,6,7]
+; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[8,9,12,13,10,11],xmm2[14,15],xmm1[u,u,u,u,u,u,u,u]
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
; XOPAVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -6122,10 +6129,11 @@ define <16 x i16> @shuffle_v16i16_uu_uu_04_uu_16_18_20_uu_uu_uu_12_uu_24_26_28_u
; XOPAVX1-LABEL: shuffle_v16i16_uu_uu_04_uu_16_18_20_uu_uu_uu_12_uu_24_26_28_uu:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,0,1,4,5,8,9,4,5]
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [8,9,10,11,8,9,10,11,16,17,20,21,24,25,20,21]
-; XOPAVX1-NEXT: vpperm %xmm4, %xmm2, %xmm3, %xmm2
-; XOPAVX1-NEXT: vpperm %xmm4, %xmm1, %xmm0, %xmm0
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3]
+; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
+; XOPAVX1-NEXT: vpperm {{.*#+}} xmm0 = xmm0[8,9,10,11,8,9,10,11],xmm1[0,1,4,5,8,9,4,5]
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
@@ -6729,10 +6737,11 @@ define <16 x i16> @shuffle_v16i16_23_uu_03_uu_20_20_05_uu_31_uu_11_uu_28_28_13_u
; XOPAVX1-LABEL: shuffle_v16i16_23_uu_03_uu_20_20_05_uu_31_uu_11_uu_28_28_13_uu:
; XOPAVX1: # %bb.0:
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; XOPAVX1-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [14,15,10,11,22,23,24,25,8,9,8,9,26,27,28,29]
-; XOPAVX1-NEXT: vpperm %xmm4, %xmm2, %xmm3, %xmm2
-; XOPAVX1-NEXT: vpperm %xmm4, %xmm0, %xmm1, %xmm0
+; XOPAVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,4,4]
+; XOPAVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; XOPAVX1-NEXT: vpperm {{.*#+}} xmm0 = xmm1[14,15,10,11],xmm0[6,7,8,9],xmm1[8,9,8,9],xmm0[10,11,12,13]
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
@@ -6945,18 +6954,11 @@ define <16 x i16> @shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u(<16 x i16> %a,
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
; AVX2-FAST-NEXT: retq
;
-; AVX512VL-SLOW-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u:
-; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; AVX512VL-SLOW-NEXT: vpbroadcastq %xmm0, %xmm0
-; AVX512VL-SLOW-NEXT: retq
-;
-; AVX512VL-FAST-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u:
-; AVX512VL-FAST: # %bb.0:
-; AVX512VL-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512VL-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
-; AVX512VL-FAST-NEXT: retq
+; AVX512VL-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vpbroadcastw {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9]
+; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: retq
;
; XOPAVX1-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u:
; XOPAVX1: # %bb.0:
@@ -7680,8 +7682,8 @@ define <16 x i16> @PR34369(<16 x i16> %vec, <16 x i16> %mask) {
; AVX2-SLOW-LABEL: PR34369:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[6,7,0,1,0,1,u,u,10,11,4,5,4,5,u,u,30,31,16,17,28,29,16,17,18,19,20,21,24,25,24,25]
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,1]
+; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,2,3]
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3],xmm2[4,5,6],xmm0[7]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
@@ -7690,17 +7692,30 @@ define <16 x i16> @PR34369(<16 x i16> %vec, <16 x i16> %mask) {
; AVX2-SLOW-NEXT: vpand %ymm0, %ymm1, %ymm0
; AVX2-SLOW-NEXT: retq
;
-; AVX2-FAST-LABEL: PR34369:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,10,11,u,u,u,u,u,u,4,5]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,0,1,0,1,u,u,10,11,4,5,4,5,u,u,30,31,16,17,28,29,16,17,18,19,20,21,24,25,24,25]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6],xmm2[7]
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-FAST-NEXT: vpcmpeqw %ymm2, %ymm1, %ymm1
-; AVX2-FAST-NEXT: vpand %ymm0, %ymm1, %ymm0
-; AVX2-FAST-NEXT: retq
+; AVX2-FAST-ALL-LABEL: PR34369:
+; AVX2-FAST-ALL: # %bb.0:
+; AVX2-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,6,0,5]
+; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm2, %ymm2
+; AVX2-FAST-ALL-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,6,6,6]
+; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,0,1,0,1,u,u,10,11,4,5,4,5,u,u,30,31,16,17,28,29,16,17,18,19,20,21,24,25,24,25]
+; AVX2-FAST-ALL-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6],xmm2[7]
+; AVX2-FAST-ALL-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-FAST-ALL-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-FAST-ALL-NEXT: vpcmpeqw %ymm2, %ymm1, %ymm1
+; AVX2-FAST-ALL-NEXT: vpand %ymm0, %ymm1, %ymm0
+; AVX2-FAST-ALL-NEXT: retq
+;
+; AVX2-FAST-PERLANE-LABEL: PR34369:
+; AVX2-FAST-PERLANE: # %bb.0:
+; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[6,7,0,1,0,1,u,u,10,11,4,5,4,5,u,u,30,31,16,17,28,29,16,17,18,19,20,21,24,25,24,25]
+; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,2,2,3]
+; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,2,3,u,u,u,u,u,u,12,13]
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3],xmm2[4,5,6],xmm0[7]
+; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX2-FAST-PERLANE-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-FAST-PERLANE-NEXT: vpcmpeqw %ymm2, %ymm1, %ymm1
+; AVX2-FAST-PERLANE-NEXT: vpand %ymm0, %ymm1, %ymm0
+; AVX2-FAST-PERLANE-NEXT: retq
;
; AVX512VL-LABEL: PR34369:
; AVX512VL: # %bb.0:
@@ -7725,9 +7740,9 @@ define <16 x i16> @PR34369(<16 x i16> %vec, <16 x i16> %mask) {
;
; XOPAVX2-LABEL: PR34369:
; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
+; XOPAVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[3,2,2,3]
; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,0,1,0,1,u,u,10,11,4,5,4,5,u,u,30,31,16,17,28,29,16,17,18,19,20,21,24,25,24,25]
-; XOPAVX2-NEXT: vpperm {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5],xmm2[10,11],xmm0[8,9,10,11,12,13],xmm2[4,5]
+; XOPAVX2-NEXT: vpperm {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5],xmm2[2,3],xmm0[8,9,10,11,12,13],xmm2[12,13]
; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; XOPAVX2-NEXT: vpcmpeqw %ymm2, %ymm1, %ymm1
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
index d8ee8103cee50..4afbda020b528 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -2541,11 +2541,11 @@ define <32 x i8> @shuffle_v32i8_32_32_32_32_32_32_32_32_08_09_10_11_12_13_14_15_
;
; XOPAVX1-LABEL: shuffle_v32i8_32_32_32_32_32_32_32_32_08_09_10_11_12_13_14_15_48_48_48_48_48_48_48_48_24_25_26_27_28_29_30_31:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,0,0,0,0,0,0,0,24,25,26,27,28,29,30,31]
-; XOPAVX1-NEXT: vpperm %xmm4, %xmm2, %xmm3, %xmm2
-; XOPAVX1-NEXT: vpperm %xmm4, %xmm0, %xmm1, %xmm0
+; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
+; XOPAVX1-NEXT: vpperm {{.*#+}} xmm2 = xmm2[0,0,0,0,0,0,0,0],xmm3[0,1,2,3,4,5,6,7]
+; XOPAVX1-NEXT: vpperm {{.*#+}} xmm0 = xmm1[0,0,0,0,0,0,0,0],xmm0[8,9,10,11,12,13,14,15]
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
@@ -2596,11 +2596,11 @@ define <32 x i8> @shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_
;
; XOPAVX1-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_55_54_53_52_51_50_49_48_31_30_29_28_27_26_25_24:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [7,6,5,4,3,2,1,0,31,30,29,28,27,26,25,24]
-; XOPAVX1-NEXT: vpperm %xmm4, %xmm2, %xmm3, %xmm2
-; XOPAVX1-NEXT: vpperm %xmm4, %xmm0, %xmm1, %xmm0
+; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[15,14,13,12,11,10,9,8,u,u,u,u,u,u,u,u]
+; XOPAVX1-NEXT: vpperm {{.*#+}} xmm2 = xmm2[7,6,5,4,3,2,1,0],xmm3[0,1,2,3,4,5,6,7]
+; XOPAVX1-NEXT: vpperm {{.*#+}} xmm0 = xmm1[7,6,5,4,3,2,1,0],xmm0[15,14,13,12,11,10,9,8]
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
@@ -4687,11 +4687,23 @@ define <32 x i8> @shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
-; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2OR512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; AVX2-NEXT: retq
+;
+; AVX512VLBW-LABEL: shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
+; AVX512VLBW: # %bb.0:
+; AVX512VLBW-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512VLBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6]
+; AVX512VLBW-NEXT: retq
+;
+; AVX512VLVBMI-LABEL: shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
+; AVX512VLVBMI: # %bb.0:
+; AVX512VLVBMI-NEXT: vpbroadcastb {{.*#+}} xmm1 = [22,22,22,22,22,22,22,22,22,22,22,22,22,22,22,22]
+; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0
+; AVX512VLVBMI-NEXT: retq
;
; XOPAVX1-LABEL: shuffle_v32i8_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_22_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu_uu:
; XOPAVX1: # %bb.0:
@@ -5145,12 +5157,14 @@ define <4 x i64> @PR28136(<32 x i8> %a0, <32 x i8> %a1) {
;
; XOPAVX1-LABEL: PR28136:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; XOPAVX1-NEXT: vpperm {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[10],xmm1[10],xmm0[12],xmm1[12],xmm0[14],xmm1[14],xmm0[9],xmm1[9],xmm0[11],xmm1[11],xmm0[13],xmm1[13],xmm0[15],xmm1[15]
-; XOPAVX1-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2],xmm1[2],xmm0[4],xmm1[4],xmm0[6],xmm1[6],xmm0[1],xmm1[1],xmm0[3],xmm1[3],xmm0[5],xmm1[5],xmm0[7],xmm1[7]
+; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[u,8,u,10,u,12,u,14,u,9,u,11,u,13,u,15]
+; XOPAVX1-NEXT: vpperm {{.*#+}} xmm2 = xmm0[8],xmm2[1],xmm0[10],xmm2[3],xmm0[12],xmm2[5],xmm0[14],xmm2[7],xmm0[9],xmm2[9],xmm0[11],xmm2[11],xmm0[13],xmm2[13],xmm0[15],xmm2[15]
+; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,0,u,2,u,4,u,6,u,1,u,3,u,5,u,7]
+; XOPAVX1-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7],xmm0[1],xmm1[9],xmm0[3],xmm1[11],xmm0[5],xmm1[13],xmm0[7],xmm1[15]
; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
index fb8618be17f06..ec49be2cc761b 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -2348,21 +2348,57 @@ define <8 x i32> @shuffle_v8i32_0zzzzzzz_pgso(<8 x i32> %a) !prof !14 {
}
define <4 x i64> @unpckh_v4i64(<4 x i64> %x, <4 x i64> %y) {
-; ALL-LABEL: unpckh_v4i64:
-; ALL: # %bb.0:
-; ALL-NEXT: vextractf128 $1, %ymm1, %xmm1
-; ALL-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; ALL-NEXT: retq
+; AVX1OR2-LABEL: unpckh_v4i64:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1OR2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX1OR2-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: unpckh_v4i64:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX512VL-SLOW-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-ALL-LABEL: unpckh_v4i64:
+; AVX512VL-FAST-ALL: # %bb.0:
+; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} xmm2 = [1,7]
+; AVX512VL-FAST-ALL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0
+; AVX512VL-FAST-ALL-NEXT: retq
+;
+; AVX512VL-FAST-PERLANE-LABEL: unpckh_v4i64:
+; AVX512VL-FAST-PERLANE: # %bb.0:
+; AVX512VL-FAST-PERLANE-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX512VL-FAST-PERLANE-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX512VL-FAST-PERLANE-NEXT: retq
%unpckh = shufflevector <4 x i64> %x, <4 x i64> %y, <4 x i32> <i32 1, i32 7, i32 poison, i32 poison>
ret <4 x i64> %unpckh
}
define <4 x double> @unpckh_v4f64(<4 x double> %x, <4 x double> %y) {
-; ALL-LABEL: unpckh_v4f64:
-; ALL: # %bb.0:
-; ALL-NEXT: vextractf128 $1, %ymm1, %xmm1
-; ALL-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; ALL-NEXT: retq
+; AVX1OR2-LABEL: unpckh_v4f64:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1OR2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX1OR2-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: unpckh_v4f64:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX512VL-SLOW-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-ALL-LABEL: unpckh_v4f64:
+; AVX512VL-FAST-ALL: # %bb.0:
+; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} xmm2 = [1,7]
+; AVX512VL-FAST-ALL-NEXT: vpermt2pd %ymm1, %ymm2, %ymm0
+; AVX512VL-FAST-ALL-NEXT: retq
+;
+; AVX512VL-FAST-PERLANE-LABEL: unpckh_v4f64:
+; AVX512VL-FAST-PERLANE: # %bb.0:
+; AVX512VL-FAST-PERLANE-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX512VL-FAST-PERLANE-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX512VL-FAST-PERLANE-NEXT: retq
%unpckh = shufflevector <4 x double> %x, <4 x double> %y, <4 x i32> <i32 1, i32 7, i32 poison, i32 poison>
ret <4 x double> %unpckh
}
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
index bd78dbded0705..ecb618f126a2a 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -1592,11 +1592,47 @@ define <8 x float> @shuffle_v8f32_1111uuuu(<8 x float> %a, <8 x float> %b) {
}
define <8 x float> @shuffle_v8f32_5555uuuu(<8 x float> %a, <8 x float> %b) {
-; ALL-LABEL: shuffle_v8f32_5555uuuu:
-; ALL: # %bb.0:
-; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
-; ALL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; ALL-NEXT: retq
+; AVX1-LABEL: shuffle_v8f32_5555uuuu:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX1-NEXT: retq
+;
+; AVX2-SLOW-LABEL: shuffle_v8f32_5555uuuu:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-ALL-LABEL: shuffle_v8f32_5555uuuu:
+; AVX2-FAST-ALL: # %bb.0:
+; AVX2-FAST-ALL-NEXT: vbroadcastss {{.*#+}} xmm1 = [5,5,5,5]
+; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-FAST-ALL-NEXT: retq
+;
+; AVX2-FAST-PERLANE-LABEL: shuffle_v8f32_5555uuuu:
+; AVX2-FAST-PERLANE: # %bb.0:
+; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX2-FAST-PERLANE-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v8f32_5555uuuu:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-ALL-LABEL: shuffle_v8f32_5555uuuu:
+; AVX512VL-FAST-ALL: # %bb.0:
+; AVX512VL-FAST-ALL-NEXT: vbroadcastss {{.*#+}} xmm1 = [5,5,5,5]
+; AVX512VL-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX512VL-FAST-ALL-NEXT: retq
+;
+; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8f32_5555uuuu:
+; AVX512VL-FAST-PERLANE: # %bb.0:
+; AVX512VL-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX512VL-FAST-PERLANE-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 poison, i32 poison, i32 poison, i32 poison>
ret <8 x float> %shuffle
}
@@ -3389,11 +3425,47 @@ define <8 x i32> @shuffle_v8i32_44444444_bc(<8 x float> %a, <8 x float> %b) {
}
define <8 x i32> @shuffle_v8i32_5555uuuu(<8 x i32> %a, <8 x i32> %b) {
-; ALL-LABEL: shuffle_v8i32_5555uuuu:
-; ALL: # %bb.0:
-; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0
-; ALL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; ALL-NEXT: retq
+; AVX1-LABEL: shuffle_v8i32_5555uuuu:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX1-NEXT: retq
+;
+; AVX2-SLOW-LABEL: shuffle_v8i32_5555uuuu:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-ALL-LABEL: shuffle_v8i32_5555uuuu:
+; AVX2-FAST-ALL: # %bb.0:
+; AVX2-FAST-ALL-NEXT: vbroadcastss {{.*#+}} xmm1 = [5,5,5,5]
+; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-FAST-ALL-NEXT: retq
+;
+; AVX2-FAST-PERLANE-LABEL: shuffle_v8i32_5555uuuu:
+; AVX2-FAST-PERLANE: # %bb.0:
+; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX2-FAST-PERLANE-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: shuffle_v8i32_5555uuuu:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-ALL-LABEL: shuffle_v8i32_5555uuuu:
+; AVX512VL-FAST-ALL: # %bb.0:
+; AVX512VL-FAST-ALL-NEXT: vbroadcastss {{.*#+}} xmm1 = [5,5,5,5]
+; AVX512VL-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX512VL-FAST-ALL-NEXT: retq
+;
+; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8i32_5555uuuu:
+; AVX512VL-FAST-PERLANE: # %bb.0:
+; AVX512VL-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; AVX512VL-FAST-PERLANE-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 poison, i32 poison, i32 poison, i32 poison>
ret <8 x i32> %shuffle
}
@@ -3859,32 +3931,41 @@ define <8 x float> @broadcast_concat_crash(<4 x float> %x, <4 x float> %y, float
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-LABEL: broadcast_concat_crash:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
-; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
-; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT: retq
+; AVX2-SLOW-LABEL: broadcast_concat_crash:
+; AVX2-SLOW: # %bb.0: # %entry
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-SLOW-NEXT: retq
;
-; AVX512VL-SLOW-LABEL: broadcast_concat_crash:
-; AVX512VL-SLOW: # %bb.0: # %entry
-; AVX512VL-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
-; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX512VL-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
-; AVX512VL-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX512VL-SLOW-NEXT: retq
+; AVX2-FAST-ALL-LABEL: broadcast_concat_crash:
+; AVX2-FAST-ALL: # %bb.0: # %entry
+; AVX2-FAST-ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-ALL-NEXT: vbroadcastss {{.*#+}} xmm1 = [7,7,7,7]
+; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-FAST-ALL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
+; AVX2-FAST-ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FAST-ALL-NEXT: retq
;
-; AVX512VL-FAST-LABEL: broadcast_concat_crash:
-; AVX512VL-FAST: # %bb.0: # %entry
-; AVX512VL-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
-; AVX512VL-FAST-NEXT: vpmovsxbd {{.*#+}} xmm1 = [1,4,3,3]
-; AVX512VL-FAST-NEXT: vpermi2ps %xmm2, %xmm0, %xmm1
-; AVX512VL-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-FAST-NEXT: retq
+; AVX2-FAST-PERLANE-LABEL: broadcast_concat_crash:
+; AVX2-FAST-PERLANE: # %bb.0: # %entry
+; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
+; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-FAST-PERLANE-NEXT: retq
+;
+; AVX512VL-LABEL: broadcast_concat_crash:
+; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
+; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vpmovsxbd {{.*#+}} xmm1 = [7,8,7,7]
+; AVX512VL-NEXT: vpermi2ps %ymm2, %ymm0, %ymm1
+; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: retq
entry:
%tmp = shufflevector <4 x float> %x, <4 x float> %y, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%bc = bitcast <8 x float> %tmp to <4 x i64>
@@ -3899,11 +3980,29 @@ entry:
; PR40434: https://bugs.llvm.org/show_bug.cgi?id=40434
define <8 x i32> @unpckh_v8i32(<8 x i32> %x, <8 x i32> %y) {
-; ALL-LABEL: unpckh_v8i32:
-; ALL: # %bb.0:
-; ALL-NEXT: vextractf128 $1, %ymm1, %xmm1
-; ALL-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; ALL-NEXT: retq
+; AVX1OR2-LABEL: unpckh_v8i32:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1OR2-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1OR2-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: unpckh_v8i32:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX512VL-SLOW-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-ALL-LABEL: unpckh_v8i32:
+; AVX512VL-FAST-ALL: # %bb.0:
+; AVX512VL-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} xmm2 = [2,14,3,15]
+; AVX512VL-FAST-ALL-NEXT: vpermt2d %ymm1, %ymm2, %ymm0
+; AVX512VL-FAST-ALL-NEXT: retq
+;
+; AVX512VL-FAST-PERLANE-LABEL: unpckh_v8i32:
+; AVX512VL-FAST-PERLANE: # %bb.0:
+; AVX512VL-FAST-PERLANE-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX512VL-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512VL-FAST-PERLANE-NEXT: retq
%unpckh = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 2, i32 14, i32 3, i32 15, i32 poison, i32 poison, i32 poison, i32 poison>
ret <8 x i32> %unpckh
}
@@ -3911,11 +4010,29 @@ define <8 x i32> @unpckh_v8i32(<8 x i32> %x, <8 x i32> %y) {
; Same as above but with floats.
define <8 x float> @unpckh_v8f32(<8 x float> %x, <8 x float> %y) {
-; ALL-LABEL: unpckh_v8f32:
-; ALL: # %bb.0:
-; ALL-NEXT: vextractf128 $1, %ymm1, %xmm1
-; ALL-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; ALL-NEXT: retq
+; AVX1OR2-LABEL: unpckh_v8f32:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1OR2-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1OR2-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: unpckh_v8f32:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX512VL-SLOW-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-ALL-LABEL: unpckh_v8f32:
+; AVX512VL-FAST-ALL: # %bb.0:
+; AVX512VL-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} xmm2 = [2,14,3,15]
+; AVX512VL-FAST-ALL-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0
+; AVX512VL-FAST-ALL-NEXT: retq
+;
+; AVX512VL-FAST-PERLANE-LABEL: unpckh_v8f32:
+; AVX512VL-FAST-PERLANE: # %bb.0:
+; AVX512VL-FAST-PERLANE-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX512VL-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512VL-FAST-PERLANE-NEXT: retq
%unpckh = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 2, i32 14, i32 3, i32 15, i32 poison, i32 poison, i32 poison, i32 poison>
ret <8 x float> %unpckh
}
@@ -3923,11 +4040,29 @@ define <8 x float> @unpckh_v8f32(<8 x float> %x, <8 x float> %y) {
; Alternate form of the above - make sure we don't have conflicting transforms.
define <8 x i32> @blend_perm_v8i32(<8 x i32> %x, <8 x i32> %y) {
-; ALL-LABEL: blend_perm_v8i32:
-; ALL: # %bb.0:
-; ALL-NEXT: vextractf128 $1, %ymm1, %xmm1
-; ALL-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; ALL-NEXT: retq
+; AVX1OR2-LABEL: blend_perm_v8i32:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1OR2-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1OR2-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: blend_perm_v8i32:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX512VL-SLOW-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-ALL-LABEL: blend_perm_v8i32:
+; AVX512VL-FAST-ALL: # %bb.0:
+; AVX512VL-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} xmm2 = [2,14,3,15]
+; AVX512VL-FAST-ALL-NEXT: vpermt2d %ymm1, %ymm2, %ymm0
+; AVX512VL-FAST-ALL-NEXT: retq
+;
+; AVX512VL-FAST-PERLANE-LABEL: blend_perm_v8i32:
+; AVX512VL-FAST-PERLANE: # %bb.0:
+; AVX512VL-FAST-PERLANE-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX512VL-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512VL-FAST-PERLANE-NEXT: retq
%unpckh = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
%r = shufflevector <8 x i32> %unpckh, <8 x i32> poison, <8 x i32> <i32 2, i32 6, i32 3, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
ret <8 x i32> %r
@@ -3936,11 +4071,29 @@ define <8 x i32> @blend_perm_v8i32(<8 x i32> %x, <8 x i32> %y) {
; Same as above but with floats.
define <8 x float> @blend_perm_v8f32(<8 x float> %x, <8 x float> %y) {
-; ALL-LABEL: blend_perm_v8f32:
-; ALL: # %bb.0:
-; ALL-NEXT: vextractf128 $1, %ymm1, %xmm1
-; ALL-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; ALL-NEXT: retq
+; AVX1OR2-LABEL: blend_perm_v8f32:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX1OR2-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1OR2-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: blend_perm_v8f32:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX512VL-SLOW-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-ALL-LABEL: blend_perm_v8f32:
+; AVX512VL-FAST-ALL: # %bb.0:
+; AVX512VL-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} xmm2 = [2,14,3,15]
+; AVX512VL-FAST-ALL-NEXT: vpermt2ps %ymm1, %ymm2, %ymm0
+; AVX512VL-FAST-ALL-NEXT: retq
+;
+; AVX512VL-FAST-PERLANE-LABEL: blend_perm_v8f32:
+; AVX512VL-FAST-PERLANE: # %bb.0:
+; AVX512VL-FAST-PERLANE-NEXT: vextractf128 $1, %ymm1, %xmm1
+; AVX512VL-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512VL-FAST-PERLANE-NEXT: retq
%unpckh = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 14, i32 15>
%r = shufflevector <8 x float> %unpckh, <8 x float> poison, <8 x i32> <i32 2, i32 6, i32 3, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
ret <8 x float> %r
@@ -3949,11 +4102,47 @@ define <8 x float> @blend_perm_v8f32(<8 x float> %x, <8 x float> %y) {
; Another variation of the above - make sure we don't have conflicting transforms.
define <8 x i32> @unpckh_v8i32_unary(<8 x i32> %x) {
-; ALL-LABEL: unpckh_v8i32_unary:
-; ALL: # %bb.0:
-; ALL-NEXT: vextractf128 $1, %ymm0, %xmm1
-; ALL-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; ALL-NEXT: retq
+; AVX1-LABEL: unpckh_v8i32_unary:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: retq
+;
+; AVX2-SLOW-LABEL: unpckh_v8i32_unary:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-ALL-LABEL: unpckh_v8i32_unary:
+; AVX2-FAST-ALL: # %bb.0:
+; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} xmm1 = [2,6,3,7]
+; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-FAST-ALL-NEXT: retq
+;
+; AVX2-FAST-PERLANE-LABEL: unpckh_v8i32_unary:
+; AVX2-FAST-PERLANE: # %bb.0:
+; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-FAST-PERLANE-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: unpckh_v8i32_unary:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512VL-SLOW-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-ALL-LABEL: unpckh_v8i32_unary:
+; AVX512VL-FAST-ALL: # %bb.0:
+; AVX512VL-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} xmm1 = [2,6,3,7]
+; AVX512VL-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX512VL-FAST-ALL-NEXT: retq
+;
+; AVX512VL-FAST-PERLANE-LABEL: unpckh_v8i32_unary:
+; AVX512VL-FAST-PERLANE: # %bb.0:
+; AVX512VL-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512VL-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512VL-FAST-PERLANE-NEXT: retq
%r = shufflevector <8 x i32> %x, <8 x i32> poison, <8 x i32> <i32 2, i32 6, i32 3, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
ret <8 x i32> %r
}
@@ -3961,11 +4150,47 @@ define <8 x i32> @unpckh_v8i32_unary(<8 x i32> %x) {
; Same as above but with floats.
define <8 x float> @unpckh_v8f32_unary(<8 x float> %x) {
-; ALL-LABEL: unpckh_v8f32_unary:
-; ALL: # %bb.0:
-; ALL-NEXT: vextractf128 $1, %ymm0, %xmm1
-; ALL-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; ALL-NEXT: retq
+; AVX1-LABEL: unpckh_v8f32_unary:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT: retq
+;
+; AVX2-SLOW-LABEL: unpckh_v8f32_unary:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-ALL-LABEL: unpckh_v8f32_unary:
+; AVX2-FAST-ALL: # %bb.0:
+; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} xmm1 = [2,6,3,7]
+; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX2-FAST-ALL-NEXT: retq
+;
+; AVX2-FAST-PERLANE-LABEL: unpckh_v8f32_unary:
+; AVX2-FAST-PERLANE: # %bb.0:
+; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-FAST-PERLANE-NEXT: retq
+;
+; AVX512VL-SLOW-LABEL: unpckh_v8f32_unary:
+; AVX512VL-SLOW: # %bb.0:
+; AVX512VL-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512VL-SLOW-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512VL-SLOW-NEXT: retq
+;
+; AVX512VL-FAST-ALL-LABEL: unpckh_v8f32_unary:
+; AVX512VL-FAST-ALL: # %bb.0:
+; AVX512VL-FAST-ALL-NEXT: vpmovsxbd {{.*#+}} xmm1 = [2,6,3,7]
+; AVX512VL-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
+; AVX512VL-FAST-ALL-NEXT: retq
+;
+; AVX512VL-FAST-PERLANE-LABEL: unpckh_v8f32_unary:
+; AVX512VL-FAST-PERLANE: # %bb.0:
+; AVX512VL-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX512VL-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX512VL-FAST-PERLANE-NEXT: retq
%r = shufflevector <8 x float> %x, <8 x float> poison, <8 x i32> <i32 2, i32 6, i32 3, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
ret <8 x float> %r
}
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
index b1efb416014b0..114a6ce5aeeae 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
@@ -412,40 +412,25 @@ define <16 x i32> @shuffle_v16i32_0_1_2_19_u_u_u_u_u_u_u_u_u_u_u_u(<16 x i32> %a
;FIXME: can do better with vpcompress
define <8 x i32> @test_v16i32_1_3_5_7_9_11_13_15(<16 x i32> %v) {
-; SLOW-LABEL: test_v16i32_1_3_5_7_9_11_13_15:
-; SLOW: # %bb.0:
-; SLOW-NEXT: vextractf64x4 $1, %zmm0, %ymm1
-; SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
-; SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; SLOW-NEXT: retq
-;
-; FAST-LABEL: test_v16i32_1_3_5_7_9_11_13_15:
-; FAST: # %bb.0:
-; FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15]
-; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0
-; FAST-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; FAST-NEXT: retq
+; ALL-LABEL: test_v16i32_1_3_5_7_9_11_13_15:
+; ALL: # %bb.0:
+; ALL-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,3,5,7,9,11,13,15]
+; ALL-NEXT: vpermps %zmm0, %zmm1, %zmm0
+; ALL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; ALL-NEXT: retq
%res = shufflevector <16 x i32> %v, <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
ret <8 x i32> %res
}
;FIXME: can do better with vpcompress
define <4 x i32> @test_v16i32_0_1_2_12 (<16 x i32> %v) {
-; SLOW-LABEL: test_v16i32_0_1_2_12:
-; SLOW: # %bb.0:
-; SLOW-NEXT: vextractf32x4 $3, %zmm0, %xmm1
-; SLOW-NEXT: vbroadcastss %xmm1, %xmm1
-; SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
-; SLOW-NEXT: vzeroupper
-; SLOW-NEXT: retq
-;
-; FAST-LABEL: test_v16i32_0_1_2_12:
-; FAST: # %bb.0:
-; FAST-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,1,2,12]
-; FAST-NEXT: vpermps %zmm0, %zmm1, %zmm0
-; FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; FAST-NEXT: vzeroupper
-; FAST-NEXT: retq
+; ALL-LABEL: test_v16i32_0_1_2_12:
+; ALL: # %bb.0:
+; ALL-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,1,2,12]
+; ALL-NEXT: vpermps %zmm0, %zmm1, %zmm0
+; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; ALL-NEXT: vzeroupper
+; ALL-NEXT: retq
%res = shufflevector <16 x i32> %v, <16 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 12>
ret <4 x i32> %res
}
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
index 87c135ddcec95..bd68817aa4a46 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll
@@ -466,8 +466,7 @@ define <64 x i8> @shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128,15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128]
; AVX512F-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
-; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[6,7,4,5,4,5,6,7]
; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
@@ -484,8 +483,7 @@ define <64 x i8> @shuffle_v64i8_63_zz_61_zz_59_zz_57_zz_55_zz_53_zz_51_zz_49_zz_
; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128,15,128,13,128,11,128,9,128,7,128,5,128,3,128,1,128]
; AVX512DQ-NEXT: # ymm2 = mem[0,1,0,1]
; AVX512DQ-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[6,7,4,5,4,5,6,7]
; AVX512DQ-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
@@ -509,8 +507,7 @@ define <64 x i8> @shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_
; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14,15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14]
; AVX512F-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512F-NEXT: vpshufb %ymm3, %ymm2, %ymm2
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
-; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[6,7,4,5,4,5,6,7]
; AVX512F-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
@@ -533,8 +530,7 @@ define <64 x i8> @shuffle_v64i8_63_64_61_66_59_68_57_70_55_72_53_74_51_76_49_78_
; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14,15,0,13,2,11,4,9,6,7,8,5,10,3,12,1,14]
; AVX512DQ-NEXT: # ymm3 = mem[0,1,0,1]
; AVX512DQ-NEXT: vpshufb %ymm3, %ymm2, %ymm2
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[6,7,4,5,4,5,6,7]
; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
@@ -1048,10 +1044,10 @@ define <64 x i8> @shuffle_v64i8_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_
; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,0,2,6,8,12,14,18,20,24,26,30,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512F-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
-; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm1[5,6,7]
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512F-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm2
+; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm3
+; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4],xmm1[5,6,7]
+; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
@@ -1065,10 +1061,10 @@ define <64 x i8> @shuffle_v64i8_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_
; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,0,2,6,8,12,14,18,20,24,26,30,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512BW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
-; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; AVX512BW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm1[5,6,7]
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm2
+; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm3
+; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4],xmm1[5,6,7]
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
@@ -1082,10 +1078,10 @@ define <64 x i8> @shuffle_v64i8_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_
; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,0,2,6,8,12,14,18,20,24,26,30,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
-; AVX512DQ-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm1[5,6,7]
-; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
+; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm2
+; AVX512DQ-NEXT: vextracti32x4 $2, %zmm0, %xmm3
+; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4],xmm1[5,6,7]
+; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
@@ -1101,11 +1097,10 @@ define <64 x i8> @shuffle_v64i8_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_
define <64 x i8> @shuffle_v64i8_61_62_63_64_65_66_67_68_69_70_71_72_73_74_75_76_77_78_79_80_81_82_83_84_85_86_87_88_89_90_91_92_93_94_95_96_97_98_99_100_101_102_103_104_105_106_107_108_109_110_111_112_113_114_115_116_117_118_119_120_121_122_123_124(<64 x i8> %a0, <64 x i8> %a1) {
; AVX512F-LABEL: shuffle_v64i8_61_62_63_64_65_66_67_68_69_70_71_72_73_74_75_76_77_78_79_80_81_82_83_84_85_86_87_88_89_90_91_92_93_94_95_96_97_98_99_100_101_102_103_104_105_106_107_108_109_110_111_112_113_114_115_116_117_118_119_120_121_122_123_124:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm0[2,3],ymm2[0,1]
-; AVX512F-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11,12],ymm3[29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27,28]
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm0[0,1]
+; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm0[2,3,4,5,4,5,6,7]
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11,12],ymm2[29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27,28]
+; AVX512F-NEXT: valignq {{.*#+}} zmm1 = zmm1[6,7],zmm0[0,1,2,3,4,5]
; AVX512F-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],ymm1[29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28]
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
@@ -1118,11 +1113,10 @@ define <64 x i8> @shuffle_v64i8_61_62_63_64_65_66_67_68_69_70_71_72_73_74_75_76_
;
; AVX512DQ-LABEL: shuffle_v64i8_61_62_63_64_65_66_67_68_69_70_71_72_73_74_75_76_77_78_79_80_81_82_83_84_85_86_87_88_89_90_91_92_93_94_95_96_97_98_99_100_101_102_103_104_105_106_107_108_109_110_111_112_113_114_115_116_117_118_119_120_121_122_123_124:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm0[2,3],ymm2[0,1]
-; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11,12],ymm3[29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27,28]
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1
-; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm0[0,1]
+; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm0[2,3,4,5,4,5,6,7]
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11,12],ymm2[29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27,28]
+; AVX512DQ-NEXT: valignq {{.*#+}} zmm1 = zmm1[6,7],zmm0[0,1,2,3,4,5]
; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],ymm1[29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28]
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
@@ -1140,11 +1134,10 @@ define <64 x i8> @shuffle_v64i8_61_62_63_64_65_66_67_68_69_70_71_72_73_74_75_76_
define <64 x i8> @shuffle_v64i8_62_63_64_65_66_67_68_69_70_71_72_73_74_75_76_77_78_79_80_81_82_83_84_85_86_87_88_89_90_91_92_93_94_95_96_97_98_99_100_101_102_103_104_105_106_107_108_109_110_111_112_113_114_115_116_117_118_119_120_121_122_123_124_125(<64 x i8> %a0, <64 x i8> %a1) {
; AVX512F-LABEL: shuffle_v64i8_62_63_64_65_66_67_68_69_70_71_72_73_74_75_76_77_78_79_80_81_82_83_84_85_86_87_88_89_90_91_92_93_94_95_96_97_98_99_100_101_102_103_104_105_106_107_108_109_110_111_112_113_114_115_116_117_118_119_120_121_122_123_124_125:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm0[2,3],ymm2[0,1]
-; AVX512F-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13],ymm3[30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27,28,29]
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm0[0,1]
+; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm0[2,3,4,5,4,5,6,7]
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13],ymm2[30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27,28,29]
+; AVX512F-NEXT: valignq {{.*#+}} zmm1 = zmm1[6,7],zmm0[0,1,2,3,4,5]
; AVX512F-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13],ymm1[30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29]
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
@@ -1157,11 +1150,10 @@ define <64 x i8> @shuffle_v64i8_62_63_64_65_66_67_68_69_70_71_72_73_74_75_76_77_
;
; AVX512DQ-LABEL: shuffle_v64i8_62_63_64_65_66_67_68_69_70_71_72_73_74_75_76_77_78_79_80_81_82_83_84_85_86_87_88_89_90_91_92_93_94_95_96_97_98_99_100_101_102_103_104_105_106_107_108_109_110_111_112_113_114_115_116_117_118_119_120_121_122_123_124_125:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm0[2,3],ymm2[0,1]
-; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13],ymm3[30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27,28,29]
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1
-; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm0[0,1]
+; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm0[2,3,4,5,4,5,6,7]
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13],ymm2[30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27,28,29]
+; AVX512DQ-NEXT: valignq {{.*#+}} zmm1 = zmm1[6,7],zmm0[0,1,2,3,4,5]
; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13],ymm1[30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29]
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
@@ -1178,11 +1170,10 @@ define <64 x i8> @shuffle_v64i8_62_63_64_65_66_67_68_69_70_71_72_73_74_75_76_77_
define <64 x i8> @shuffle_v64i8_63_64_65_66_67_68_69_70_71_72_73_74_75_76_77_78_79_80_81_82_83_84_85_86_87_88_89_90_91_92_93_94_95_96_97_98_99_100_101_102_103_104_105_106_107_108_109_110_111_112_113_114_115_116_117_118_119_120_121_122_123_124_125_126(<64 x i8> %a0, <64 x i8> %a1) {
; AVX512F-LABEL: shuffle_v64i8_63_64_65_66_67_68_69_70_71_72_73_74_75_76_77_78_79_80_81_82_83_84_85_86_87_88_89_90_91_92_93_94_95_96_97_98_99_100_101_102_103_104_105_106_107_108_109_110_111_112_113_114_115_116_117_118_119_120_121_122_123_124_125_126:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm0[2,3],ymm2[0,1]
-; AVX512F-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm3[31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
-; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm1
-; AVX512F-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm0[0,1]
+; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm0[2,3,4,5,4,5,6,7]
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512F-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm2[31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
+; AVX512F-NEXT: valignq {{.*#+}} zmm1 = zmm1[6,7],zmm0[0,1,2,3,4,5]
; AVX512F-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm1[31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512F-NEXT: retq
@@ -1195,11 +1186,10 @@ define <64 x i8> @shuffle_v64i8_63_64_65_66_67_68_69_70_71_72_73_74_75_76_77_78_
;
; AVX512DQ-LABEL: shuffle_v64i8_63_64_65_66_67_68_69_70_71_72_73_74_75_76_77_78_79_80_81_82_83_84_85_86_87_88_89_90_91_92_93_94_95_96_97_98_99_100_101_102_103_104_105_106_107_108_109_110_111_112_113_114_115_116_117_118_119_120_121_122_123_124_125_126:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm0[2,3],ymm2[0,1]
-; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm3[31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1
-; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm0[0,1]
+; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm0[2,3,4,5,4,5,6,7]
+; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3
+; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm2[31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
+; AVX512DQ-NEXT: valignq {{.*#+}} zmm1 = zmm1[6,7],zmm0[0,1,2,3,4,5]
; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14],ymm1[31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
@@ -1631,15 +1621,14 @@ define <64 x i8> @shuffle_v8i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01_0
define <64 x i8> @PR54562_ref(<64 x i8> %a0) {
; AVX512F-LABEL: PR54562_ref:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,1,2]
+; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm1
+; AVX512F-NEXT: valignq {{.*#+}} zmm2 = zmm0[3,4,5,6,7,0,1,2]
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10,5,4,6,5,8,7,9,8,11,10,12,11,14,13,15,14]
; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm3
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512F-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
-; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,2]
; AVX512F-NEXT: vpshufb %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: PR54562_ref:
@@ -1651,15 +1640,14 @@ define <64 x i8> @PR54562_ref(<64 x i8> %a0) {
;
; AVX512DQ-LABEL: PR54562_ref:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,1,2]
+; AVX512DQ-NEXT: vextracti32x4 $2, %zmm0, %xmm1
+; AVX512DQ-NEXT: valignq {{.*#+}} zmm2 = zmm0[3,4,5,6,7,0,1,2]
+; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10,5,4,6,5,8,7,9,8,11,10,12,11,14,13,15,14]
; AVX512DQ-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX512DQ-NEXT: vextracti32x4 $2, %zmm0, %xmm3
-; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,2]
; AVX512DQ-NEXT: vpshufb %ymm2, %ymm0, %ymm0
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: retq
;
; AVX512VBMI-LABEL: PR54562_ref:
@@ -1675,9 +1663,9 @@ define <64 x i8> @PR54562_ref(<64 x i8> %a0) {
define void @PR54562_mem(ptr %src, ptr %dst) {
; AVX512F-LABEL: PR54562_mem:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm0
-; AVX512F-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
-; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512F-NEXT: valignq {{.*#+}} zmm0 = zmm0[3,4,5,6,7,0,1,2]
+; AVX512F-NEXT: vinserti128 $1, 32(%rdi), %ymm0, %ymm0
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10,5,4,6,5,8,7,9,8,11,10,12,11,14,13,15,14]
; AVX512F-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = mem[0,1,1,2]
@@ -1698,9 +1686,9 @@ define void @PR54562_mem(ptr %src, ptr %dst) {
;
; AVX512DQ-LABEL: PR54562_mem:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm0
-; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512DQ-NEXT: valignq {{.*#+}} zmm0 = zmm0[3,4,5,6,7,0,1,2]
+; AVX512DQ-NEXT: vinserti128 $1, 32(%rdi), %ymm0, %ymm0
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,2,1,4,3,5,4,7,6,8,7,10,9,11,10,5,4,6,5,8,7,9,8,11,10,12,11,14,13,15,14]
; AVX512DQ-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = mem[0,1,1,2]
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
index fce98cd470bcd..dd33a5b7067c9 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
@@ -1836,8 +1836,7 @@ define <4 x double> @test_v8f64_2346 (<8 x double> %v) {
define <2 x double> @test_v8f64_34 (<8 x double> %v) {
; ALL-LABEL: test_v8f64_34:
; ALL: # %bb.0:
-; ALL-NEXT: vpmovsxbq {{.*#+}} xmm1 = [3,4]
-; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0
+; ALL-NEXT: valignq {{.*#+}} zmm0 = zmm0[3,4,5,6,7,0,1,2]
; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; ALL-NEXT: vzeroupper
; ALL-NEXT: ret{{[l|q]}}
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll b/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll
index 545a9d3e314a2..a22fa94a0e879 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll
@@ -547,53 +547,33 @@ define <16 x float> @test_masked_permps_v16f32(ptr %vp, <16 x float> %vec2) {
}
define void @test_demandedelts_pshufb_v32i8_v16i8(ptr %src, ptr %dst) {
-; X86-AVX512-SLOW-LABEL: test_demandedelts_pshufb_v32i8_v16i8:
-; X86-AVX512-SLOW: # %bb.0:
-; X86-AVX512-SLOW-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-SLOW-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX512-SLOW-NEXT: vpbroadcastd 44(%ecx), %xmm0
-; X86-AVX512-SLOW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; X86-AVX512-SLOW-NEXT: vmovdqa %ymm0, 672(%eax)
-; X86-AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[1,0,2,3]
-; X86-AVX512-SLOW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; X86-AVX512-SLOW-NEXT: vmovdqa %ymm0, 832(%eax)
-; X86-AVX512-SLOW-NEXT: vzeroupper
-; X86-AVX512-SLOW-NEXT: retl
-;
-; X64-AVX512-SLOW-LABEL: test_demandedelts_pshufb_v32i8_v16i8:
-; X64-AVX512-SLOW: # %bb.0:
-; X64-AVX512-SLOW-NEXT: vpbroadcastd 44(%rdi), %xmm0
-; X64-AVX512-SLOW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; X64-AVX512-SLOW-NEXT: vmovdqa %ymm0, 672(%rsi)
-; X64-AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[1,0,2,3]
-; X64-AVX512-SLOW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; X64-AVX512-SLOW-NEXT: vmovdqa %ymm0, 832(%rsi)
-; X64-AVX512-SLOW-NEXT: vzeroupper
-; X64-AVX512-SLOW-NEXT: retq
-;
-; X86-AVX512-FAST-LABEL: test_demandedelts_pshufb_v32i8_v16i8:
-; X86-AVX512-FAST: # %bb.0:
-; X86-AVX512-FAST-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-FAST-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX512-FAST-NEXT: vpbroadcastd 44(%ecx), %xmm0
-; X86-AVX512-FAST-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; X86-AVX512-FAST-NEXT: vmovdqa %ymm0, 672(%eax)
-; X86-AVX512-FAST-NEXT: vmovdqa 208(%ecx), %xmm0
-; X86-AVX512-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7,0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero
-; X86-AVX512-FAST-NEXT: vmovdqa %ymm0, 832(%eax)
-; X86-AVX512-FAST-NEXT: vzeroupper
-; X86-AVX512-FAST-NEXT: retl
-;
-; X64-AVX512-FAST-LABEL: test_demandedelts_pshufb_v32i8_v16i8:
-; X64-AVX512-FAST: # %bb.0:
-; X64-AVX512-FAST-NEXT: vpbroadcastd 44(%rdi), %xmm0
-; X64-AVX512-FAST-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; X64-AVX512-FAST-NEXT: vmovdqa %ymm0, 672(%rsi)
-; X64-AVX512-FAST-NEXT: vmovdqa 208(%rdi), %xmm0
-; X64-AVX512-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,6,7,0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero
-; X64-AVX512-FAST-NEXT: vmovdqa %ymm0, 832(%rsi)
-; X64-AVX512-FAST-NEXT: vzeroupper
-; X64-AVX512-FAST-NEXT: retq
+; X86-AVX512-LABEL: test_demandedelts_pshufb_v32i8_v16i8:
+; X86-AVX512: # %bb.0:
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX512-NEXT: vpbroadcastd 44(%ecx), %xmm0
+; X86-AVX512-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X86-AVX512-NEXT: vmovdqa %ymm0, 672(%eax)
+; X86-AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [13,12,2,3]
+; X86-AVX512-NEXT: vpermi2d 192(%ecx), %ymm1, %ymm0
+; X86-AVX512-NEXT: vmovdqa %xmm0, %xmm0
+; X86-AVX512-NEXT: vmovdqa %ymm0, 832(%eax)
+; X86-AVX512-NEXT: vzeroupper
+; X86-AVX512-NEXT: retl
+;
+; X64-AVX512-LABEL: test_demandedelts_pshufb_v32i8_v16i8:
+; X64-AVX512: # %bb.0:
+; X64-AVX512-NEXT: vpbroadcastd 44(%rdi), %xmm0
+; X64-AVX512-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; X64-AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X64-AVX512-NEXT: vmovdqa %ymm0, 672(%rsi)
+; X64-AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [13,12,2,3]
+; X64-AVX512-NEXT: vpermi2d 192(%rdi), %ymm1, %ymm0
+; X64-AVX512-NEXT: vmovdqa %xmm0, %xmm0
+; X64-AVX512-NEXT: vmovdqa %ymm0, 832(%rsi)
+; X64-AVX512-NEXT: vzeroupper
+; X64-AVX512-NEXT: retq
;
; X86-AVX512F-LABEL: test_demandedelts_pshufb_v32i8_v16i8:
; X86-AVX512F: # %bb.0:
@@ -602,8 +582,11 @@ define void @test_demandedelts_pshufb_v32i8_v16i8(ptr %src, ptr %dst) {
; X86-AVX512F-NEXT: vpbroadcastd 44(%ecx), %xmm0
; X86-AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; X86-AVX512F-NEXT: vmovdqa %ymm0, 672(%eax)
-; X86-AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[1,0,2,3]
-; X86-AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; X86-AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm0 = [21,20,2,3]
+; X86-AVX512F-NEXT: vmovdqa 192(%ecx), %ymm1
+; X86-AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; X86-AVX512F-NEXT: vpermt2d %zmm1, %zmm0, %zmm2
+; X86-AVX512F-NEXT: vmovdqa %xmm2, %xmm0
; X86-AVX512F-NEXT: vmovdqa %ymm0, 832(%eax)
; X86-AVX512F-NEXT: vzeroupper
; X86-AVX512F-NEXT: retl
@@ -613,8 +596,11 @@ define void @test_demandedelts_pshufb_v32i8_v16i8(ptr %src, ptr %dst) {
; X64-AVX512F-NEXT: vpbroadcastd 44(%rdi), %xmm0
; X64-AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; X64-AVX512F-NEXT: vmovdqa %ymm0, 672(%rsi)
-; X64-AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[1,0,2,3]
-; X64-AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; X64-AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm0 = [21,20,2,3]
+; X64-AVX512F-NEXT: vmovdqa 192(%rdi), %ymm1
+; X64-AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; X64-AVX512F-NEXT: vpermt2d %zmm1, %zmm0, %zmm2
+; X64-AVX512F-NEXT: vmovdqa %xmm2, %xmm0
; X64-AVX512F-NEXT: vmovdqa %ymm0, 832(%rsi)
; X64-AVX512F-NEXT: vzeroupper
; X64-AVX512F-NEXT: retq
@@ -689,3 +675,8 @@ entry:
store <16 x float> %1, ptr %a0, align 64
ret void
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; X64-AVX512-FAST: {{.*}}
+; X64-AVX512-SLOW: {{.*}}
+; X86-AVX512-FAST: {{.*}}
+; X86-AVX512-SLOW: {{.*}}
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
index 2df013d0ff3e3..c80c85c03093e 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll
@@ -140,10 +140,10 @@ define <8 x float> @combine_vpermilvar_vperm2f128_zero_8f32(<8 x float> %a0) {
; AVX512-LABEL: combine_vpermilvar_vperm2f128_zero_8f32:
; AVX512: # %bb.0:
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm1 = [16,17,18,19,3,2,1,0]
-; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0
-; AVX512-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm1 = [8,9,0,1]
+; AVX512-NEXT: vxorpd %xmm2, %xmm2, %xmm2
+; AVX512-NEXT: vpermt2pd %zmm2, %zmm1, %zmm0
+; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512-NEXT: ret{{[l|q]}}
%1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>)
%2 = shufflevector <8 x float> %1, <8 x float> zeroinitializer, <8 x i32> <i32 8, i32 8, i32 8, i32 8, i32 0, i32 1, i32 2, i32 3>
@@ -502,9 +502,9 @@ define void @PR48908(<4 x double> %v0, <4 x double> %v1, <4 x double> %v2, ptr n
; X86-AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,0]
; X86-AVX2-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3]
; X86-AVX2-NEXT: vmovapd %ymm3, (%ecx)
-; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X86-AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
; X86-AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
-; X86-AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
+; X86-AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3]
; X86-AVX2-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]
; X86-AVX2-NEXT: vmovapd %ymm0, (%eax)
; X86-AVX2-NEXT: vzeroupper
@@ -518,21 +518,23 @@ define void @PR48908(<4 x double> %v0, <4 x double> %v1, <4 x double> %v2, ptr n
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,2,8,9]
-; X86-AVX512-NEXT: vpermi2pd %zmm2, %zmm1, %zmm3
-; X86-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,8,2,1]
-; X86-AVX512-NEXT: vpermi2pd %zmm1, %zmm0, %zmm4
-; X86-AVX512-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[0,1],ymm2[0,1]
-; X86-AVX512-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3]
-; X86-AVX512-NEXT: vmovapd %ymm4, (%edx)
-; X86-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,3,10,1]
-; X86-AVX512-NEXT: vpermi2pd %zmm0, %zmm3, %zmm4
-; X86-AVX512-NEXT: vmovapd %ymm4, (%ecx)
+; X86-AVX512-NEXT: vbroadcastsd %xmm1, %ymm3
+; X86-AVX512-NEXT: vblendpd {{.*#+}} ymm4 = ymm0[0,1,2],ymm3[3]
+; X86-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm5 = [1,2,8,9]
+; X86-AVX512-NEXT: vpermi2pd %zmm2, %zmm1, %zmm5
+; X86-AVX512-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[0,1],ymm2[0,1]
+; X86-AVX512-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm0[0,1]
+; X86-AVX512-NEXT: vblendpd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3]
+; X86-AVX512-NEXT: vmovapd %ymm3, (%edx)
+; X86-AVX512-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm4[2,3]
+; X86-AVX512-NEXT: vpermpd {{.*#+}} ymm4 = ymm5[0,3,2,1]
+; X86-AVX512-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3]
+; X86-AVX512-NEXT: vmovapd %ymm3, (%ecx)
; X86-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm3 = [3,11,0,0]
-; X86-AVX512-NEXT: vpermi2pd %zmm1, %zmm0, %zmm3
-; X86-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [2,8,9,3]
-; X86-AVX512-NEXT: vpermi2pd %zmm3, %zmm2, %zmm0
-; X86-AVX512-NEXT: vmovapd %ymm0, (%eax)
+; X86-AVX512-NEXT: vpermt2pd %zmm1, %zmm3, %zmm0
+; X86-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm1 = [2,8,9,3]
+; X86-AVX512-NEXT: vpermi2pd %zmm0, %zmm2, %zmm1
+; X86-AVX512-NEXT: vmovapd %ymm1, (%eax)
; X86-AVX512-NEXT: vzeroupper
; X86-AVX512-NEXT: retl
;
@@ -572,9 +574,9 @@ define void @PR48908(<4 x double> %v0, <4 x double> %v1, <4 x double> %v2, ptr n
; X64-AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,0]
; X64-AVX2-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3]
; X64-AVX2-NEXT: vmovapd %ymm3, (%rsi)
-; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X64-AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3]
; X64-AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
-; X64-AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
+; X64-AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3]
; X64-AVX2-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]
; X64-AVX2-NEXT: vmovapd %ymm0, (%rdx)
; X64-AVX2-NEXT: vzeroupper
@@ -585,21 +587,23 @@ define void @PR48908(<4 x double> %v0, <4 x double> %v1, <4 x double> %v2, ptr n
; X64-AVX512-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
; X64-AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; X64-AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; X64-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm3 = [1,2,8,9]
-; X64-AVX512-NEXT: vpermi2pd %zmm2, %zmm1, %zmm3
-; X64-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,8,2,1]
-; X64-AVX512-NEXT: vpermi2pd %zmm1, %zmm0, %zmm4
-; X64-AVX512-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[0,1],ymm2[0,1]
-; X64-AVX512-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3]
-; X64-AVX512-NEXT: vmovapd %ymm4, (%rdi)
-; X64-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm4 = [0,3,10,1]
-; X64-AVX512-NEXT: vpermi2pd %zmm0, %zmm3, %zmm4
-; X64-AVX512-NEXT: vmovapd %ymm4, (%rsi)
+; X64-AVX512-NEXT: vbroadcastsd %xmm1, %ymm3
+; X64-AVX512-NEXT: vblendpd {{.*#+}} ymm4 = ymm0[0,1,2],ymm3[3]
+; X64-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm5 = [1,2,8,9]
+; X64-AVX512-NEXT: vpermi2pd %zmm2, %zmm1, %zmm5
+; X64-AVX512-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm0[0,1],ymm2[0,1]
+; X64-AVX512-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3],ymm0[0,1]
+; X64-AVX512-NEXT: vblendpd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3]
+; X64-AVX512-NEXT: vmovapd %ymm3, (%rdi)
+; X64-AVX512-NEXT: vblendpd {{.*#+}} ymm3 = ymm5[0,1],ymm4[2,3]
+; X64-AVX512-NEXT: vpermpd {{.*#+}} ymm4 = ymm5[0,3,2,1]
+; X64-AVX512-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3]
+; X64-AVX512-NEXT: vmovapd %ymm3, (%rsi)
; X64-AVX512-NEXT: vpmovsxbq {{.*#+}} xmm3 = [3,11]
-; X64-AVX512-NEXT: vpermi2pd %zmm1, %zmm0, %zmm3
-; X64-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm0 = [2,8,9,3]
-; X64-AVX512-NEXT: vpermi2pd %zmm3, %zmm2, %zmm0
-; X64-AVX512-NEXT: vmovapd %ymm0, (%rdx)
+; X64-AVX512-NEXT: vpermt2pd %zmm1, %zmm3, %zmm0
+; X64-AVX512-NEXT: vpmovsxbq {{.*#+}} ymm1 = [2,8,9,3]
+; X64-AVX512-NEXT: vpermi2pd %zmm0, %zmm2, %zmm1
+; X64-AVX512-NEXT: vmovapd %ymm1, (%rdx)
; X64-AVX512-NEXT: vzeroupper
; X64-AVX512-NEXT: retq
%t0 = shufflevector <4 x double> %v0, <4 x double> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
index 68040b58858a7..d7807b0c821ab 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -1566,23 +1566,14 @@ define <4 x i32> @combine_test21(<8 x i32> %a, ptr %ptr) {
; SSE-NEXT: movaps %xmm2, (%rdi)
; SSE-NEXT: retq
;
-; AVX1-LABEL: combine_test21:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
-; AVX1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX1-NEXT: vmovaps %xmm2, (%rdi)
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: combine_test21:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[1,3,2,3]
-; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vmovaps %xmm0, (%rdi)
-; AVX2-NEXT: vmovaps %xmm1, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX-LABEL: combine_test21:
+; AVX: # %bb.0:
+; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
+; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX-NEXT: vmovaps %xmm2, (%rdi)
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
%1 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
%2 = shufflevector <8 x i32> %a, <8 x i32> %a, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
store <4 x i32> %1, ptr %ptr, align 16
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-v192.ll b/llvm/test/CodeGen/X86/vector-shuffle-v192.ll
index 497f71aea2227..0b00af60b56f7 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-v192.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-v192.ll
@@ -375,46 +375,46 @@ define <64 x i8> @f3(ptr %p0) {
;
; AVX512BW-LABEL: f3:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm0
+; AVX512BW-NEXT: vmovdqa 160(%rdi), %xmm0
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,0,4,6,10,12,128,128,128,128,128,128]
; AVX512BW-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vmovdqa 80(%rdi), %xmm2
+; AVX512BW-NEXT: vmovdqa 176(%rdi), %xmm2
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,128,128,128,128,128,0,2,6,8,12,14]
; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm2
-; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm4
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,128,128,0,4,6,10,12,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpshufb %xmm5, %xmm4, %xmm4
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [2,4,8,10,14,128,128,128,128,128,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpshufb %xmm6, %xmm2, %xmm2
-; AVX512BW-NEXT: vpor %xmm4, %xmm2, %xmm2
-; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm4
-; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [2,4,8,10,14,0,0,0,0,0,0,2,6,8,12,14,2,4,8,10,14,0,0,0,0,0,0,2,6,8,12,14]
-; AVX512BW-NEXT: # ymm7 = mem[0,1,0,1]
-; AVX512BW-NEXT: vpshufb %ymm7, %ymm4, %ymm4
-; AVX512BW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm4[5,6,7]
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
+; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm2
+; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [2,4,8,10,14,0,0,0,0,0,0,2,6,8,12,14,2,4,8,10,14,0,0,0,0,0,0,2,6,8,12,14]
+; AVX512BW-NEXT: # ymm4 = mem[0,1,0,1]
+; AVX512BW-NEXT: vpshufb %ymm4, %ymm2, %ymm2
; AVX512BW-NEXT: movl $-2097152, %eax # imm = 0xFFE00000
; AVX512BW-NEXT: kmovd %eax, %k1
-; AVX512BW-NEXT: vmovdqu8 %ymm0, %ymm2 {%k1}
-; AVX512BW-NEXT: vmovdqa 112(%rdi), %xmm0
-; AVX512BW-NEXT: vpshufb %xmm5, %xmm0, %xmm0
-; AVX512BW-NEXT: vmovdqa 96(%rdi), %xmm4
-; AVX512BW-NEXT: vpshufb %xmm6, %xmm4, %xmm4
-; AVX512BW-NEXT: vpor %xmm0, %xmm4, %xmm0
-; AVX512BW-NEXT: vmovdqa 160(%rdi), %xmm4
-; AVX512BW-NEXT: vpshufb %xmm1, %xmm4, %xmm1
-; AVX512BW-NEXT: vmovdqa 176(%rdi), %xmm4
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm4, %xmm3
-; AVX512BW-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX512BW-NEXT: vpblendmb %ymm0, %ymm2, %ymm0 {%k1}
+; AVX512BW-NEXT: vmovdqa 112(%rdi), %xmm5
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,128,128,0,4,6,10,12,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpshufb %xmm6, %xmm5, %xmm5
+; AVX512BW-NEXT: vmovdqa 96(%rdi), %xmm7
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm8 = [2,4,8,10,14,128,128,128,128,128,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpshufb %xmm8, %xmm7, %xmm7
+; AVX512BW-NEXT: vpor %xmm5, %xmm7, %xmm5
+; AVX512BW-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3,4],xmm2[5,6,7]
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm2
+; AVX512BW-NEXT: vpshufb %xmm1, %xmm2, %xmm1
+; AVX512BW-NEXT: vmovdqa 80(%rdi), %xmm2
+; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpor %xmm1, %xmm2, %xmm1
; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm3
-; AVX512BW-NEXT: vpshufb %ymm7, %ymm3, %ymm3
-; AVX512BW-NEXT: vmovdqu8 %ymm1, %ymm3 {%k1}
-; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7]
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm2
+; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm3
+; AVX512BW-NEXT: vpshufb %xmm6, %xmm3, %xmm3
+; AVX512BW-NEXT: vpshufb %xmm8, %xmm2, %xmm2
+; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm3
+; AVX512BW-NEXT: vpshufb %ymm4, %ymm3, %ymm3
+; AVX512BW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7]
+; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; AVX512BW-NEXT: vmovdqu8 %ymm1, %ymm2 {%k1}
; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-trunc.ll b/llvm/test/CodeGen/X86/vector-trunc.ll
index 732cc445ddcd8..656892c979320 100644
--- a/llvm/test/CodeGen/X86/vector-trunc.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc.ll
@@ -75,27 +75,13 @@ define <8 x i32> @trunc8i64_8i32_ashr(<8 x i64> %a) {
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
; AVX1-NEXT: retq
;
-; AVX2-SLOW-LABEL: trunc8i64_8i32_ashr:
-; AVX2-SLOW: # %bb.0: # %entry
-; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
-; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-ALL-LABEL: trunc8i64_8i32_ashr:
-; AVX2-FAST-ALL: # %bb.0: # %entry
-; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} xmm2 = [1,3,5,7]
-; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm0
-; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm2, %ymm1
-; AVX2-FAST-ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-FAST-ALL-NEXT: retq
-;
-; AVX2-FAST-PERLANE-LABEL: trunc8i64_8i32_ashr:
-; AVX2-FAST-PERLANE: # %bb.0: # %entry
-; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
-; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
-; AVX2-FAST-PERLANE-NEXT: retq
+; AVX2-LABEL: trunc8i64_8i32_ashr:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vmovaps {{.*#+}} xmm2 = [1,3,5,7]
+; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc8i64_8i32_ashr:
; AVX512: # %bb.0: # %entry
@@ -123,27 +109,13 @@ define <8 x i32> @trunc8i64_8i32_lshr(<8 x i64> %a) {
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
; AVX1-NEXT: retq
;
-; AVX2-SLOW-LABEL: trunc8i64_8i32_lshr:
-; AVX2-SLOW: # %bb.0: # %entry
-; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
-; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-ALL-LABEL: trunc8i64_8i32_lshr:
-; AVX2-FAST-ALL: # %bb.0: # %entry
-; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} xmm2 = [1,3,5,7]
-; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm0
-; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm2, %ymm1
-; AVX2-FAST-ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-FAST-ALL-NEXT: retq
-;
-; AVX2-FAST-PERLANE-LABEL: trunc8i64_8i32_lshr:
-; AVX2-FAST-PERLANE: # %bb.0: # %entry
-; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
-; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
-; AVX2-FAST-PERLANE-NEXT: retq
+; AVX2-LABEL: trunc8i64_8i32_lshr:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vmovaps {{.*#+}} xmm2 = [1,3,5,7]
+; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1
+; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT: retq
;
; AVX512-LABEL: trunc8i64_8i32_lshr:
; AVX512: # %bb.0: # %entry
diff --git a/llvm/test/CodeGen/X86/vselect-packss.ll b/llvm/test/CodeGen/X86/vselect-packss.ll
index 5b14e2782ee1c..1a5af351b7fc9 100644
--- a/llvm/test/CodeGen/X86/vselect-packss.ll
+++ b/llvm/test/CodeGen/X86/vselect-packss.ll
@@ -422,8 +422,7 @@ define <16 x i8> @vselect_packss(<16 x i16> %a0, <16 x i16> %a1, <16 x i8> %a2,
; AVX512BWNOVL-LABEL: vselect_packss:
; AVX512BWNOVL: # %bb.0:
; AVX512BWNOVL-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
-; AVX512BWNOVL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BWNOVL-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX512BWNOVL-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BWNOVL-NEXT: vpblendvb %xmm0, %xmm2, %xmm3, %xmm0
; AVX512BWNOVL-NEXT: vzeroupper
; AVX512BWNOVL-NEXT: retq
@@ -431,8 +430,7 @@ define <16 x i8> @vselect_packss(<16 x i16> %a0, <16 x i16> %a1, <16 x i8> %a2,
; AVX512BWVL-LABEL: vselect_packss:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BWVL-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
; AVX512BWVL-NEXT: vpternlogq {{.*#+}} xmm0 = xmm3 ^ (xmm0 & (xmm2 ^ xmm3))
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/widen_fadd.ll b/llvm/test/CodeGen/X86/widen_fadd.ll
index c3700189d3d0e..6b98c108cbc68 100644
--- a/llvm/test/CodeGen/X86/widen_fadd.ll
+++ b/llvm/test/CodeGen/X86/widen_fadd.ll
@@ -217,11 +217,11 @@ define void @widen_fadd_v2f32_v16f32(ptr %a0, ptr %b0, ptr %c0) {
; AVX512F-NEXT: vinsertf32x4 $1, %xmm5, %zmm4, %zmm4
; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,2,8,10,0,2,8,10]
; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vpermt2pd %zmm6, %zmm5, %zmm4
-; AVX512F-NEXT: vinsertf32x4 $1, %xmm3, %zmm2, %zmm2
-; AVX512F-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpermt2pd %zmm2, %zmm5, %zmm0
-; AVX512F-NEXT: vinsertf64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512F-NEXT: vpermi2pd %zmm6, %zmm4, %zmm5
+; AVX512F-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX512F-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX512F-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX512F-NEXT: vinsertf64x4 $0, %ymm0, %zmm5, %zmm0
; AVX512F-NEXT: vmovupd %zmm0, (%rdx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
@@ -252,16 +252,17 @@ define void @widen_fadd_v2f32_v16f32(ptr %a0, ptr %b0, ptr %c0) {
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm7 = mem[0],zero
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm8 = mem[0],zero
; AVX512VL-NEXT: vaddps %xmm7, %xmm8, %xmm7
-; AVX512VL-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,6]
-; AVX512VL-NEXT: vpermi2pd %ymm2, %ymm0, %ymm1
-; AVX512VL-NEXT: vinsertf32x4 $1, %xmm7, %zmm6, %zmm0
-; AVX512VL-NEXT: vinsertf32x4 $1, %xmm5, %zmm4, %zmm2
-; AVX512VL-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,2,8,10,0,2,8,10]
-; AVX512VL-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
-; AVX512VL-NEXT: vpermi2pd %zmm0, %zmm2, %zmm3
-; AVX512VL-NEXT: vinsertf64x4 $0, %ymm1, %zmm3, %zmm0
+; AVX512VL-NEXT: vinsertf32x4 $1, %xmm7, %zmm6, %zmm6
+; AVX512VL-NEXT: vinsertf32x4 $1, %xmm5, %zmm4, %zmm4
+; AVX512VL-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,2,8,10,0,2,8,10]
+; AVX512VL-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
+; AVX512VL-NEXT: vpermi2pd %zmm6, %zmm4, %zmm5
+; AVX512VL-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
+; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
+; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX512VL-NEXT: vinsertf64x4 $0, %ymm0, %zmm5, %zmm0
; AVX512VL-NEXT: vmovupd %zmm0, (%rdx)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/widen_fdiv.ll b/llvm/test/CodeGen/X86/widen_fdiv.ll
index fd251a99ca500..8b1f27a62cb66 100644
--- a/llvm/test/CodeGen/X86/widen_fdiv.ll
+++ b/llvm/test/CodeGen/X86/widen_fdiv.ll
@@ -155,34 +155,34 @@ define void @widen_fdiv_v2f32_v16f32(ptr %a0, ptr %b0, ptr %c0) {
; AVX512F-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
; AVX512F-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero
; AVX512F-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512F-NEXT: vdivps %xmm4, %xmm0, %xmm0
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512F-NEXT: vdivps %xmm4, %xmm1, %xmm1
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512F-NEXT: vdivps %xmm4, %xmm2, %xmm2
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
-; AVX512F-NEXT: vdivps %xmm4, %xmm3, %xmm3
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero
; AVX512F-NEXT: vmovsd {{.*#+}} xmm5 = mem[0],zero
-; AVX512F-NEXT: vdivps %xmm5, %xmm4, %xmm4
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm5 = mem[0],zero
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm6 = mem[0],zero
-; AVX512F-NEXT: vdivps %xmm6, %xmm5, %xmm5
; AVX512F-NEXT: vmovsd {{.*#+}} xmm6 = mem[0],zero
; AVX512F-NEXT: vmovsd {{.*#+}} xmm7 = mem[0],zero
-; AVX512F-NEXT: vdivps %xmm7, %xmm6, %xmm6
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm7 = mem[0],zero
; AVX512F-NEXT: vmovsd {{.*#+}} xmm8 = mem[0],zero
-; AVX512F-NEXT: vdivps %xmm8, %xmm7, %xmm7
-; AVX512F-NEXT: vinsertf32x4 $1, %xmm7, %zmm6, %zmm6
-; AVX512F-NEXT: vinsertf32x4 $1, %xmm5, %zmm4, %zmm4
-; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,2,8,10,0,2,8,10]
-; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vpermt2pd %zmm6, %zmm5, %zmm4
-; AVX512F-NEXT: vinsertf32x4 $1, %xmm3, %zmm2, %zmm2
-; AVX512F-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpermt2pd %zmm2, %zmm5, %zmm0
-; AVX512F-NEXT: vinsertf64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512F-NEXT: vmovsd {{.*#+}} xmm9 = mem[0],zero
+; AVX512F-NEXT: vdivps %xmm9, %xmm8, %xmm8
+; AVX512F-NEXT: vmovsd {{.*#+}} xmm9 = mem[0],zero
+; AVX512F-NEXT: vmovsd {{.*#+}} xmm10 = mem[0],zero
+; AVX512F-NEXT: vdivps %xmm10, %xmm9, %xmm9
+; AVX512F-NEXT: vmovsd {{.*#+}} xmm10 = mem[0],zero
+; AVX512F-NEXT: vmovsd {{.*#+}} xmm11 = mem[0],zero
+; AVX512F-NEXT: vdivps %xmm11, %xmm10, %xmm10
+; AVX512F-NEXT: vmovsd {{.*#+}} xmm11 = mem[0],zero
+; AVX512F-NEXT: vmovsd {{.*#+}} xmm12 = mem[0],zero
+; AVX512F-NEXT: vdivps %xmm12, %xmm11, %xmm11
+; AVX512F-NEXT: vinsertf32x4 $1, %xmm11, %zmm10, %zmm10
+; AVX512F-NEXT: vinsertf32x4 $1, %xmm9, %zmm8, %zmm8
+; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,2,8,10,0,2,8,10]
+; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT: vpermt2pd %zmm10, %zmm9, %zmm8
+; AVX512F-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6
+; AVX512F-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX512F-NEXT: vdivps %ymm6, %ymm2, %ymm2
+; AVX512F-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm3
+; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT: vdivps %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpermt2pd %zmm2, %zmm9, %zmm0
+; AVX512F-NEXT: vinsertf64x4 $0, %ymm0, %zmm8, %zmm0
; AVX512F-NEXT: vmovupd %zmm0, (%rdx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/widen_fmul.ll b/llvm/test/CodeGen/X86/widen_fmul.ll
index 6c3e0ff5a9bcd..56e81fed5af8a 100644
--- a/llvm/test/CodeGen/X86/widen_fmul.ll
+++ b/llvm/test/CodeGen/X86/widen_fmul.ll
@@ -217,11 +217,11 @@ define void @widen_fmul_v2f32_v16f32(ptr %a0, ptr %b0, ptr %c0) {
; AVX512F-NEXT: vinsertf32x4 $1, %xmm5, %zmm4, %zmm4
; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,2,8,10,0,2,8,10]
; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vpermt2pd %zmm6, %zmm5, %zmm4
-; AVX512F-NEXT: vinsertf32x4 $1, %xmm3, %zmm2, %zmm2
-; AVX512F-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpermt2pd %zmm2, %zmm5, %zmm0
-; AVX512F-NEXT: vinsertf64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512F-NEXT: vpermi2pd %zmm6, %zmm4, %zmm5
+; AVX512F-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX512F-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX512F-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX512F-NEXT: vinsertf64x4 $0, %ymm0, %zmm5, %zmm0
; AVX512F-NEXT: vmovupd %zmm0, (%rdx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
@@ -252,16 +252,17 @@ define void @widen_fmul_v2f32_v16f32(ptr %a0, ptr %b0, ptr %c0) {
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm7 = mem[0],zero
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm8 = mem[0],zero
; AVX512VL-NEXT: vmulps %xmm7, %xmm8, %xmm7
-; AVX512VL-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,6]
-; AVX512VL-NEXT: vpermi2pd %ymm2, %ymm0, %ymm1
-; AVX512VL-NEXT: vinsertf32x4 $1, %xmm7, %zmm6, %zmm0
-; AVX512VL-NEXT: vinsertf32x4 $1, %xmm5, %zmm4, %zmm2
-; AVX512VL-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,2,8,10,0,2,8,10]
-; AVX512VL-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
-; AVX512VL-NEXT: vpermi2pd %zmm0, %zmm2, %zmm3
-; AVX512VL-NEXT: vinsertf64x4 $0, %ymm1, %zmm3, %zmm0
+; AVX512VL-NEXT: vinsertf32x4 $1, %xmm7, %zmm6, %zmm6
+; AVX512VL-NEXT: vinsertf32x4 $1, %xmm5, %zmm4, %zmm4
+; AVX512VL-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,2,8,10,0,2,8,10]
+; AVX512VL-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
+; AVX512VL-NEXT: vpermi2pd %zmm6, %zmm4, %zmm5
+; AVX512VL-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
+; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
+; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX512VL-NEXT: vinsertf64x4 $0, %ymm0, %zmm5, %zmm0
; AVX512VL-NEXT: vmovupd %zmm0, (%rdx)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/widen_fsub.ll b/llvm/test/CodeGen/X86/widen_fsub.ll
index 7405d9b7b1c65..3c5a99a342f47 100644
--- a/llvm/test/CodeGen/X86/widen_fsub.ll
+++ b/llvm/test/CodeGen/X86/widen_fsub.ll
@@ -217,11 +217,11 @@ define void @widen_fsub_v2f32_v16f32(ptr %a0, ptr %b0, ptr %c0) {
; AVX512F-NEXT: vinsertf32x4 $1, %xmm5, %zmm4, %zmm4
; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,2,8,10,0,2,8,10]
; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
-; AVX512F-NEXT: vpermt2pd %zmm6, %zmm5, %zmm4
-; AVX512F-NEXT: vinsertf32x4 $1, %xmm3, %zmm2, %zmm2
-; AVX512F-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpermt2pd %zmm2, %zmm5, %zmm0
-; AVX512F-NEXT: vinsertf64x4 $0, %ymm0, %zmm4, %zmm0
+; AVX512F-NEXT: vpermi2pd %zmm6, %zmm4, %zmm5
+; AVX512F-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX512F-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX512F-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX512F-NEXT: vinsertf64x4 $0, %ymm0, %zmm5, %zmm0
; AVX512F-NEXT: vmovupd %zmm0, (%rdx)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
@@ -252,16 +252,17 @@ define void @widen_fsub_v2f32_v16f32(ptr %a0, ptr %b0, ptr %c0) {
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm7 = mem[0],zero
; AVX512VL-NEXT: vmovsd {{.*#+}} xmm8 = mem[0],zero
; AVX512VL-NEXT: vsubps %xmm8, %xmm7, %xmm7
-; AVX512VL-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,2,4,6]
-; AVX512VL-NEXT: vpermi2pd %ymm2, %ymm0, %ymm1
-; AVX512VL-NEXT: vinsertf32x4 $1, %xmm7, %zmm6, %zmm0
-; AVX512VL-NEXT: vinsertf32x4 $1, %xmm5, %zmm4, %zmm2
-; AVX512VL-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,2,8,10,0,2,8,10]
-; AVX512VL-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3]
-; AVX512VL-NEXT: vpermi2pd %zmm0, %zmm2, %zmm3
-; AVX512VL-NEXT: vinsertf64x4 $0, %ymm1, %zmm3, %zmm0
+; AVX512VL-NEXT: vinsertf32x4 $1, %xmm7, %zmm6, %zmm6
+; AVX512VL-NEXT: vinsertf32x4 $1, %xmm5, %zmm4, %zmm4
+; AVX512VL-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,2,8,10,0,2,8,10]
+; AVX512VL-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
+; AVX512VL-NEXT: vpermi2pd %zmm6, %zmm4, %zmm5
+; AVX512VL-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
+; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; AVX512VL-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
+; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX512VL-NEXT: vinsertf64x4 $0, %ymm0, %zmm5, %zmm0
; AVX512VL-NEXT: vmovupd %zmm0, (%rdx)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
index a01e6ca4b175d..43fc3ca4e1d47 100644
--- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
@@ -1275,116 +1275,122 @@ ret void
define <64 x i8> @interleaved_load_vf64_i8_stride3(ptr %ptr){
; AVX1-LABEL: interleaved_load_vf64_i8_stride3:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqu (%rdi), %xmm11
+; AVX1-NEXT: subq $40, %rsp
+; AVX1-NEXT: .cfi_def_cfa_offset 48
+; AVX1-NEXT: vmovdqu (%rdi), %xmm8
; AVX1-NEXT: vmovdqu 16(%rdi), %xmm1
; AVX1-NEXT: vmovdqu 48(%rdi), %xmm13
-; AVX1-NEXT: vmovups 64(%rdi), %xmm0
+; AVX1-NEXT: vmovdqu 64(%rdi), %xmm2
+; AVX1-NEXT: vmovups 80(%rdi), %xmm0
; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vmovdqu 80(%rdi), %xmm4
; AVX1-NEXT: vmovdqu 96(%rdi), %xmm5
-; AVX1-NEXT: vmovdqu 112(%rdi), %xmm2
-; AVX1-NEXT: vmovdqu 144(%rdi), %xmm10
-; AVX1-NEXT: vmovdqu 160(%rdi), %xmm3
-; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [128,128,128,128,128,0,3,6,9,12,15,2,5,8,11,14]
-; AVX1-NEXT: vpshufb %xmm9, %xmm5, %xmm6
-; AVX1-NEXT: vpshufb %xmm9, %xmm10, %xmm7
-; AVX1-NEXT: vpshufb %xmm9, %xmm11, %xmm8
-; AVX1-NEXT: vpshufb %xmm9, %xmm13, %xmm9
+; AVX1-NEXT: vmovdqu 112(%rdi), %xmm3
+; AVX1-NEXT: vmovdqu 144(%rdi), %xmm7
+; AVX1-NEXT: vmovdqu 160(%rdi), %xmm4
+; AVX1-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = [128,128,128,128,128,0,3,6,9,12,15,2,5,8,11,14]
+; AVX1-NEXT: vpshufb %xmm11, %xmm5, %xmm6
+; AVX1-NEXT: vpshufb %xmm11, %xmm7, %xmm9
+; AVX1-NEXT: vpshufb %xmm11, %xmm8, %xmm10
+; AVX1-NEXT: vpshufb %xmm11, %xmm13, %xmm11
; AVX1-NEXT: vmovdqa {{.*#+}} xmm14 = [1,4,7,10,13,128,128,128,128,128,128,u,u,u,u,u]
; AVX1-NEXT: vpshufb %xmm14, %xmm5, %xmm5
; AVX1-NEXT: vmovdqa {{.*#+}} xmm15 = [128,128,128,128,128,0,3,6,9,12,15,u,u,u,u,u]
-; AVX1-NEXT: vpshufb %xmm15, %xmm2, %xmm12
-; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpshufb %xmm15, %xmm3, %xmm12
+; AVX1-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill
; AVX1-NEXT: vpor %xmm5, %xmm12, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpshufb %xmm14, %xmm10, %xmm10
-; AVX1-NEXT: vpshufb %xmm15, %xmm3, %xmm12
-; AVX1-NEXT: vpor %xmm10, %xmm12, %xmm0
+; AVX1-NEXT: vpshufb %xmm14, %xmm7, %xmm7
+; AVX1-NEXT: vpshufb %xmm15, %xmm4, %xmm12
+; AVX1-NEXT: vpor %xmm7, %xmm12, %xmm0
; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpshufb %xmm14, %xmm11, %xmm11
-; AVX1-NEXT: vmovdqa %xmm1, %xmm0
+; AVX1-NEXT: vpshufb %xmm14, %xmm8, %xmm8
; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT: vpshufb %xmm15, %xmm1, %xmm12
-; AVX1-NEXT: vpor %xmm11, %xmm12, %xmm1
-; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpshufb %xmm14, %xmm13, %xmm11
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX1-NEXT: vpshufb %xmm15, %xmm1, %xmm13
-; AVX1-NEXT: vpor %xmm11, %xmm13, %xmm11
-; AVX1-NEXT: vpshufb %xmm14, %xmm1, %xmm13
-; AVX1-NEXT: vpshufb %xmm15, %xmm4, %xmm5
-; AVX1-NEXT: vpor %xmm5, %xmm13, %xmm5
-; AVX1-NEXT: vmovdqu 32(%rdi), %xmm1
-; AVX1-NEXT: vpshufb %xmm14, %xmm0, %xmm13
-; AVX1-NEXT: vpshufb %xmm15, %xmm1, %xmm10
-; AVX1-NEXT: vpor %xmm13, %xmm10, %xmm10
-; AVX1-NEXT: vmovdqu 176(%rdi), %xmm13
-; AVX1-NEXT: vpshufb %xmm14, %xmm3, %xmm0
-; AVX1-NEXT: vpshufb %xmm15, %xmm13, %xmm12
-; AVX1-NEXT: vpor %xmm0, %xmm12, %xmm3
-; AVX1-NEXT: vpshufb %xmm14, %xmm2, %xmm12
+; AVX1-NEXT: vpor %xmm8, %xmm12, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpshufb %xmm14, %xmm13, %xmm8
+; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpshufb %xmm15, %xmm2, %xmm13
+; AVX1-NEXT: vpor %xmm8, %xmm13, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpshufb %xmm14, %xmm2, %xmm13
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX1-NEXT: vpshufb %xmm15, %xmm2, %xmm5
+; AVX1-NEXT: vpor %xmm5, %xmm13, %xmm8
+; AVX1-NEXT: vmovdqu 32(%rdi), %xmm13
+; AVX1-NEXT: vpshufb %xmm14, %xmm1, %xmm0
+; AVX1-NEXT: vpshufb %xmm15, %xmm13, %xmm7
+; AVX1-NEXT: vpor %xmm0, %xmm7, %xmm5
+; AVX1-NEXT: vmovdqu 176(%rdi), %xmm7
+; AVX1-NEXT: vpshufb %xmm14, %xmm4, %xmm0
+; AVX1-NEXT: vpshufb %xmm15, %xmm7, %xmm12
+; AVX1-NEXT: vpor %xmm0, %xmm12, %xmm1
+; AVX1-NEXT: vpshufb %xmm14, %xmm3, %xmm12
; AVX1-NEXT: vmovdqu 128(%rdi), %xmm14
; AVX1-NEXT: vpshufb %xmm15, %xmm14, %xmm15
; AVX1-NEXT: vpor %xmm12, %xmm15, %xmm15
; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [1,4,7,10,13,128,128,128,128,128,128,128,128,128,128,128]
; AVX1-NEXT: vpshufb %xmm0, %xmm14, %xmm12
; AVX1-NEXT: vpor %xmm6, %xmm12, %xmm12
-; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT: vpshufb %xmm0, %xmm13, %xmm15
-; AVX1-NEXT: vpor %xmm7, %xmm15, %xmm15
-; AVX1-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm2
-; AVX1-NEXT: vpor %xmm2, %xmm8, %xmm2
-; AVX1-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
-; AVX1-NEXT: vpor %xmm0, %xmm9, %xmm0
-; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm9[11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7,8,9,10]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,6,7,8,9,10,128,128,128,128,128]
-; AVX1-NEXT: vpshufb %xmm9, %xmm11, %xmm10
+; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm6[11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX1-NEXT: vpshufb %xmm0, %xmm7, %xmm15
+; AVX1-NEXT: vpor %xmm15, %xmm9, %xmm15
+; AVX1-NEXT: vpalignr {{.*#+}} xmm9 = xmm9[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vpshufb %xmm0, %xmm13, %xmm1
+; AVX1-NEXT: vpor %xmm1, %xmm10, %xmm3
+; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm10[11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vpshufb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpor %xmm0, %xmm11, %xmm1
+; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm11[11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7,8,9,10]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,4,5,6,7,8,9,10,128,128,128,128,128]
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; AVX1-NEXT: vpshufb %xmm10, %xmm0, %xmm8
; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = [128,128,128,128,128,128,128,128,128,128,128,2,5,8,11,14]
-; AVX1-NEXT: vpshufb %xmm11, %xmm4, %xmm4
-; AVX1-NEXT: vpor %xmm4, %xmm10, %xmm4
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX1-NEXT: vpshufb %xmm9, %xmm3, %xmm10
-; AVX1-NEXT: vpshufb %xmm11, %xmm1, %xmm1
-; AVX1-NEXT: vpor %xmm1, %xmm10, %xmm1
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX1-NEXT: vpshufb %xmm9, %xmm3, %xmm10
+; AVX1-NEXT: vpshufb %xmm11, %xmm2, %xmm4
+; AVX1-NEXT: vpor %xmm4, %xmm8, %xmm4
+; AVX1-NEXT: vpmovsxdq {{.*#+}} xmm8 = [18446744073709551615,16777215]
+; AVX1-NEXT: vpand {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm0 # 16-byte Folded Reload
; AVX1-NEXT: vpshufb %xmm11, %xmm13, %xmm13
-; AVX1-NEXT: vpor %xmm13, %xmm10, %xmm10
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX1-NEXT: vpshufb %xmm9, %xmm3, %xmm9
-; AVX1-NEXT: vpshufb %xmm11, %xmm14, %xmm11
-; AVX1-NEXT: vpor %xmm11, %xmm9, %xmm9
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4]
-; AVX1-NEXT: vpshufb %xmm11, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm13 = [128,128,128,128,128,128,2,5,8,11,14,128,128,128,128,128]
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX1-NEXT: vpshufb %xmm13, %xmm3, %xmm3
-; AVX1-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpaddb %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpaddb %xmm0, %xmm5, %xmm0
-; AVX1-NEXT: vpshufb %xmm11, %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX1-NEXT: vpshufb %xmm13, %xmm3, %xmm3
-; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpaddb %xmm1, %xmm8, %xmm1
-; AVX1-NEXT: vpshufb %xmm11, %xmm15, %xmm2
+; AVX1-NEXT: vpor %xmm0, %xmm13, %xmm0
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX1-NEXT: vpshufb %xmm10, %xmm2, %xmm10
+; AVX1-NEXT: vpshufb %xmm11, %xmm7, %xmm7
+; AVX1-NEXT: vpor %xmm7, %xmm10, %xmm7
+; AVX1-NEXT: vpand {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
+; AVX1-NEXT: vpshufb %xmm11, %xmm14, %xmm10
+; AVX1-NEXT: vpor %xmm10, %xmm8, %xmm8
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm10 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4]
+; AVX1-NEXT: vpshufb %xmm10, %xmm1, %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm11 = [128,128,128,128,128,128,2,5,8,11,14,128,128,128,128,128]
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX1-NEXT: vpshufb %xmm11, %xmm2, %xmm13
+; AVX1-NEXT: vpor %xmm1, %xmm13, %xmm1
+; AVX1-NEXT: vpaddb %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpaddb %xmm1, %xmm5, %xmm1
+; AVX1-NEXT: vpshufb %xmm10, %xmm3, %xmm3
+; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
+; AVX1-NEXT: vpshufb %xmm11, %xmm2, %xmm4
+; AVX1-NEXT: vpor %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpaddb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpaddb %xmm0, %xmm6, %xmm0
+; AVX1-NEXT: vpshufb %xmm10, %xmm15, %xmm2
; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
-; AVX1-NEXT: vpshufb %xmm13, %xmm3, %xmm3
+; AVX1-NEXT: vpshufb %xmm11, %xmm3, %xmm3
; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpaddb %xmm2, %xmm10, %xmm2
-; AVX1-NEXT: vpaddb %xmm2, %xmm7, %xmm2
-; AVX1-NEXT: vpshufb %xmm11, %xmm12, %xmm3
-; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX1-NEXT: vpshufb %xmm13, %xmm4, %xmm4
+; AVX1-NEXT: vpaddb %xmm7, %xmm2, %xmm2
+; AVX1-NEXT: vpaddb %xmm2, %xmm9, %xmm2
+; AVX1-NEXT: vpshufb %xmm10, %xmm12, %xmm3
+; AVX1-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload
+; AVX1-NEXT: vpshufb %xmm11, %xmm4, %xmm4
; AVX1-NEXT: vpor %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpaddb %xmm3, %xmm9, %xmm3
-; AVX1-NEXT: vpaddb %xmm3, %xmm6, %xmm3
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpaddb %xmm3, %xmm8, %xmm3
+; AVX1-NEXT: vpaddb {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm1
+; AVX1-NEXT: addq $40, %rsp
+; AVX1-NEXT: .cfi_def_cfa_offset 8
; AVX1-NEXT: retq
;
; AVX2-LABEL: interleaved_load_vf64_i8_stride3:
@@ -1889,11 +1895,19 @@ define void @splat4_v4i64_load_store(ptr %s, ptr %d) nounwind {
}
define <2 x i64> @PR37616(ptr %a0) nounwind {
-; AVX-LABEL: PR37616:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovaps 16(%rdi), %xmm0
-; AVX-NEXT: vunpcklpd 48(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[0],mem[0]
-; AVX-NEXT: retq
+; AVX1OR2-LABEL: PR37616:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vmovaps 16(%rdi), %xmm0
+; AVX1OR2-NEXT: vunpcklpd 48(%rdi), %xmm0, %xmm0 # xmm0 = xmm0[0],mem[0]
+; AVX1OR2-NEXT: retq
+;
+; AVX512-LABEL: PR37616:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm0 = [2,6]
+; AVX512-NEXT: vpermpd (%rdi), %zmm0, %zmm0
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%load = load <16 x i64>, ptr %a0, align 128
%shuffle = shufflevector <16 x i64> %load, <16 x i64> undef, <2 x i32> <i32 2, i32 6>
ret <2 x i64> %shuffle
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll
index eb463837c3bb8..10f2e99014d6e 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll
@@ -5409,20 +5409,36 @@ define void @vec384_v6i64_to_v3i128_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bi
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
-; AVX512BW-LABEL: vec384_v6i64_to_v3i128_factor2:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512BW-NEXT: movb $5, %al
-; AVX512BW-NEXT: kmovd %eax, %k1
-; AVX512BW-NEXT: vpexpandq %ymm0, %ymm1 {%k1} {z}
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512BW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
-; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
+; AVX512BW-SLOW-LABEL: vec384_v6i64_to_v3i128_factor2:
+; AVX512BW-SLOW: # %bb.0:
+; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX512BW-SLOW-NEXT: movb $5, %al
+; AVX512BW-SLOW-NEXT: kmovd %eax, %k1
+; AVX512BW-SLOW-NEXT: vpexpandq %ymm0, %ymm1 {%k1} {z}
+; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
+; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-SLOW-NEXT: vzeroupper
+; AVX512BW-SLOW-NEXT: retq
+;
+; AVX512BW-FAST-LABEL: vec384_v6i64_to_v3i128_factor2:
+; AVX512BW-FAST: # %bb.0:
+; AVX512BW-FAST-NEXT: vpmovsxbq {{.*#+}} xmm0 = [2,5]
+; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512BW-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm1
+; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512BW-FAST-NEXT: vpermi2q %ymm2, %ymm1, %ymm0
+; AVX512BW-FAST-NEXT: movb $5, %al
+; AVX512BW-FAST-NEXT: kmovd %eax, %k1
+; AVX512BW-FAST-NEXT: vpexpandq %ymm1, %ymm1 {%k1} {z}
+; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-FAST-NEXT: vzeroupper
+; AVX512BW-FAST-NEXT: retq
%in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
%in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
%in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
@@ -7188,23 +7204,23 @@ define void @vec512_v16i32_to_v4i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.b
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
+; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
+; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
+; AVX-NEXT: vmovshdup {{.*#+}} ymm3 = ymm0[1,1,3,3,5,5,7,7]
+; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1
-; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0
+; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1
+; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
; AVX-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX-NEXT: vpaddb 16(%rdx), %xmm3, %xmm3
-; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2
-; AVX-NEXT: vmovdqa %xmm2, (%rcx)
-; AVX-NEXT: vmovdqa %xmm3, 16(%rcx)
-; AVX-NEXT: vmovdqa %xmm0, 32(%rcx)
-; AVX-NEXT: vmovdqa %xmm1, 48(%rcx)
+; AVX-NEXT: vpaddb 48(%rdx), %xmm3, %xmm3
+; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2
+; AVX-NEXT: vmovdqa %xmm2, 32(%rcx)
+; AVX-NEXT: vmovdqa %xmm3, 48(%rcx)
+; AVX-NEXT: vmovdqa %xmm0, (%rcx)
+; AVX-NEXT: vmovdqa %xmm1, 16(%rcx)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
index 35f25d36cb2e9..e321b7e8054ea 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
@@ -747,31 +747,23 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in.
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
-; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
-; AVX512BW-SLOW: # %bb.0:
-; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,0,11,0,13,0,15]
-; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm1, %zmm0
-; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-SLOW-NEXT: vzeroupper
-; AVX512BW-SLOW-NEXT: retq
-;
-; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
-; AVX512BW-FAST: # %bb.0:
-; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,0,11,0,13,6,7]
-; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vpermw %zmm0, %zmm1, %zmm1
-; AVX512BW-FAST-NEXT: vmovd %xmm0, %eax
-; AVX512BW-FAST-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1
-; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7]
-; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-FAST-NEXT: vzeroupper
-; AVX512BW-FAST-NEXT: retq
+; AVX512BW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
+; AVX512BW-NEXT: vmovd %xmm0, %eax
+; AVX512BW-NEXT: vpinsrw $2, %eax, %xmm2, %xmm0
+; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
+; AVX512BW-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7]
+; AVX512BW-NEXT: vpinsrw $6, %eax, %xmm0, %xmm0
+; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
+; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
%in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
%in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
%in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
@@ -865,31 +857,19 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in.
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
-; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
-; AVX512BW-SLOW: # %bb.0:
-; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,10,11,0,13,6,7]
-; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm1, %zmm1
-; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
-; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-SLOW-NEXT: vzeroupper
-; AVX512BW-SLOW-NEXT: retq
-;
-; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
-; AVX512BW-FAST: # %bb.0:
-; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,9,10,11,0,5,6,7]
-; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vpermw %zmm0, %zmm1, %zmm1
-; AVX512BW-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7]
-; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-FAST-NEXT: vzeroupper
-; AVX512BW-FAST-NEXT: retq
+; AVX512BW-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1,2,3],xmm0[4,5,6,7]
+; AVX512BW-NEXT: vmovd %xmm0, %eax
+; AVX512BW-NEXT: vpinsrw $4, %eax, %xmm2, %xmm0
+; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
+; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
%in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
%in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
%in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
@@ -941,73 +921,91 @@ define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in.
; AVX-NEXT: vmovdqa %xmm0, (%rcx)
; AVX-NEXT: retq
;
-; AVX2-SLOW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
+; AVX2-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa (%rdi), %ymm0
+; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,0,7]
+; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0
+; AVX2-NEXT: vmovdqa %ymm0, (%rcx)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
;
-; AVX2-FAST-PERLANE-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
-; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastd %xmm0, %xmm0
-; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX2-FAST-PERLANE-NEXT: vzeroupper
-; AVX2-FAST-PERLANE-NEXT: retq
+; AVX512F-SLOW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
+; AVX512F-SLOW: # %bb.0:
+; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512F-SLOW-NEXT: vmovd %xmm0, %eax
+; AVX512F-SLOW-NEXT: vpinsrd $2, %eax, %xmm2, %xmm0
+; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
+; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0
+; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rcx)
+; AVX512F-SLOW-NEXT: vzeroupper
+; AVX512F-SLOW-NEXT: retq
;
-; AVX2-FAST-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-FAST-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,0,7]
-; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0
-; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX2-FAST-NEXT: vzeroupper
-; AVX2-FAST-NEXT: retq
+; AVX512F-FAST-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
+; AVX512F-FAST: # %bb.0:
+; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX512F-FAST-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,0,7]
+; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0
+; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx)
+; AVX512F-FAST-NEXT: vzeroupper
+; AVX512F-FAST-NEXT: retq
;
-; AVX512F-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,0,7]
-; AVX512F-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1
-; AVX512F-NEXT: vpermd %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
+; AVX512DQ-SLOW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
+; AVX512DQ-SLOW: # %bb.0:
+; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512DQ-SLOW-NEXT: vmovd %xmm0, %eax
+; AVX512DQ-SLOW-NEXT: vpinsrd $2, %eax, %xmm2, %xmm0
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
+; AVX512DQ-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0
+; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, (%rcx)
+; AVX512DQ-SLOW-NEXT: vzeroupper
+; AVX512DQ-SLOW-NEXT: retq
;
-; AVX512DQ-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,5,0,7]
-; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1
-; AVX512DQ-NEXT: vpermd %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
-; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
+; AVX512DQ-FAST-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
+; AVX512DQ-FAST: # %bb.0:
+; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX512DQ-FAST-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,0,7]
+; AVX512DQ-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
+; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0
+; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rcx)
+; AVX512DQ-FAST-NEXT: vzeroupper
+; AVX512DQ-FAST-NEXT: retq
;
-; AVX512BW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,0,7]
-; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
+; AVX512BW-SLOW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
+; AVX512BW-SLOW: # %bb.0:
+; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX512BW-SLOW-NEXT: vmovd %xmm0, %eax
+; AVX512BW-SLOW-NEXT: vpinsrd $2, %eax, %xmm2, %xmm0
+; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
+; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-SLOW-NEXT: vzeroupper
+; AVX512BW-SLOW-NEXT: retq
+;
+; AVX512BW-FAST-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
+; AVX512BW-FAST: # %bb.0:
+; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,0,7]
+; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0
+; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-FAST-NEXT: vzeroupper
+; AVX512BW-FAST-NEXT: retq
%in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
%in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
%in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
@@ -2011,28 +2009,16 @@ define void @vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4(ptr %in.
; AVX512DQ-FAST-NEXT: vzeroupper
; AVX512DQ-FAST-NEXT: retq
;
-; AVX512BW-SLOW-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4:
-; AVX512BW-SLOW: # %bb.0:
-; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-SLOW-NEXT: vpbroadcastq %xmm0, %ymm0
-; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
-; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-SLOW-NEXT: vzeroupper
-; AVX512BW-SLOW-NEXT: retq
-;
-; AVX512BW-FAST-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4:
-; AVX512BW-FAST: # %bb.0:
-; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,9,0,11,0,13,0,15]
-; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0
-; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-FAST-NEXT: vzeroupper
-; AVX512BW-FAST-NEXT: retq
+; AVX512BW-LABEL: vec256_i32_widen_to_i64_factor2_broadcast_to_v4i64_factor4:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,9,0,11,0,13,0,15]
+; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
%in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
%in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
%in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
@@ -2163,28 +2149,16 @@ define void @vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2(ptr %i
; AVX512DQ-FAST-NEXT: vzeroupper
; AVX512DQ-FAST-NEXT: retq
;
-; AVX512BW-SLOW-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2:
-; AVX512BW-SLOW: # %bb.0:
-; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
-; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-SLOW-NEXT: vzeroupper
-; AVX512BW-SLOW-NEXT: retq
-;
-; AVX512BW-FAST-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2:
-; AVX512BW-FAST: # %bb.0:
-; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,9,10,11,0,13,14,15]
-; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0
-; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-FAST-NEXT: vzeroupper
-; AVX512BW-FAST-NEXT: retq
+; AVX512BW-LABEL: vec256_i32_widen_to_i128_factor4_broadcast_to_v2i128_factor2:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,9,10,11,0,13,14,15]
+; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
%in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
%in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
%in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
@@ -2315,28 +2289,16 @@ define void @vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2(ptr %i
; AVX512DQ-FAST-NEXT: vzeroupper
; AVX512DQ-FAST-NEXT: retq
;
-; AVX512BW-SLOW-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2:
-; AVX512BW-SLOW: # %bb.0:
-; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-SLOW-NEXT: vpbroadcastq %xmm0, %ymm0
-; AVX512BW-SLOW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-SLOW-NEXT: vzeroupper
-; AVX512BW-SLOW-NEXT: retq
-;
-; AVX512BW-FAST-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2:
-; AVX512BW-FAST: # %bb.0:
-; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,5,0,7]
-; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vpermq %zmm0, %zmm1, %zmm0
-; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-FAST-NEXT: vzeroupper
-; AVX512BW-FAST-NEXT: retq
+; AVX512BW-LABEL: vec256_i64_widen_to_i128_factor2_broadcast_to_v2i128_factor2:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,5,0,7]
+; AVX512BW-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
%in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
%in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
%in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
@@ -2434,57 +2396,109 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; AVX512F-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
-; AVX512F-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
-; AVX512F-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0
-; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
-; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX512F-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512F-NEXT: vpaddb (%rdx), %ymm1, %ymm1
-; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa %ymm1, (%rcx)
-; AVX512F-NEXT: vmovdqa %ymm0, 32(%rcx)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
+; AVX512F-SLOW-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24:
+; AVX512F-SLOW: # %bb.0:
+; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1
+; AVX512F-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
+; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
+; AVX512F-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX512F-SLOW-NEXT: vpbroadcastb %xmm0, %ymm0
+; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; AVX512F-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512F-SLOW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1
+; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
+; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rcx)
+; AVX512F-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx)
+; AVX512F-SLOW-NEXT: vzeroupper
+; AVX512F-SLOW-NEXT: retq
;
-; AVX512DQ-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
-; AVX512DQ-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
-; AVX512DQ-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX512DQ-NEXT: vpbroadcastb %xmm0, %ymm0
-; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
-; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512DQ-NEXT: vpaddb (%rdx), %ymm1, %ymm1
-; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX512DQ-NEXT: vmovdqa %ymm1, (%rcx)
-; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rcx)
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
+; AVX512F-FAST-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24:
+; AVX512F-FAST: # %bb.0:
+; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm1
+; AVX512F-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
+; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
+; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX512F-FAST-NEXT: vpbroadcastb %xmm0, %ymm2
+; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
+; AVX512F-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero
+; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1
+; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
+; AVX512F-FAST-NEXT: vmovdqa %ymm0, 32(%rcx)
+; AVX512F-FAST-NEXT: vmovdqa %ymm1, (%rcx)
+; AVX512F-FAST-NEXT: vzeroupper
+; AVX512F-FAST-NEXT: retq
;
-; AVX512BW-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpbroadcastb %xmm0, %ymm0
-; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
+; AVX512DQ-SLOW-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24:
+; AVX512DQ-SLOW: # %bb.0:
+; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1
+; AVX512DQ-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
+; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
+; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX512DQ-SLOW-NEXT: vpbroadcastb %xmm0, %ymm0
+; AVX512DQ-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; AVX512DQ-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512DQ-SLOW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512DQ-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1
+; AVX512DQ-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
+; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, (%rcx)
+; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx)
+; AVX512DQ-SLOW-NEXT: vzeroupper
+; AVX512DQ-SLOW-NEXT: retq
+;
+; AVX512DQ-FAST-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24:
+; AVX512DQ-FAST: # %bb.0:
+; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512DQ-FAST-NEXT: vmovdqa 48(%rdi), %xmm1
+; AVX512DQ-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
+; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX512DQ-FAST-NEXT: vpbroadcastb %xmm0, %ymm2
+; AVX512DQ-FAST-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
+; AVX512DQ-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero
+; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1
+; AVX512DQ-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
+; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, 32(%rcx)
+; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, (%rcx)
+; AVX512DQ-FAST-NEXT: vzeroupper
+; AVX512DQ-FAST-NEXT: retq
+;
+; AVX512BW-SLOW-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24:
+; AVX512BW-SLOW: # %bb.0:
+; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT: vextracti32x4 $3, %zmm0, %xmm1
+; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
+; AVX512BW-SLOW-NEXT: vpbroadcastb %xmm0, %ymm0
+; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; AVX512BW-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512BW-SLOW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-SLOW-NEXT: vzeroupper
+; AVX512BW-SLOW-NEXT: retq
+;
+; AVX512BW-FAST-LABEL: vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24:
+; AVX512BW-FAST: # %bb.0:
+; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-FAST-NEXT: vextracti32x4 $3, %zmm0, %xmm1
+; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
+; AVX512BW-FAST-NEXT: vpbroadcastb %xmm0, %ymm2
+; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23]
+; AVX512BW-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero
+; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
+; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-FAST-NEXT: vzeroupper
+; AVX512BW-FAST-NEXT: retq
%in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
%in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
%in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
@@ -3561,86 +3575,160 @@ define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %i
; SSE42-NEXT: movdqa %xmm1, 16(%rcx)
; SSE42-NEXT: retq
;
-; AVX-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vmovdqa 48(%rdi), %xmm1
-; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
-; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
-; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2
-; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0
-; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa %xmm1, (%rcx)
-; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
-; AVX-NEXT: vmovdqa %xmm2, 32(%rcx)
-; AVX-NEXT: retq
+; AVX-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqa (%rdi), %xmm0
+; AVX-NEXT: vmovdqa 48(%rdi), %xmm1
+; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
+; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7]
+; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
+; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2
+; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0
+; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
+; AVX-NEXT: vmovdqa %xmm1, (%rcx)
+; AVX-NEXT: vmovdqa %xmm0, 16(%rcx)
+; AVX-NEXT: vmovdqa %xmm2, 32(%rcx)
+; AVX-NEXT: retq
+;
+; AVX2-SLOW-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1
+; AVX2-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
+; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vpbroadcastw %xmm0, %ymm0
+; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX2-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx)
+; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx)
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-PERLANE-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
+; AVX2-FAST-PERLANE: # %bb.0:
+; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-FAST-PERLANE-NEXT: vmovdqa 48(%rdi), %xmm1
+; AVX2-FAST-PERLANE-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
+; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX2-FAST-PERLANE-NEXT: vpbroadcastw %xmm0, %ymm2
+; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15]
+; AVX2-FAST-PERLANE-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero
+; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1
+; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx)
+; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx)
+; AVX2-FAST-PERLANE-NEXT: vzeroupper
+; AVX2-FAST-PERLANE-NEXT: retq
+;
+; AVX2-FAST-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-FAST-NEXT: vmovdqa 48(%rdi), %xmm1
+; AVX2-FAST-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
+; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX2-FAST-NEXT: vpbroadcastw %xmm0, %ymm2
+; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15]
+; AVX2-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero,xmm0[0,1],zero,zero
+; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1
+; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
+; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx)
+; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx)
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
+;
+; AVX512F-SLOW-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
+; AVX512F-SLOW: # %bb.0:
+; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512F-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1
+; AVX512F-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
+; AVX512F-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX512F-SLOW-NEXT: vpbroadcastw %xmm0, %ymm0
+; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11]
+; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512F-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1
+; AVX512F-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
+; AVX512F-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx)
+; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rcx)
+; AVX512F-SLOW-NEXT: vzeroupper
+; AVX512F-SLOW-NEXT: retq
;
-; AVX2-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1
-; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
-; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0
-; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
-; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1
-; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa %ymm1, (%rcx)
-; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx)
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; AVX512F-FAST-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
+; AVX512F-FAST: # %bb.0:
+; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm0
+; AVX512F-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
+; AVX512F-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm1
+; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm1, %xmm1
+; AVX512F-FAST-NEXT: vpbroadcastw %xmm1, %ymm2
+; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4],ymm0[5],ymm2[6],ymm0[7],ymm2[8],ymm0[9],ymm2[10],ymm0[11],ymm2[12],ymm0[13],ymm2[14],ymm0[15]
+; AVX512F-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1],zero,zero,xmm1[0,1],zero,zero,xmm1[0,1],zero,zero,xmm1[0,1],zero,zero
+; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0
+; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
+; AVX512F-FAST-NEXT: vmovdqa %ymm1, 32(%rcx)
+; AVX512F-FAST-NEXT: vmovdqa %ymm0, (%rcx)
+; AVX512F-FAST-NEXT: vzeroupper
+; AVX512F-FAST-NEXT: retq
;
-; AVX512F-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm0
-; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm1
-; AVX512F-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX512F-NEXT: vpbroadcastw %xmm1, %ymm1
-; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
-; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0
-; AVX512F-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
-; AVX512F-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX512F-NEXT: vmovdqa %ymm1, 32(%rcx)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
+; AVX512DQ-SLOW-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
+; AVX512DQ-SLOW: # %bb.0:
+; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rdi), %xmm1
+; AVX512DQ-SLOW-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
+; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0
+; AVX512DQ-SLOW-NEXT: vpbroadcastw %xmm0, %ymm0
+; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; AVX512DQ-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11]
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX512DQ-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1
+; AVX512DQ-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0
+; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx)
+; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, (%rcx)
+; AVX512DQ-SLOW-NEXT: vzeroupper
+; AVX512DQ-SLOW-NEXT: retq
;
-; AVX512DQ-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm0
-; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
-; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1
-; AVX512DQ-NEXT: vpaddb (%rsi), %xmm1, %xmm1
-; AVX512DQ-NEXT: vpbroadcastw %xmm1, %ymm1
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
-; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0
-; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
-; AVX512DQ-NEXT: vmovdqa %ymm0, (%rcx)
-; AVX512DQ-NEXT: vmovdqa %ymm1, 32(%rcx)
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
+; AVX512DQ-FAST-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
+; AVX512DQ-FAST: # %bb.0:
+; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm0
+; AVX512DQ-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
+; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512DQ-FAST-NEXT: vmovdqa (%rdi), %xmm1
+; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %xmm1, %xmm1
+; AVX512DQ-FAST-NEXT: vpbroadcastw %xmm1, %ymm2
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4],ymm0[5],ymm2[6],ymm0[7],ymm2[8],ymm0[9],ymm2[10],ymm0[11],ymm2[12],ymm0[13],ymm2[14],ymm0[15]
+; AVX512DQ-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1],zero,zero,xmm1[0,1],zero,zero,xmm1[0,1],zero,zero,xmm1[0,1],zero,zero
+; AVX512DQ-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0
+; AVX512DQ-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1
+; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, 32(%rcx)
+; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, (%rcx)
+; AVX512DQ-FAST-NEXT: vzeroupper
+; AVX512DQ-FAST-NEXT: retq
;
; AVX512BW-SLOW-LABEL: vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12:
; AVX512BW-SLOW: # %bb.0:
; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,25,0,27,0,29,0,31,0,41,0,43,0,45,0,47]
; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,25,0,27,0,29,0,31]
+; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm1, %zmm1
; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-SLOW-NEXT: vpermi2w %zmm2, %zmm0, %zmm1
+; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7],ymm1[8],ymm2[9],ymm1[10],ymm2[11],ymm1[12],ymm2[13],ymm1[14],ymm2[15]
+; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX512BW-SLOW-NEXT: vpbroadcastw %xmm0, %xmm0
; AVX512BW-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -3839,14 +3927,16 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in.
; AVX512BW-SLOW-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8:
; AVX512BW-SLOW: # %bb.0:
; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,25,26,0,28,29,0,31,40,0,42,43,0,45,46,0]
; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,25,26,0,28,29,0,31]
+; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm1, %zmm1
; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-SLOW-NEXT: vpermi2w %zmm2, %zmm0, %zmm1
+; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14],ymm1[15]
+; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX512BW-SLOW-NEXT: vpbroadcastw %xmm0, %xmm0
; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7]
-; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
+; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-SLOW-NEXT: vzeroupper
@@ -4065,14 +4155,17 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.
; AVX512BW-SLOW-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
; AVX512BW-SLOW: # %bb.0:
; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,25,26,27,0,29,30,31,0,41,42,43,0,45,46,47]
; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,25,26,27,0,29,30,31,0,25,26,27,0,29,30,31]
+; AVX512BW-SLOW-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm1, %zmm1
; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-SLOW-NEXT: vpermi2w %zmm2, %zmm0, %zmm1
+; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7],ymm1[8],ymm2[9,10,11],ymm1[12],ymm2[13,14,15]
+; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX512BW-SLOW-NEXT: vpbroadcastw %xmm0, %xmm0
; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
-; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
+; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-SLOW-NEXT: vzeroupper
@@ -4270,14 +4363,16 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.
; AVX512BW-SLOW-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4:
; AVX512BW-SLOW: # %bb.0:
; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,25,26,27,28,29,0,31,40,41,42,43,0,45,46,47]
; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,25,26,27,28,29,0,31]
+; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm1, %zmm1
; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-SLOW-NEXT: vpermi2w %zmm2, %zmm0, %zmm1
+; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4],ymm2[5,6,7,8,9,10,11],ymm1[12],ymm2[13,14,15]
+; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX512BW-SLOW-NEXT: vpbroadcastw %xmm0, %xmm0
; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4,5,6,7]
-; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
+; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-SLOW-NEXT: vzeroupper
@@ -4418,20 +4513,38 @@ define void @vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3(ptr %i
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
-; AVX512BW-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,25,26,27,28,29,30,31,0,41,42,43,44,45,46,47]
-; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm1
-; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
-; AVX512BW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
-; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
+; AVX512BW-SLOW-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
+; AVX512BW-SLOW: # %bb.0:
+; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,25,26,27,28,29,30,31,0,25,26,27,28,29,30,31]
+; AVX512BW-SLOW-NEXT: # ymm1 = mem[0,1,0,1]
+; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm1, %zmm1
+; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15]
+; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
+; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
+; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-SLOW-NEXT: vzeroupper
+; AVX512BW-SLOW-NEXT: retq
+;
+; AVX512BW-FAST-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
+; AVX512BW-FAST: # %bb.0:
+; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,25,26,27,28,29,30,31,0,41,42,43,44,45,46,47]
+; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512BW-FAST-NEXT: vpermi2w %zmm2, %zmm0, %zmm1
+; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
+; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
+; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-FAST-NEXT: vzeroupper
+; AVX512BW-FAST-NEXT: retq
%in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
%in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
%in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
@@ -4557,18 +4670,32 @@ define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr %
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
-; AVX512BW-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,25,26,27,28,29,30,31,40,41,42,43,0,45,46,47]
-; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-NEXT: vpermt2w %zmm2, %zmm1, %zmm0
-; AVX512BW-NEXT: vmovdqa %ymm0, %ymm0
-; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
+; AVX512BW-SLOW-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2:
+; AVX512BW-SLOW: # %bb.0:
+; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm1 = [0,25,26,27,28,29,30,31]
+; AVX512BW-SLOW-NEXT: vpermw %zmm0, %zmm1, %zmm0
+; AVX512BW-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4],ymm1[5,6,7,8,9,10,11],ymm0[12],ymm1[13,14,15]
+; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-SLOW-NEXT: vzeroupper
+; AVX512BW-SLOW-NEXT: retq
+;
+; AVX512BW-FAST-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2:
+; AVX512BW-FAST: # %bb.0:
+; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,25,26,27,28,29,30,31,40,41,42,43,0,45,46,47]
+; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512BW-FAST-NEXT: vpermi2w %zmm2, %zmm0, %zmm1
+; AVX512BW-FAST-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-FAST-NEXT: vzeroupper
+; AVX512BW-FAST-NEXT: retq
%in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
%in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
%in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
@@ -4742,13 +4869,12 @@ define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in.
; AVX512BW-SLOW: # %bb.0:
; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,0,15]
-; AVX512BW-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm1
+; AVX512BW-SLOW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,13,0,15,0,21,0,23]
+; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512BW-SLOW-NEXT: vpermi2d %zmm2, %zmm0, %zmm1
; AVX512BW-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
-; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6],ymm2[7]
; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
@@ -4759,13 +4885,10 @@ define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in.
; AVX512BW-FAST: # %bb.0:
; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,0,15]
-; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1
-; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6],ymm2[7]
-; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero
-; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
-; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,13,0,15,0,21,0,23,0,25,0,27,0,0,0,0]
+; AVX512BW-FAST-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm2, %zmm0
; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-FAST-NEXT: vzeroupper
; AVX512BW-FAST-NEXT: retq
@@ -4943,13 +5066,12 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in.
; AVX512BW-SLOW: # %bb.0:
; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,14,0]
-; AVX512BW-SLOW-NEXT: vpermd %zmm0, %zmm1, %zmm1
+; AVX512BW-SLOW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,13,14,0,20,21,0,23]
+; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512BW-SLOW-NEXT: vpermi2d %zmm2, %zmm0, %zmm1
; AVX512BW-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3]
-; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6],ymm2[7]
; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
@@ -4960,13 +5082,10 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in.
; AVX512BW-FAST: # %bb.0:
; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,14,0]
-; AVX512BW-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm1
-; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6],ymm2[7]
-; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm1, %zmm0
-; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-FAST-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,13,14,0,20,21,0,23,24,0,26,27,0,0,0,0]
+; AVX512BW-FAST-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
+; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm2, %zmm0
; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-FAST-NEXT: vzeroupper
; AVX512BW-FAST-NEXT: retq
@@ -5229,10 +5348,10 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,13,14,15]
-; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6],ymm1[7]
+; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,13,14,15,20,21,0,23]
+; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm1
+; AVX512BW-NEXT: vmovdqa %ymm1, %ymm0
; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
; AVX512BW-NEXT: vzeroupper
@@ -5355,32 +5474,17 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
-; AVX512BW-SLOW-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
-; AVX512BW-SLOW: # %bb.0:
-; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-SLOW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,7,0,11,0,13,0,0]
-; AVX512BW-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm2, %zmm0
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-SLOW-NEXT: vzeroupper
-; AVX512BW-SLOW-NEXT: retq
-;
-; AVX512BW-FAST-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
-; AVX512BW-FAST: # %bb.0:
-; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,7,0,7]
-; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vpermq %zmm0, %zmm1, %zmm1
-; AVX512BW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-FAST-NEXT: vzeroupper
-; AVX512BW-FAST-NEXT: retq
+; AVX512BW-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,7,0,11,0,13,0,0]
+; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0
+; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
%in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
%in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
%in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
@@ -5495,31 +5599,18 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
-; AVX512BW-SLOW-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
-; AVX512BW-SLOW: # %bb.0:
-; AVX512BW-SLOW-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-SLOW-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,7,10,0]
-; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-SLOW-NEXT: vpermt2q %zmm2, %zmm1, %zmm0
-; AVX512BW-SLOW-NEXT: vmovdqa %ymm0, %ymm0
-; AVX512BW-SLOW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-SLOW-NEXT: vzeroupper
-; AVX512BW-SLOW-NEXT: retq
-;
-; AVX512BW-FAST-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
-; AVX512BW-FAST: # %bb.0:
-; AVX512BW-FAST-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-FAST-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,7,2,0]
-; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vpermq %zmm0, %zmm1, %zmm0
-; AVX512BW-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
-; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx)
-; AVX512BW-FAST-NEXT: vzeroupper
-; AVX512BW-FAST-NEXT: retq
+; AVX512BW-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,7,10,0]
+; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512BW-NEXT: vpermi2q %zmm2, %zmm0, %zmm1
+; AVX512BW-NEXT: vmovdqa %ymm1, %ymm0
+; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
%in.vec.base = load <64 x i8>, ptr %in.vec.base.ptr, align 64
%in.vec.bias = load <64 x i8>, ptr %in.vec.bias.ptr, align 64
%in.vec = add <64 x i8> %in.vec.base, %in.vec.bias
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
index a598e30845579..f17a2e9979207 100644
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
@@ -641,25 +641,16 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in.
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
-; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
-; AVX512BW-SLOW: # %bb.0:
-; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,0,15]
-; AVX512BW-SLOW-NEXT: vpermw (%rdi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rdx)
-; AVX512BW-SLOW-NEXT: vzeroupper
-; AVX512BW-SLOW-NEXT: retq
-;
-; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
-; AVX512BW-FAST: # %bb.0:
-; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,6,7]
-; AVX512BW-FAST-NEXT: vpermw (%rdi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vpinsrw $6, (%rdi), %xmm0, %xmm0
-; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],mem[7]
-; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rdx)
-; AVX512BW-FAST-NEXT: vzeroupper
-; AVX512BW-FAST-NEXT: retq
+; AVX512BW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [16,25,16,27,16,29,0,23]
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm1
+; AVX512BW-NEXT: vpermt2w (%rdi), %ymm0, %ymm1
+; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],mem[7]
+; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
%in.vec = load <64 x i8>, ptr %in.elt.ptr, align 64
%in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
%broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 0, i32 11, i32 0, i32 13, i32 0, i32 15>
@@ -735,25 +726,18 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in.
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
-; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
-; AVX512BW-SLOW: # %bb.0:
-; AVX512BW-SLOW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,10,11,0,13,6,7]
-; AVX512BW-SLOW-NEXT: vpermw (%rdi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3]
-; AVX512BW-SLOW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, (%rdx)
-; AVX512BW-SLOW-NEXT: vzeroupper
-; AVX512BW-SLOW-NEXT: retq
-;
-; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
-; AVX512BW-FAST: # %bb.0:
-; AVX512BW-FAST-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,9,10,11,0,5,6,7]
-; AVX512BW-FAST-NEXT: vpermw (%rdi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],mem[5,6,7]
-; AVX512BW-FAST-NEXT: vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rdx)
-; AVX512BW-FAST-NEXT: vzeroupper
-; AVX512BW-FAST-NEXT: retq
+; AVX512BW-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512BW-NEXT: vmovd %xmm0, %eax
+; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4,5,6,7]
+; AVX512BW-NEXT: vpinsrw $4, %eax, %xmm0, %xmm0
+; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm1[5,6,7]
+; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
%in.vec = load <64 x i8>, ptr %in.elt.ptr, align 64
%in.vec.cast = bitcast <64 x i8> %in.vec to <32 x i16>
%broadcast.of.zextinreg = shufflevector <32 x i16> %in.vec.cast, <32 x i16> zeroinitializer, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 0, i32 13, i32 14, i32 15>
@@ -3199,16 +3183,17 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.
;
; AVX512F-SLOW-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
; AVX512F-SLOW: # %bb.0:
-; AVX512F-SLOW-NEXT: vmovdqa 48(%rdi), %xmm0
-; AVX512F-SLOW-NEXT: vpbroadcastw (%rdi), %ymm1
-; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15]
-; AVX512F-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512F-SLOW-NEXT: vpbroadcastw (%rdi), %ymm0
+; AVX512F-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
+; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7]
+; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; AVX512F-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
-; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rdx)
-; AVX512F-SLOW-NEXT: vmovdqa %ymm1, 32(%rdx)
+; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
+; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm1, %ymm1
+; AVX512F-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
+; AVX512F-SLOW-NEXT: vmovdqa %ymm0, 32(%rdx)
+; AVX512F-SLOW-NEXT: vmovdqa %ymm1, (%rdx)
; AVX512F-SLOW-NEXT: vzeroupper
; AVX512F-SLOW-NEXT: retq
;
@@ -3229,16 +3214,17 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in.
;
; AVX512DQ-SLOW-LABEL: vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6:
; AVX512DQ-SLOW: # %bb.0:
-; AVX512DQ-SLOW-NEXT: vmovdqa 48(%rdi), %xmm0
-; AVX512DQ-SLOW-NEXT: vpbroadcastw (%rdi), %ymm1
-; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4],ymm0[5,6,7],ymm1[8],ymm0[9,10,11],ymm1[12],ymm0[13,14,15]
-; AVX512DQ-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX512DQ-SLOW-NEXT: vpbroadcastw (%rdi), %ymm0
+; AVX512DQ-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
+; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],mem[1,2,3],xmm0[4],mem[5,6,7]
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX512DQ-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
-; AVX512DQ-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
-; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
-; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, (%rdx)
-; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, 32(%rdx)
+; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
+; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %ymm1, %ymm1
+; AVX512DQ-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
+; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, 32(%rdx)
+; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, (%rdx)
; AVX512DQ-SLOW-NEXT: vzeroupper
; AVX512DQ-SLOW-NEXT: retq
;
@@ -3501,35 +3487,69 @@ define void @vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3(ptr %i
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
-; AVX512F-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3,4,5,6,7]
-; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm1
-; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = mem[0,1,0,1]
-; AVX512F-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15]
-; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX512F-NEXT: vpaddb (%rsi), %ymm1, %ymm1
-; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
-; AVX512F-NEXT: vmovdqa %ymm0, 32(%rdx)
-; AVX512F-NEXT: vmovdqa %ymm1, (%rdx)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
+; AVX512F-SLOW-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
+; AVX512F-SLOW: # %bb.0:
+; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = mem[0,1,0,1]
+; AVX512F-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
+; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm1
+; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0],mem[1,2,3,4,5,6,7]
+; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX512F-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
+; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX512F-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
+; AVX512F-SLOW-NEXT: vmovdqa %ymm1, 32(%rdx)
+; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rdx)
+; AVX512F-SLOW-NEXT: vzeroupper
+; AVX512F-SLOW-NEXT: retq
;
-; AVX512DQ-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3,4,5,6,7]
-; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm1
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = mem[0,1,0,1]
-; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15]
-; AVX512DQ-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX512DQ-NEXT: vpaddb (%rsi), %ymm1, %ymm1
-; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
-; AVX512DQ-NEXT: vmovdqa %ymm0, 32(%rdx)
-; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx)
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
+; AVX512F-FAST-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
+; AVX512F-FAST: # %bb.0:
+; AVX512F-FAST-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3,4,5,6,7]
+; AVX512F-FAST-NEXT: vmovdqa 48(%rdi), %xmm1
+; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = mem[0,1,0,1]
+; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15]
+; AVX512F-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm1
+; AVX512F-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
+; AVX512F-FAST-NEXT: vmovdqa %ymm0, 32(%rdx)
+; AVX512F-FAST-NEXT: vmovdqa %ymm1, (%rdx)
+; AVX512F-FAST-NEXT: vzeroupper
+; AVX512F-FAST-NEXT: retq
+;
+; AVX512DQ-SLOW-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
+; AVX512DQ-SLOW: # %bb.0:
+; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = mem[0,1,0,1]
+; AVX512DQ-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15]
+; AVX512DQ-SLOW-NEXT: vmovdqa (%rdi), %xmm1
+; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0],mem[1,2,3,4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
+; AVX512DQ-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0
+; AVX512DQ-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1
+; AVX512DQ-SLOW-NEXT: vmovdqa %ymm1, 32(%rdx)
+; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, (%rdx)
+; AVX512DQ-SLOW-NEXT: vzeroupper
+; AVX512DQ-SLOW-NEXT: retq
+;
+; AVX512DQ-FAST-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
+; AVX512DQ-FAST: # %bb.0:
+; AVX512DQ-FAST-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3,4,5,6,7]
+; AVX512DQ-FAST-NEXT: vmovdqa 48(%rdi), %xmm1
+; AVX512DQ-FAST-NEXT: vpermq {{.*#+}} ymm2 = mem[0,1,0,1]
+; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15]
+; AVX512DQ-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512DQ-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm1
+; AVX512DQ-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0
+; AVX512DQ-FAST-NEXT: vmovdqa %ymm0, 32(%rdx)
+; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, (%rdx)
+; AVX512DQ-FAST-NEXT: vzeroupper
+; AVX512DQ-FAST-NEXT: retq
;
; AVX512BW-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3:
; AVX512BW: # %bb.0:
@@ -4326,7 +4346,8 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],mem[4,5,6,7]
; AVX-NEXT: vmovaps 32(%rsi), %ymm2
-; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0
; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1
; AVX-NEXT: vmovaps %ymm2, 32(%rdx)
More information about the llvm-commits
mailing list