[llvm] r358359 - [X86] Move VPTESTM matching from the isel table to custom code in X86ISelDAGToDAG.
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Sun Apr 14 11:26:11 PDT 2019
Author: ctopper
Date: Sun Apr 14 11:26:11 2019
New Revision: 358359
URL: http://llvm.org/viewvc/llvm-project?rev=358359&view=rev
Log:
[X86] Move VPTESTM matching from the isel table to custom code in X86ISelDAGToDAG.
We had many tablegen patterns for these instructions, and due to the
commutability of the patterns, tablegen expands them into even more patterns.
Altogether the VPTESTMD patterns accounted for more than 50K of the 610K isel
table. This got worse when we stopped canonicalizing AND to vXi64, which
required a pattern for every combination of bitcast input type.
This change moves the matching to custom code where it is easier to look through
the bitcasts without being concerned with the specific types.
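As an illustration (a hypothetical case, not taken from the modified tests),
the custom matcher can peek through a bitcast whose element type differs from
the compare, assuming an AVX512VL target:

define <2 x i1> @test_through_bitcast(<4 x i32> %a, <4 x i32> %b) {
  %and  = and <4 x i32> %a, %b
  %cast = bitcast <4 x i32> %and to <2 x i64>
  %cmp  = icmp ne <2 x i64> %cast, zeroinitializer  ; expected to select vptestmq
  ret <2 x i1> %cmp
}

Previously each such bitcast element type needed its own tablegen pattern.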
The test changes are because we are now stricter with one-use checks, as they
are required to make load folding legal. We now require the AND and any
BITCAST to have only a single use. This prevents forming both a VPTESTM and a
VPAND with the same inputs.
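For example (hypothetical, not from the modified tests), an AND with a second
use no longer forms VPTESTM, since a VPAND would still be needed for the
other use:

define <4 x i32> @and_extra_use(<4 x i32> %a, <4 x i32> %b, <4 x i32>* %p) {
  %and = and <4 x i32> %a, %b
  store <4 x i32> %and, <4 x i32>* %p              ; second use of the AND
  %cmp = icmp ne <4 x i32> %and, zeroinitializer   ; not folded into vptestmd
  %ext = sext <4 x i1> %cmp to <4 x i32>
  ret <4 x i32> %ext
}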
We now support broadcast loads for the 128/256-bit patterns without VLX. We
widen to 512 bits as usual and still fold the broadcast, since the amount of
memory read doesn't change.
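A sketch of that case (hypothetical, assuming AVX512F without VLX): the v4i32
test is widened to v16i32, but the broadcast operand still reads only 4 bytes,
so the fold remains legal:

define <4 x i1> @test_splat_load(<4 x i32> %a, i32* %p) {
  %ld    = load i32, i32* %p
  %ins   = insertelement <4 x i32> undef, i32 %ld, i32 0
  %splat = shufflevector <4 x i32> %ins, <4 x i32> undef, <4 x i32> zeroinitializer
  %and   = and <4 x i32> %a, %splat
  %cmp   = icmp ne <4 x i32> %and, zeroinitializer ; vptestmd with folded broadcast
  ret <4 x i1> %cmp
}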
There are a few tests that got slightly longer because we now prefer
load + VPTESTM over XOR+VPCMPEQ for (seteq (load), allzeros). Previously we
were able to share the XOR across multiple VPCMPEQ instructions.
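The shape in question looks like this (hypothetical example): the load now
becomes a plain vector load feeding VPTESTNM of a register against itself,
instead of a VPXOR-produced zero that several VPCMPEQ instructions could
share:

define <16 x i1> @load_eq_zero(<16 x i32>* %p) {
  %ld  = load <16 x i32>, <16 x i32>* %p
  %cmp = icmp eq <16 x i32> %ld, zeroinitializer   ; vmovdqa64 + vptestnmd
  ret <16 x i1> %cmp
}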
Modified:
llvm/trunk/lib/Target/X86/X86ISelDAGToDAG.cpp
llvm/trunk/lib/Target/X86/X86InstrAVX512.td
llvm/trunk/test/CodeGen/X86/avx512-vec-cmp.ll
llvm/trunk/test/CodeGen/X86/kshift.ll
llvm/trunk/test/CodeGen/X86/movmsk-cmp.ll
llvm/trunk/test/CodeGen/X86/prefer-avx256-mask-extend.ll
llvm/trunk/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll
llvm/trunk/test/CodeGen/X86/setcc-lowering.ll
llvm/trunk/test/CodeGen/X86/vector-fshl-128.ll
llvm/trunk/test/CodeGen/X86/vector-fshl-256.ll
llvm/trunk/test/CodeGen/X86/vector-fshl-512.ll
llvm/trunk/test/CodeGen/X86/vector-fshr-128.ll
llvm/trunk/test/CodeGen/X86/vector-fshr-256.ll
llvm/trunk/test/CodeGen/X86/vector-fshr-512.ll
llvm/trunk/test/CodeGen/X86/vector-lzcnt-512.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelDAGToDAG.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelDAGToDAG.cpp?rev=358359&r1=358358&r2=358359&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelDAGToDAG.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelDAGToDAG.cpp Sun Apr 14 11:26:11 2019
@@ -483,6 +483,7 @@ namespace {
bool shrinkAndImmediate(SDNode *N);
bool isMaskZeroExtended(SDNode *N) const;
bool tryShiftAmountMod(SDNode *N);
+ bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask);
MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
const SDLoc &dl, MVT VT, SDNode *Node);
@@ -3441,6 +3442,347 @@ bool X86DAGToDAGISel::shrinkAndImmediate
return true;
}
+static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad,
+ bool FoldedBCast, bool Masked) {
+ if (Masked) {
+ if (FoldedLoad) {
+ switch (TestVT.SimpleTy) {
+ default: llvm_unreachable("Unexpected VT!");
+ case MVT::v16i8:
+ return IsTestN ? X86::VPTESTNMBZ128rmk : X86::VPTESTMBZ128rmk;
+ case MVT::v8i16:
+ return IsTestN ? X86::VPTESTNMWZ128rmk : X86::VPTESTMWZ128rmk;
+ case MVT::v4i32:
+ return IsTestN ? X86::VPTESTNMDZ128rmk : X86::VPTESTMDZ128rmk;
+ case MVT::v2i64:
+ return IsTestN ? X86::VPTESTNMQZ128rmk : X86::VPTESTMQZ128rmk;
+ case MVT::v32i8:
+ return IsTestN ? X86::VPTESTNMBZ256rmk : X86::VPTESTMBZ256rmk;
+ case MVT::v16i16:
+ return IsTestN ? X86::VPTESTNMWZ256rmk : X86::VPTESTMWZ256rmk;
+ case MVT::v8i32:
+ return IsTestN ? X86::VPTESTNMDZ256rmk : X86::VPTESTMDZ256rmk;
+ case MVT::v4i64:
+ return IsTestN ? X86::VPTESTNMQZ256rmk : X86::VPTESTMQZ256rmk;
+ case MVT::v64i8:
+ return IsTestN ? X86::VPTESTNMBZrmk : X86::VPTESTMBZrmk;
+ case MVT::v32i16:
+ return IsTestN ? X86::VPTESTNMWZrmk : X86::VPTESTMWZrmk;
+ case MVT::v16i32:
+ return IsTestN ? X86::VPTESTNMDZrmk : X86::VPTESTMDZrmk;
+ case MVT::v8i64:
+ return IsTestN ? X86::VPTESTNMQZrmk : X86::VPTESTMQZrmk;
+ }
+ }
+
+ if (FoldedBCast) {
+ switch (TestVT.SimpleTy) {
+ default: llvm_unreachable("Unexpected VT!");
+ case MVT::v4i32:
+ return IsTestN ? X86::VPTESTNMDZ128rmbk : X86::VPTESTMDZ128rmbk;
+ case MVT::v2i64:
+ return IsTestN ? X86::VPTESTNMQZ128rmbk : X86::VPTESTMQZ128rmbk;
+ case MVT::v8i32:
+ return IsTestN ? X86::VPTESTNMDZ256rmbk : X86::VPTESTMDZ256rmbk;
+ case MVT::v4i64:
+ return IsTestN ? X86::VPTESTNMQZ256rmbk : X86::VPTESTMQZ256rmbk;
+ case MVT::v16i32:
+ return IsTestN ? X86::VPTESTNMDZrmbk : X86::VPTESTMDZrmbk;
+ case MVT::v8i64:
+ return IsTestN ? X86::VPTESTNMQZrmbk : X86::VPTESTMQZrmbk;
+ }
+ }
+
+ switch (TestVT.SimpleTy) {
+ default: llvm_unreachable("Unexpected VT!");
+ case MVT::v16i8:
+ return IsTestN ? X86::VPTESTNMBZ128rrk : X86::VPTESTMBZ128rrk;
+ case MVT::v8i16:
+ return IsTestN ? X86::VPTESTNMWZ128rrk : X86::VPTESTMWZ128rrk;
+ case MVT::v4i32:
+ return IsTestN ? X86::VPTESTNMDZ128rrk : X86::VPTESTMDZ128rrk;
+ case MVT::v2i64:
+ return IsTestN ? X86::VPTESTNMQZ128rrk : X86::VPTESTMQZ128rrk;
+ case MVT::v32i8:
+ return IsTestN ? X86::VPTESTNMBZ256rrk : X86::VPTESTMBZ256rrk;
+ case MVT::v16i16:
+ return IsTestN ? X86::VPTESTNMWZ256rrk : X86::VPTESTMWZ256rrk;
+ case MVT::v8i32:
+ return IsTestN ? X86::VPTESTNMDZ256rrk : X86::VPTESTMDZ256rrk;
+ case MVT::v4i64:
+ return IsTestN ? X86::VPTESTNMQZ256rrk : X86::VPTESTMQZ256rrk;
+ case MVT::v64i8:
+ return IsTestN ? X86::VPTESTNMBZrrk : X86::VPTESTMBZrrk;
+ case MVT::v32i16:
+ return IsTestN ? X86::VPTESTNMWZrrk : X86::VPTESTMWZrrk;
+ case MVT::v16i32:
+ return IsTestN ? X86::VPTESTNMDZrrk : X86::VPTESTMDZrrk;
+ case MVT::v8i64:
+ return IsTestN ? X86::VPTESTNMQZrrk : X86::VPTESTMQZrrk;
+ }
+ }
+
+ if (FoldedLoad) {
+ switch (TestVT.SimpleTy) {
+ default: llvm_unreachable("Unexpected VT!");
+ case MVT::v16i8:
+ return IsTestN ? X86::VPTESTNMBZ128rm : X86::VPTESTMBZ128rm;
+ case MVT::v8i16:
+ return IsTestN ? X86::VPTESTNMWZ128rm : X86::VPTESTMWZ128rm;
+ case MVT::v4i32:
+ return IsTestN ? X86::VPTESTNMDZ128rm : X86::VPTESTMDZ128rm;
+ case MVT::v2i64:
+ return IsTestN ? X86::VPTESTNMQZ128rm : X86::VPTESTMQZ128rm;
+ case MVT::v32i8:
+ return IsTestN ? X86::VPTESTNMBZ256rm : X86::VPTESTMBZ256rm;
+ case MVT::v16i16:
+ return IsTestN ? X86::VPTESTNMWZ256rm : X86::VPTESTMWZ256rm;
+ case MVT::v8i32:
+ return IsTestN ? X86::VPTESTNMDZ256rm : X86::VPTESTMDZ256rm;
+ case MVT::v4i64:
+ return IsTestN ? X86::VPTESTNMQZ256rm : X86::VPTESTMQZ256rm;
+ case MVT::v64i8:
+ return IsTestN ? X86::VPTESTNMBZrm : X86::VPTESTMBZrm;
+ case MVT::v32i16:
+ return IsTestN ? X86::VPTESTNMWZrm : X86::VPTESTMWZrm;
+ case MVT::v16i32:
+ return IsTestN ? X86::VPTESTNMDZrm : X86::VPTESTMDZrm;
+ case MVT::v8i64:
+ return IsTestN ? X86::VPTESTNMQZrm : X86::VPTESTMQZrm;
+ }
+ }
+
+ if (FoldedBCast) {
+ switch (TestVT.SimpleTy) {
+ default: llvm_unreachable("Unexpected VT!");
+ case MVT::v4i32:
+ return IsTestN ? X86::VPTESTNMDZ128rmb : X86::VPTESTMDZ128rmb;
+ case MVT::v2i64:
+ return IsTestN ? X86::VPTESTNMQZ128rmb : X86::VPTESTMQZ128rmb;
+ case MVT::v8i32:
+ return IsTestN ? X86::VPTESTNMDZ256rmb : X86::VPTESTMDZ256rmb;
+ case MVT::v4i64:
+ return IsTestN ? X86::VPTESTNMQZ256rmb : X86::VPTESTMQZ256rmb;
+ case MVT::v16i32:
+ return IsTestN ? X86::VPTESTNMDZrmb : X86::VPTESTMDZrmb;
+ case MVT::v8i64:
+ return IsTestN ? X86::VPTESTNMQZrmb : X86::VPTESTMQZrmb;
+ }
+ }
+
+ switch (TestVT.SimpleTy) {
+ default: llvm_unreachable("Unexpected VT!");
+ case MVT::v16i8:
+ return IsTestN ? X86::VPTESTNMBZ128rr : X86::VPTESTMBZ128rr;
+ case MVT::v8i16:
+ return IsTestN ? X86::VPTESTNMWZ128rr : X86::VPTESTMWZ128rr;
+ case MVT::v4i32:
+ return IsTestN ? X86::VPTESTNMDZ128rr : X86::VPTESTMDZ128rr;
+ case MVT::v2i64:
+ return IsTestN ? X86::VPTESTNMQZ128rr : X86::VPTESTMQZ128rr;
+ case MVT::v32i8:
+ return IsTestN ? X86::VPTESTNMBZ256rr : X86::VPTESTMBZ256rr;
+ case MVT::v16i16:
+ return IsTestN ? X86::VPTESTNMWZ256rr : X86::VPTESTMWZ256rr;
+ case MVT::v8i32:
+ return IsTestN ? X86::VPTESTNMDZ256rr : X86::VPTESTMDZ256rr;
+ case MVT::v4i64:
+ return IsTestN ? X86::VPTESTNMQZ256rr : X86::VPTESTMQZ256rr;
+ case MVT::v64i8:
+ return IsTestN ? X86::VPTESTNMBZrr : X86::VPTESTMBZrr;
+ case MVT::v32i16:
+ return IsTestN ? X86::VPTESTNMWZrr : X86::VPTESTMWZrr;
+ case MVT::v16i32:
+ return IsTestN ? X86::VPTESTNMDZrr : X86::VPTESTMDZrr;
+ case MVT::v8i64:
+ return IsTestN ? X86::VPTESTNMQZrr : X86::VPTESTMQZrr;
+ }
+}
+
+// Try to create VPTESTM instruction. If InMask is not null, it will be used
+// to form a masked operation.
+bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
+ SDValue InMask) {
+ assert(Subtarget->hasAVX512() && "Expected AVX512!");
+ assert(Setcc.getSimpleValueType().getVectorElementType() == MVT::i1 &&
+ "Unexpected VT!");
+
+ // Look for equal and not equal compares.
+ ISD::CondCode CC = cast<CondCodeSDNode>(Setcc.getOperand(2))->get();
+ if (CC != ISD::SETEQ && CC != ISD::SETNE)
+ return false;
+
+ // See if we're comparing against zero. This should have been canonicalized
+ // to RHS during lowering.
+ if (!ISD::isBuildVectorAllZeros(Setcc.getOperand(1).getNode()))
+ return false;
+
+ SDValue N0 = Setcc.getOperand(0);
+
+ MVT CmpVT = N0.getSimpleValueType();
+ MVT CmpSVT = CmpVT.getVectorElementType();
+
+ // Start with both operands the same. We'll try to refine this.
+ SDValue Src0 = N0;
+ SDValue Src1 = N0;
+
+ {
+ // Look through single use bitcasts.
+ SDValue N0Temp = N0;
+ if (N0Temp.getOpcode() == ISD::BITCAST && N0Temp.hasOneUse())
+ N0Temp = N0.getOperand(0);
+
+ // Look for single use AND.
+ if (N0Temp.getOpcode() == ISD::AND && N0Temp.hasOneUse()) {
+ Src0 = N0Temp.getOperand(0);
+ Src1 = N0Temp.getOperand(1);
+ }
+ }
+
+ // Without VLX we need to widen the load.
+ bool Widen = !Subtarget->hasVLX() && !CmpVT.is512BitVector();
+
+ // We can only fold loads if the sources are unique.
+ bool CanFoldLoads = Src0 != Src1;
+
+ // Try to fold loads unless we need to widen.
+ bool FoldedLoad = false;
+ SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Load;
+ if (!Widen && CanFoldLoads) {
+ Load = Src1;
+ FoldedLoad = tryFoldLoad(Root, N0.getNode(), Load, Tmp0, Tmp1, Tmp2, Tmp3,
+ Tmp4);
+ if (!FoldedLoad) {
+ // AND is commutative.
+ Load = Src0;
+ FoldedLoad = tryFoldLoad(Root, N0.getNode(), Load, Tmp0, Tmp1, Tmp2,
+ Tmp3, Tmp4);
+ if (FoldedLoad)
+ std::swap(Src0, Src1);
+ }
+ }
+
+ auto findBroadcastedOp = [](SDValue Src, MVT CmpSVT, SDNode *&Parent) {
+ // Look through single use bitcasts.
+ if (Src.getOpcode() == ISD::BITCAST && Src.hasOneUse())
+ Src = Src.getOperand(0);
+
+ if (Src.getOpcode() == X86ISD::VBROADCAST && Src.hasOneUse()) {
+ Parent = Src.getNode();
+ Src = Src.getOperand(0);
+ if (Src.getSimpleValueType() == CmpSVT)
+ return Src;
+ }
+
+ return SDValue();
+ };
+
+ // If we didn't fold a load, try to match broadcast. No widening limitation
+ // for this. But only 32 and 64 bit types are supported.
+ bool FoldedBCast = false;
+ if (!FoldedLoad && CanFoldLoads &&
+ (CmpSVT == MVT::i32 || CmpSVT == MVT::i64)) {
+ SDNode *ParentNode;
+ if ((Load = findBroadcastedOp(Src1, CmpSVT, ParentNode))) {
+ FoldedBCast = tryFoldLoad(Root, ParentNode, Load, Tmp0,
+ Tmp1, Tmp2, Tmp3, Tmp4);
+ }
+
+ // Try the other operand.
+ if (!FoldedBCast) {
+ if ((Load = findBroadcastedOp(Src0, CmpSVT, ParentNode))) {
+ FoldedBCast = tryFoldLoad(Root, ParentNode, Load, Tmp0,
+ Tmp1, Tmp2, Tmp3, Tmp4);
+ if (FoldedBCast)
+ std::swap(Src0, Src1);
+ }
+ }
+ }
+
+ auto getMaskRC = [](MVT MaskVT) {
+ switch (MaskVT.SimpleTy) {
+ default: llvm_unreachable("Unexpected VT!");
+ case MVT::v2i1: return X86::VK2RegClassID;
+ case MVT::v4i1: return X86::VK4RegClassID;
+ case MVT::v8i1: return X86::VK8RegClassID;
+ case MVT::v16i1: return X86::VK16RegClassID;
+ case MVT::v32i1: return X86::VK32RegClassID;
+ case MVT::v64i1: return X86::VK64RegClassID;
+ }
+ };
+
+ bool IsMasked = InMask.getNode() != nullptr;
+
+ SDLoc dl(Root);
+
+ MVT ResVT = Setcc.getSimpleValueType();
+ MVT MaskVT = ResVT;
+ if (Widen) {
+ // Widen the inputs using insert_subreg or copy_to_regclass.
+ unsigned Scale = CmpVT.is128BitVector() ? 4 : 2;
+ unsigned SubReg = CmpVT.is128BitVector() ? X86::sub_xmm : X86::sub_ymm;
+ unsigned NumElts = CmpVT.getVectorNumElements() * Scale;
+ CmpVT = MVT::getVectorVT(CmpSVT, NumElts);
+ MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
+ SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, dl,
+ CmpVT), 0);
+ Src0 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src0);
+
+ assert(!FoldedLoad && "Shouldn't have folded the load");
+ if (!FoldedBCast)
+ Src1 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src1);
+
+ if (IsMasked) {
+ // Widen the mask.
+ unsigned RegClass = getMaskRC(MaskVT);
+ SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
+ InMask = SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
+ dl, MaskVT, InMask, RC), 0);
+ }
+ }
+
+ bool IsTestN = CC == ISD::SETEQ;
+ unsigned Opc = getVPTESTMOpc(CmpVT, IsTestN, FoldedLoad, FoldedBCast,
+ IsMasked);
+
+ MachineSDNode *CNode;
+ if (FoldedLoad || FoldedBCast) {
+ SDVTList VTs = CurDAG->getVTList(MaskVT, MVT::Other);
+
+ if (IsMasked) {
+ SDValue Ops[] = { InMask, Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
+ Load.getOperand(0) };
+ CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
+ } else {
+ SDValue Ops[] = { Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
+ Load.getOperand(0) };
+ CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
+ }
+
+ // Update the chain.
+ ReplaceUses(Load.getValue(1), SDValue(CNode, 1));
+ // Record the mem-refs
+ CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(Load)->getMemOperand()});
+ } else {
+ if (IsMasked)
+ CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, InMask, Src0, Src1);
+ else
+ CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, Src0, Src1);
+ }
+
+ // If we widened, we need to shrink the mask VT.
+ if (Widen) {
+ unsigned RegClass = getMaskRC(ResVT);
+ SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
+ CNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
+ dl, ResVT, SDValue(CNode, 0), RC);
+ }
+
+ ReplaceUses(SDValue(Root, 0), SDValue(CNode, 0));
+ CurDAG->RemoveDeadNode(Root);
+ return true;
+}
+
void X86DAGToDAGISel::Select(SDNode *Node) {
MVT NVT = Node->getSimpleValueType(0);
unsigned Opcode = Node->getOpcode();
@@ -3570,6 +3912,18 @@ void X86DAGToDAGISel::Select(SDNode *Nod
break;
case ISD::AND:
+ if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) {
+ // Try to form a masked VPTESTM. Operands can be in either order.
+ SDValue N0 = Node->getOperand(0);
+ SDValue N1 = Node->getOperand(1);
+ if (N0.getOpcode() == ISD::SETCC && N0.hasOneUse() &&
+ tryVPTESTM(Node, N0, N1))
+ return;
+ if (N1.getOpcode() == ISD::SETCC && N1.hasOneUse() &&
+ tryVPTESTM(Node, N1, N0))
+ return;
+ }
+
if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) {
ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
CurDAG->RemoveDeadNode(Node);
@@ -4207,6 +4561,13 @@ void X86DAGToDAGISel::Select(SDNode *Nod
return;
}
+ case ISD::SETCC: {
+ if (NVT.isVector() && tryVPTESTM(Node, SDValue(Node, 0), SDValue()))
+ return;
+
+ break;
+ }
+
case ISD::STORE:
if (foldLoadStoreIntoMemOperand(Node))
return;
Modified: llvm/trunk/lib/Target/X86/X86InstrAVX512.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86InstrAVX512.td?rev=358359&r1=358358&r2=358359&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86InstrAVX512.td (original)
+++ llvm/trunk/lib/Target/X86/X86InstrAVX512.td Sun Apr 14 11:26:11 2019
@@ -5813,309 +5813,93 @@ defm VSCALEF : avx512_fp_scalef_all<0x2C
// AVX-512 VPTESTM instructions
//===----------------------------------------------------------------------===//
-multiclass avx512_vptest<bits<8> opc, string OpcodeStr, PatFrag OpNode,
- PatFrag OpNode_su,
+multiclass avx512_vptest<bits<8> opc, string OpcodeStr,
X86FoldableSchedWrite sched, X86VectorVTInfo _,
string Name> {
- let ExeDomain = _.ExeDomain in {
+ // NOTE: Patterns are omitted in favor of manual selection in X86ISelDAGToDAG.
+ // There are just too many permutations due to commutability and bitcasts.
+ let ExeDomain = _.ExeDomain, hasSideEffects = 0 in {
let isCommutable = 1 in
defm rr : AVX512_maskable_cmp<opc, MRMSrcReg, _, (outs _.KRC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
- (OpNode (and _.RC:$src1, _.RC:$src2), _.ImmAllZerosV),
- (OpNode_su (and _.RC:$src1, _.RC:$src2), _.ImmAllZerosV)>,
+ (null_frag), (null_frag)>,
EVEX_4V, Sched<[sched]>;
+ let mayLoad = 1 in
defm rm : AVX512_maskable_cmp<opc, MRMSrcMem, _, (outs _.KRC:$dst),
(ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
- (OpNode (and _.RC:$src1, (_.LdFrag addr:$src2)),
- _.ImmAllZerosV),
- (OpNode_su (and _.RC:$src1, (_.LdFrag addr:$src2)),
- _.ImmAllZerosV)>,
+ (null_frag), (null_frag)>,
EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
-
- // Patterns for compare with 0 that just use the same source twice.
- def : Pat<(_.KVT (OpNode _.RC:$src, _.ImmAllZerosV)),
- (_.KVT (!cast<Instruction>(Name # _.ZSuffix # "rr")
- _.RC:$src, _.RC:$src))>;
-
- def : Pat<(_.KVT (and _.KRC:$mask, (OpNode_su _.RC:$src, _.ImmAllZerosV))),
- (_.KVT (!cast<Instruction>(Name # _.ZSuffix # "rrk")
- _.KRC:$mask, _.RC:$src, _.RC:$src))>;
}
-multiclass avx512_vptest_mb<bits<8> opc, string OpcodeStr, PatFrag OpNode,
- PatFrag OpNode_su, X86FoldableSchedWrite sched,
- X86VectorVTInfo _> {
- let ExeDomain = _.ExeDomain in
+multiclass avx512_vptest_mb<bits<8> opc, string OpcodeStr,
+ X86FoldableSchedWrite sched, X86VectorVTInfo _> {
+ let ExeDomain = _.ExeDomain, mayLoad = 1, hasSideEffects = 0 in
defm rmb : AVX512_maskable_cmp<opc, MRMSrcMem, _, (outs _.KRC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
"${src2}"##_.BroadcastStr##", $src1",
"$src1, ${src2}"##_.BroadcastStr,
- (OpNode (and _.RC:$src1,
- (X86VBroadcast
- (_.ScalarLdFrag addr:$src2))),
- _.ImmAllZerosV),
- (OpNode_su (and _.RC:$src1,
- (X86VBroadcast
- (_.ScalarLdFrag addr:$src2))),
- _.ImmAllZerosV)>,
+ (null_frag), (null_frag)>,
EVEX_B, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
-// Use 512bit version to implement 128/256 bit in case NoVLX.
-multiclass avx512_vptest_lowering<PatFrag OpNode, PatFrag OpNode_su,
- X86VectorVTInfo ExtendInfo, X86VectorVTInfo _,
- string Name> {
- def : Pat<(_.KVT (OpNode (and _.RC:$src1, _.RC:$src2),
- _.ImmAllZerosV)),
- (_.KVT (COPY_TO_REGCLASS
- (!cast<Instruction>(Name # "Zrr")
- (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
- _.RC:$src1, _.SubRegIdx),
- (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
- _.RC:$src2, _.SubRegIdx)),
- _.KRC))>;
-
- def : Pat<(_.KVT (and _.KRC:$mask,
- (OpNode_su (and _.RC:$src1, _.RC:$src2),
- _.ImmAllZerosV))),
- (COPY_TO_REGCLASS
- (!cast<Instruction>(Name # "Zrrk")
- (COPY_TO_REGCLASS _.KRC:$mask, ExtendInfo.KRC),
- (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
- _.RC:$src1, _.SubRegIdx),
- (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
- _.RC:$src2, _.SubRegIdx)),
- _.KRC)>;
-
- def : Pat<(_.KVT (OpNode _.RC:$src, _.ImmAllZerosV)),
- (_.KVT (COPY_TO_REGCLASS
- (!cast<Instruction>(Name # "Zrr")
- (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
- _.RC:$src, _.SubRegIdx),
- (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
- _.RC:$src, _.SubRegIdx)),
- _.KRC))>;
-
- def : Pat<(_.KVT (and _.KRC:$mask, (OpNode_su _.RC:$src, _.ImmAllZerosV))),
- (COPY_TO_REGCLASS
- (!cast<Instruction>(Name # "Zrrk")
- (COPY_TO_REGCLASS _.KRC:$mask, ExtendInfo.KRC),
- (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
- _.RC:$src, _.SubRegIdx),
- (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
- _.RC:$src, _.SubRegIdx)),
- _.KRC)>;
-}
-
-multiclass avx512_vptest_dq_sizes<bits<8> opc, string OpcodeStr, PatFrag OpNode,
- PatFrag OpNode_su, X86SchedWriteWidths sched,
+multiclass avx512_vptest_dq_sizes<bits<8> opc, string OpcodeStr,
+ X86SchedWriteWidths sched,
AVX512VLVectorVTInfo _> {
let Predicates = [HasAVX512] in
- defm Z : avx512_vptest<opc, OpcodeStr, OpNode, OpNode_su, sched.ZMM, _.info512, NAME>,
- avx512_vptest_mb<opc, OpcodeStr, OpNode, OpNode_su, sched.ZMM, _.info512>, EVEX_V512;
+ defm Z : avx512_vptest<opc, OpcodeStr, sched.ZMM, _.info512, NAME>,
+ avx512_vptest_mb<opc, OpcodeStr, sched.ZMM, _.info512>, EVEX_V512;
let Predicates = [HasAVX512, HasVLX] in {
- defm Z256 : avx512_vptest<opc, OpcodeStr, OpNode, OpNode_su, sched.YMM, _.info256, NAME>,
- avx512_vptest_mb<opc, OpcodeStr, OpNode, OpNode_su, sched.YMM, _.info256>, EVEX_V256;
- defm Z128 : avx512_vptest<opc, OpcodeStr, OpNode, OpNode_su, sched.XMM, _.info128, NAME>,
- avx512_vptest_mb<opc, OpcodeStr, OpNode, OpNode_su, sched.XMM, _.info128>, EVEX_V128;
- }
- let Predicates = [HasAVX512, NoVLX] in {
- defm Z256_Alt : avx512_vptest_lowering< OpNode, OpNode_su, _.info512, _.info256, NAME>;
- defm Z128_Alt : avx512_vptest_lowering< OpNode, OpNode_su, _.info512, _.info128, NAME>;
+ defm Z256 : avx512_vptest<opc, OpcodeStr, sched.YMM, _.info256, NAME>,
+ avx512_vptest_mb<opc, OpcodeStr, sched.YMM, _.info256>, EVEX_V256;
+ defm Z128 : avx512_vptest<opc, OpcodeStr, sched.XMM, _.info128, NAME>,
+ avx512_vptest_mb<opc, OpcodeStr, sched.XMM, _.info128>, EVEX_V128;
}
}
-multiclass avx512_vptest_dq<bits<8> opc, string OpcodeStr, PatFrag OpNode,
- PatFrag OpNode_su, X86SchedWriteWidths sched> {
- defm D : avx512_vptest_dq_sizes<opc, OpcodeStr#"d", OpNode, OpNode_su, sched,
+multiclass avx512_vptest_dq<bits<8> opc, string OpcodeStr,
+ X86SchedWriteWidths sched> {
+ defm D : avx512_vptest_dq_sizes<opc, OpcodeStr#"d", sched,
avx512vl_i32_info>;
- defm Q : avx512_vptest_dq_sizes<opc, OpcodeStr#"q", OpNode, OpNode_su, sched,
+ defm Q : avx512_vptest_dq_sizes<opc, OpcodeStr#"q", sched,
avx512vl_i64_info>, VEX_W;
}
multiclass avx512_vptest_wb<bits<8> opc, string OpcodeStr,
- PatFrag OpNode, PatFrag OpNode_su,
X86SchedWriteWidths sched> {
let Predicates = [HasBWI] in {
- defm WZ: avx512_vptest<opc, OpcodeStr#"w", OpNode, OpNode_su, sched.ZMM,
+ defm WZ: avx512_vptest<opc, OpcodeStr#"w", sched.ZMM,
v32i16_info, NAME#"W">, EVEX_V512, VEX_W;
- defm BZ: avx512_vptest<opc, OpcodeStr#"b", OpNode, OpNode_su, sched.ZMM,
+ defm BZ: avx512_vptest<opc, OpcodeStr#"b", sched.ZMM,
v64i8_info, NAME#"B">, EVEX_V512;
}
let Predicates = [HasVLX, HasBWI] in {
- defm WZ256: avx512_vptest<opc, OpcodeStr#"w", OpNode, OpNode_su, sched.YMM,
+ defm WZ256: avx512_vptest<opc, OpcodeStr#"w", sched.YMM,
v16i16x_info, NAME#"W">, EVEX_V256, VEX_W;
- defm WZ128: avx512_vptest<opc, OpcodeStr#"w", OpNode, OpNode_su, sched.XMM,
+ defm WZ128: avx512_vptest<opc, OpcodeStr#"w", sched.XMM,
v8i16x_info, NAME#"W">, EVEX_V128, VEX_W;
- defm BZ256: avx512_vptest<opc, OpcodeStr#"b", OpNode, OpNode_su, sched.YMM,
+ defm BZ256: avx512_vptest<opc, OpcodeStr#"b", sched.YMM,
v32i8x_info, NAME#"B">, EVEX_V256;
- defm BZ128: avx512_vptest<opc, OpcodeStr#"b", OpNode, OpNode_su, sched.XMM,
+ defm BZ128: avx512_vptest<opc, OpcodeStr#"b", sched.XMM,
v16i8x_info, NAME#"B">, EVEX_V128;
}
-
- let Predicates = [HasBWI, NoVLX] in {
- defm BZ256_Alt : avx512_vptest_lowering<OpNode, OpNode_su, v64i8_info, v32i8x_info, NAME#"B">;
- defm BZ128_Alt : avx512_vptest_lowering<OpNode, OpNode_su, v64i8_info, v16i8x_info, NAME#"B">;
- defm WZ256_Alt : avx512_vptest_lowering<OpNode, OpNode_su, v32i16_info, v16i16x_info, NAME#"W">;
- defm WZ128_Alt : avx512_vptest_lowering<OpNode, OpNode_su, v32i16_info, v8i16x_info, NAME#"W">;
- }
}
-// These patterns are used to match vptestm/vptestnm. We don't treat pcmpeqm
-// as commutable here because we already canonicalized all zeros vectors to the
-// RHS during lowering.
-def X86pcmpeqm : PatFrag<(ops node:$src1, node:$src2),
- (setcc node:$src1, node:$src2, SETEQ)>;
-def X86pcmpnem : PatFrag<(ops node:$src1, node:$src2),
- (setcc node:$src1, node:$src2, SETNE)>;
-
-def X86pcmpeqm_su : PatFrag<(ops node:$src1, node:$src2),
- (X86pcmpeqm node:$src1, node:$src2), [{
- return N->hasOneUse();
-}]>;
-def X86pcmpnem_su : PatFrag<(ops node:$src1, node:$src2),
- (X86pcmpnem node:$src1, node:$src2), [{
- return N->hasOneUse();
-}]>;
-
multiclass avx512_vptest_all_forms<bits<8> opc_wb, bits<8> opc_dq, string OpcodeStr,
- PatFrag OpNode, PatFrag OpNode_su,
X86SchedWriteWidths sched> :
- avx512_vptest_wb<opc_wb, OpcodeStr, OpNode, OpNode_su, sched>,
- avx512_vptest_dq<opc_dq, OpcodeStr, OpNode, OpNode_su, sched>;
-
-defm VPTESTM : avx512_vptest_all_forms<0x26, 0x27, "vptestm", X86pcmpnem,
- X86pcmpnem_su, SchedWriteVecLogic>, T8PD;
-defm VPTESTNM : avx512_vptest_all_forms<0x26, 0x27, "vptestnm", X86pcmpeqm,
- X86pcmpeqm_su, SchedWriteVecLogic>, T8XS;
-
-
-multiclass avx512_vptest_lowering_pats<string InstrStr, PatFrag OpNode,
- PatFrag OpNode_su, X86VectorVTInfo _,
- X86VectorVTInfo AndInfo> {
- def : Pat<(_.KVT (OpNode (bitconvert
- (AndInfo.VT (and _.RC:$src1, _.RC:$src2))),
- _.ImmAllZerosV)),
- (!cast<Instruction>(InstrStr # "rr") _.RC:$src1, _.RC:$src2)>;
-
- def : Pat<(_.KVT (and _.KRC:$mask,
- (OpNode_su (bitconvert
- (AndInfo.VT (and _.RC:$src1, _.RC:$src2))),
- _.ImmAllZerosV))),
- (!cast<Instruction>(InstrStr # "rrk") _.KRC:$mask, _.RC:$src1,
- _.RC:$src2)>;
-
- def : Pat<(_.KVT (OpNode (bitconvert
- (AndInfo.VT (and _.RC:$src1,
- (AndInfo.LdFrag addr:$src2)))),
- _.ImmAllZerosV)),
- (!cast<Instruction>(InstrStr # "rm") _.RC:$src1, addr:$src2)>;
-
- def : Pat<(_.KVT (and _.KRC:$mask,
- (OpNode_su (bitconvert
- (AndInfo.VT (and _.RC:$src1,
- (AndInfo.LdFrag addr:$src2)))),
- _ .ImmAllZerosV))),
- (!cast<Instruction>(InstrStr # "rmk") _.KRC:$mask, _.RC:$src1,
- addr:$src2)>;
-}
-
-// Patterns to use 512-bit instructions when 128/256 are not available.
-multiclass avx512_vptest_lowering_wide_pats<string InstrStr, PatFrag OpNode,
- PatFrag OpNode_su,
- X86VectorVTInfo _,
- X86VectorVTInfo AndInfo,
- X86VectorVTInfo ExtendInfo> {
- def : Pat<(_.KVT (OpNode (bitconvert
- (AndInfo.VT (and _.RC:$src1, _.RC:$src2))),
- _.ImmAllZerosV)),
- (_.KVT (COPY_TO_REGCLASS
- (!cast<Instruction>(InstrStr#"rr")
- (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
- _.RC:$src1, _.SubRegIdx),
- (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
- _.RC:$src2, _.SubRegIdx)),
- _.KRC))>;
-
- def : Pat<(_.KVT (and _.KRC:$mask,
- (OpNode_su (bitconvert
- (AndInfo.VT (and _.RC:$src1, _.RC:$src2))),
- _.ImmAllZerosV))),
- (COPY_TO_REGCLASS
- (!cast<Instruction>(InstrStr#"rrk")
- (COPY_TO_REGCLASS _.KRC:$mask, ExtendInfo.KRC),
- (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
- _.RC:$src1, _.SubRegIdx),
- (INSERT_SUBREG (ExtendInfo.VT (IMPLICIT_DEF)),
- _.RC:$src2, _.SubRegIdx)),
- _.KRC)>;
-}
-
-multiclass avx512_vptest_lowering_sizes<string InstrStr, PatFrag OpNode,
- PatFrag OpNode_su, Predicate prd,
- AVX512VLVectorVTInfo CmpInfo,
- AVX512VLVectorVTInfo AndInfo> {
-let Predicates = [prd, HasVLX] in {
- defm : avx512_vptest_lowering_pats<InstrStr#"Z128", OpNode, OpNode_su,
- CmpInfo.info128, AndInfo.info128>;
- defm : avx512_vptest_lowering_pats<InstrStr#"Z256", OpNode, OpNode_su,
- CmpInfo.info256, AndInfo.info256>;
-}
-let Predicates = [prd] in {
- defm : avx512_vptest_lowering_pats<InstrStr#"Z", OpNode, OpNode_su,
- CmpInfo.info512, AndInfo.info512>;
-}
-
-let Predicates = [prd, NoVLX] in {
- defm : avx512_vptest_lowering_wide_pats<InstrStr#"Z", OpNode, OpNode_su,
- CmpInfo.info128, AndInfo.info128,
- CmpInfo.info512>;
- defm : avx512_vptest_lowering_wide_pats<InstrStr#"Z", OpNode, OpNode_su,
- CmpInfo.info256, AndInfo.info256,
- CmpInfo.info512>;
-}
-}
-
-multiclass avx512_vptest_lowering_types<string InstrStr, PatFrag OpNode,
- PatFrag OpNode_su> {
- defm : avx512_vptest_lowering_sizes<InstrStr # "B", OpNode, OpNode_su, HasBWI,
- avx512vl_i8_info, avx512vl_i16_info>;
- defm : avx512_vptest_lowering_sizes<InstrStr # "B", OpNode, OpNode_su, HasBWI,
- avx512vl_i8_info, avx512vl_i32_info>;
- defm : avx512_vptest_lowering_sizes<InstrStr # "B", OpNode, OpNode_su, HasBWI,
- avx512vl_i8_info, avx512vl_i64_info>;
-
- defm : avx512_vptest_lowering_sizes<InstrStr # "W", OpNode, OpNode_su, HasBWI,
- avx512vl_i16_info, avx512vl_i8_info>;
- defm : avx512_vptest_lowering_sizes<InstrStr # "W", OpNode, OpNode_su, HasBWI,
- avx512vl_i16_info, avx512vl_i32_info>;
- defm : avx512_vptest_lowering_sizes<InstrStr # "W", OpNode, OpNode_su, HasBWI,
- avx512vl_i16_info, avx512vl_i64_info>;
-
- defm : avx512_vptest_lowering_sizes<InstrStr # "D", OpNode, OpNode_su, HasAVX512,
- avx512vl_i32_info, avx512vl_i8_info>;
- defm : avx512_vptest_lowering_sizes<InstrStr # "D", OpNode, OpNode_su, HasAVX512,
- avx512vl_i32_info, avx512vl_i16_info>;
- defm : avx512_vptest_lowering_sizes<InstrStr # "D", OpNode, OpNode_su, HasAVX512,
- avx512vl_i32_info, avx512vl_i64_info>;
-
- defm : avx512_vptest_lowering_sizes<InstrStr # "Q", OpNode, OpNode_su, HasAVX512,
- avx512vl_i64_info, avx512vl_i8_info>;
- defm : avx512_vptest_lowering_sizes<InstrStr # "Q", OpNode, OpNode_su, HasAVX512,
- avx512vl_i64_info, avx512vl_i16_info>;
- defm : avx512_vptest_lowering_sizes<InstrStr # "Q", OpNode, OpNode_su, HasAVX512,
- avx512vl_i64_info, avx512vl_i32_info>;
-}
+ avx512_vptest_wb<opc_wb, OpcodeStr, sched>,
+ avx512_vptest_dq<opc_dq, OpcodeStr, sched>;
-defm : avx512_vptest_lowering_types<"VPTESTM", X86pcmpnem, X86pcmpnem_su>;
-defm : avx512_vptest_lowering_types<"VPTESTNM", X86pcmpeqm, X86pcmpeqm_su>;
+defm VPTESTM : avx512_vptest_all_forms<0x26, 0x27, "vptestm",
+ SchedWriteVecLogic>, T8PD;
+defm VPTESTNM : avx512_vptest_all_forms<0x26, 0x27, "vptestnm",
+ SchedWriteVecLogic>, T8XS;
//===----------------------------------------------------------------------===//
// AVX-512 Shift instructions
Modified: llvm/trunk/test/CodeGen/X86/avx512-vec-cmp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-vec-cmp.ll?rev=358359&r1=358358&r2=358359&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-vec-cmp.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-vec-cmp.ll Sun Apr 14 11:26:11 2019
@@ -853,8 +853,8 @@ define <8 x double> @test43(<8 x double>
; KNL-NEXT: vpmovzxwq %xmm2, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x34,0xd2]
; KNL-NEXT: ## zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
; KNL-NEXT: vpsllq $63, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0xed,0x48,0x73,0xf2,0x3f]
-; KNL-NEXT: vptestmq %zmm2, %zmm2, %k1 ## encoding: [0x62,0xf2,0xed,0x48,0x27,0xca]
-; KNL-NEXT: vcmpltpd (%rdi){1to8}, %zmm0, %k1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x59,0xc2,0x0f,0x01]
+; KNL-NEXT: vcmpltpd (%rdi){1to8}, %zmm0, %k1 ## encoding: [0x62,0xf1,0xfd,0x58,0xc2,0x0f,0x01]
+; KNL-NEXT: vptestmq %zmm2, %zmm2, %k1 {%k1} ## encoding: [0x62,0xf2,0xed,0x49,0x27,0xca]
; KNL-NEXT: vblendmpd %zmm0, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x49,0x65,0xc0]
; KNL-NEXT: retq ## encoding: [0xc3]
;
Modified: llvm/trunk/test/CodeGen/X86/kshift.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/kshift.ll?rev=358359&r1=358358&r2=358359&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/kshift.ll (original)
+++ llvm/trunk/test/CodeGen/X86/kshift.ll Sun Apr 14 11:26:11 2019
@@ -10,8 +10,8 @@ define i8 @kshiftl_v8i1_1(<8 x i64> %x,
; KNL-NEXT: movb $-2, %al
; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z}
-; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
-; KNL-NEXT: vptestnmq %zmm1, %zmm1, %k0 {%k1}
+; KNL-NEXT: vptestnmq %zmm1, %zmm1, %k1
+; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: # kill: def $al killed $al killed $eax
; KNL-NEXT: vzeroupper
@@ -75,15 +75,15 @@ define i32 @kshiftl_v32i1_1(<32 x i16> %
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; KNL-NEXT: valignd {{.*#+}} zmm0 = zmm0[15],zmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
-; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1
-; KNL-NEXT: kshiftlw $1, %k2, %k2
-; KNL-NEXT: vpcmpeqw %ymm4, %ymm3, %ymm0
-; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT: kshiftlw $1, %k2, %k1
+; KNL-NEXT: vpcmpeqw %ymm4, %ymm3, %ymm1
+; KNL-NEXT: vpmovsxwd %ymm1, %zmm1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k2
; KNL-NEXT: vpcmpeqw %ymm4, %ymm2, %ymm1
; KNL-NEXT: vpmovsxwd %ymm1, %zmm1
-; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 {%k2}
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 {%k1}
; KNL-NEXT: kmovw %k0, %ecx
-; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k1}
+; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k2}
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: shll $16, %eax
; KNL-NEXT: orl %ecx, %eax
@@ -112,38 +112,38 @@ define i64 @kshiftl_v64i1_1(<64 x i8> %x
; KNL-NEXT: vpxor %xmm4, %xmm4, %xmm4
; KNL-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm5
-; KNL-NEXT: vptestmd %zmm5, %zmm5, %k3
+; KNL-NEXT: vptestmd %zmm5, %zmm5, %k1
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k2
; KNL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm0
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
-; KNL-NEXT: vptestmd %zmm1, %zmm1, %k1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k3
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k4
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k4} {z}
-; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k3} {z}
; KNL-NEXT: valignd {{.*#+}} zmm1 = zmm0[15],zmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
-; KNL-NEXT: vptestmd %zmm1, %zmm1, %k1
-; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
-; KNL-NEXT: valignd {{.*#+}} zmm0 = zmm1[15],zmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
-; KNL-NEXT: vptestmd %zmm0, %zmm0, %k2
-; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k3} {z}
-; KNL-NEXT: valignd {{.*#+}} zmm0 = zmm0[15],zmm1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
-; KNL-NEXT: vptestmd %zmm0, %zmm0, %k4
-; KNL-NEXT: kshiftlw $1, %k3, %k3
-; KNL-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm0
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
-; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k2} {z}
+; KNL-NEXT: valignd {{.*#+}} zmm0 = zmm5[15],zmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; KNL-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k1} {z}
+; KNL-NEXT: valignd {{.*#+}} zmm5 = zmm6[15],zmm5[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
+; KNL-NEXT: kshiftlw $1, %k1, %k3
+; KNL-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm3
+; KNL-NEXT: vextracti128 $1, %ymm3, %xmm6
+; KNL-NEXT: vpmovsxbd %xmm6, %zmm6
+; KNL-NEXT: vptestmd %zmm6, %zmm6, %k1
+; KNL-NEXT: vpmovsxbd %xmm3, %zmm3
+; KNL-NEXT: vptestmd %zmm3, %zmm3, %k2
; KNL-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm2
; KNL-NEXT: vextracti128 $1, %ymm2, %xmm3
; KNL-NEXT: vpmovsxbd %xmm3, %zmm3
+; KNL-NEXT: vptestmd %zmm3, %zmm3, %k4
; KNL-NEXT: vpmovsxbd %xmm2, %zmm2
; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0 {%k3}
; KNL-NEXT: kmovw %k0, %eax
-; KNL-NEXT: vptestmd %zmm3, %zmm3, %k0 {%k4}
+; KNL-NEXT: vptestmd %zmm5, %zmm5, %k0 {%k4}
; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: shll $16, %ecx
; KNL-NEXT: orl %eax, %ecx
@@ -182,8 +182,8 @@ define i8 @kshiftl_v8i1_7(<8 x i64> %x,
; KNL-NEXT: movb $-128, %al
; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z}
-; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
-; KNL-NEXT: vptestnmq %zmm1, %zmm1, %k0 {%k1}
+; KNL-NEXT: vptestnmq %zmm1, %zmm1, %k1
+; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: # kill: def $al killed $al killed $eax
; KNL-NEXT: vzeroupper
@@ -308,8 +308,8 @@ define i8 @kshiftr_v8i1_1(<8 x i64> %x,
; KNL-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; KNL-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,2,3,4,5,6,7,15]
; KNL-NEXT: vpermi2q %zmm0, %zmm2, %zmm3
-; KNL-NEXT: vptestmq %zmm3, %zmm3, %k1
-; KNL-NEXT: vptestnmq %zmm1, %zmm1, %k0 {%k1}
+; KNL-NEXT: vptestnmq %zmm1, %zmm1, %k1
+; KNL-NEXT: vptestmq %zmm3, %zmm3, %k0 {%k1}
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: # kill: def $al killed $al killed $eax
; KNL-NEXT: vzeroupper
@@ -373,10 +373,10 @@ define i32 @kshiftr_v32i1_1(<32 x i16> %
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; KNL-NEXT: valignd {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm1[0]
-; KNL-NEXT: vptestmd %zmm0, %zmm0, %k2
; KNL-NEXT: kshiftrw $1, %k1, %k1
-; KNL-NEXT: vpcmpeqw %ymm4, %ymm2, %ymm0
-; KNL-NEXT: vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT: vpcmpeqw %ymm4, %ymm2, %ymm1
+; KNL-NEXT: vpmovsxwd %ymm1, %zmm1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k2
; KNL-NEXT: vpcmpeqw %ymm4, %ymm3, %ymm1
; KNL-NEXT: vpmovsxwd %ymm1, %zmm1
; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 {%k1}
@@ -411,44 +411,44 @@ define i64 @kshiftr_v64i1_1(<64 x i8> %x
; KNL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm1
; KNL-NEXT: vextracti128 $1, %ymm1, %xmm5
; KNL-NEXT: vpmovsxbd %xmm5, %zmm5
-; KNL-NEXT: vptestmd %zmm5, %zmm5, %k3
+; KNL-NEXT: vptestmd %zmm5, %zmm5, %k1
; KNL-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm5
; KNL-NEXT: vptestmd %zmm5, %zmm5, %k2
; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
-; KNL-NEXT: vptestmd %zmm1, %zmm1, %k1
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k3
; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0
; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k4
; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k4} {z}
-; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k3} {z}
; KNL-NEXT: valignd {{.*#+}} zmm5 = zmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm1[0]
-; KNL-NEXT: vptestmd %zmm5, %zmm5, %k1
-; KNL-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k2} {z}
-; KNL-NEXT: valignd {{.*#+}} zmm0 = zmm5[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0]
-; KNL-NEXT: vptestmd %zmm0, %zmm0, %k2
-; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k3} {z}
-; KNL-NEXT: valignd {{.*#+}} zmm0 = zmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0]
-; KNL-NEXT: vptestmd %zmm0, %zmm0, %k4
-; KNL-NEXT: kshiftrw $1, %k3, %k3
-; KNL-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm0
-; KNL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; KNL-NEXT: vpmovsxbd %xmm1, %zmm1
-; KNL-NEXT: vpmovsxbd %xmm0, %zmm0
+; KNL-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k2} {z}
+; KNL-NEXT: valignd {{.*#+}} zmm0 = zmm6[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0]
+; KNL-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k1} {z}
+; KNL-NEXT: valignd {{.*#+}} zmm1 = zmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm6[0]
+; KNL-NEXT: kshiftrw $1, %k1, %k3
+; KNL-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm2
+; KNL-NEXT: vextracti128 $1, %ymm2, %xmm6
+; KNL-NEXT: vpmovsxbd %xmm6, %zmm6
+; KNL-NEXT: vptestmd %zmm6, %zmm6, %k1
+; KNL-NEXT: vpmovsxbd %xmm2, %zmm2
+; KNL-NEXT: vptestmd %zmm2, %zmm2, %k2
; KNL-NEXT: vpcmpeqb %ymm4, %ymm3, %ymm2
; KNL-NEXT: vpmovsxbd %xmm2, %zmm3
+; KNL-NEXT: vptestmd %zmm3, %zmm3, %k4
; KNL-NEXT: vextracti128 $1, %ymm2, %xmm2
; KNL-NEXT: vpmovsxbd %xmm2, %zmm2
; KNL-NEXT: vptestmd %zmm2, %zmm2, %k0 {%k3}
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: shll $16, %eax
-; KNL-NEXT: vptestmd %zmm3, %zmm3, %k0 {%k4}
+; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 {%k4}
; KNL-NEXT: kmovw %k0, %ecx
; KNL-NEXT: orl %eax, %ecx
; KNL-NEXT: shlq $32, %rcx
; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 {%k2}
; KNL-NEXT: kmovw %k0, %edx
-; KNL-NEXT: vptestmd %zmm1, %zmm1, %k0 {%k1}
+; KNL-NEXT: vptestmd %zmm5, %zmm5, %k0 {%k1}
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: shll $16, %eax
; KNL-NEXT: orl %edx, %eax
@@ -480,8 +480,8 @@ define i8 @kshiftr_v8i1_7(<8 x i64> %x,
; KNL-NEXT: movb $-2, %al
; KNL-NEXT: kmovw %eax, %k1
; KNL-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z}
-; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
-; KNL-NEXT: vptestnmq %zmm1, %zmm1, %k0 {%k1}
+; KNL-NEXT: vptestnmq %zmm1, %zmm1, %k1
+; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: # kill: def $al killed $al killed $eax
; KNL-NEXT: vzeroupper
@@ -605,8 +605,8 @@ define i8 @kshiftl_v8i1_zu123u56(<8 x i6
; KNL-NEXT: vmovdqa64 {{.*#+}} zmm3 = <8,u,1,2,3,u,5,6>
; KNL-NEXT: vpermi2q %zmm0, %zmm2, %zmm3
; KNL-NEXT: vpsllq $63, %zmm3, %zmm0
-; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
-; KNL-NEXT: vptestnmq %zmm1, %zmm1, %k0 {%k1}
+; KNL-NEXT: vptestnmq %zmm1, %zmm1, %k1
+; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: # kill: def $al killed $al killed $eax
; KNL-NEXT: vzeroupper
@@ -636,8 +636,8 @@ define i8 @kshiftl_v8i1_u0123456(<8 x i6
; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: valignq {{.*#+}} zmm0 = zmm0[7,0,1,2,3,4,5,6]
; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
-; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
-; KNL-NEXT: vptestnmq %zmm1, %zmm1, %k0 {%k1}
+; KNL-NEXT: vptestnmq %zmm1, %zmm1, %k1
+; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: # kill: def $al killed $al killed $eax
; KNL-NEXT: vzeroupper
@@ -669,8 +669,8 @@ define i8 @kshiftr_v8i1_1u3u567z(<8 x i6
; KNL-NEXT: vmovdqa64 {{.*#+}} zmm3 = <1,u,3,u,5,6,7,15>
; KNL-NEXT: vpermi2q %zmm0, %zmm2, %zmm3
; KNL-NEXT: vpsllq $63, %zmm3, %zmm0
-; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
-; KNL-NEXT: vptestnmq %zmm1, %zmm1, %k0 {%k1}
+; KNL-NEXT: vptestnmq %zmm1, %zmm1, %k1
+; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: # kill: def $al killed $al killed $eax
; KNL-NEXT: vzeroupper
@@ -700,8 +700,8 @@ define i8 @kshiftr_v8i1_234567uu(<8 x i6
; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; KNL-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,4,5,6,7,0,1]
; KNL-NEXT: vpsllq $63, %zmm0, %zmm0
-; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
-; KNL-NEXT: vptestnmq %zmm1, %zmm1, %k0 {%k1}
+; KNL-NEXT: vptestnmq %zmm1, %zmm1, %k1
+; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 {%k1}
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: # kill: def $al killed $al killed $eax
; KNL-NEXT: vzeroupper
Modified: llvm/trunk/test/CodeGen/X86/movmsk-cmp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/movmsk-cmp.ll?rev=358359&r1=358358&r2=358359&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/movmsk-cmp.ll (original)
+++ llvm/trunk/test/CodeGen/X86/movmsk-cmp.ll Sun Apr 14 11:26:11 2019
@@ -2087,8 +2087,7 @@ define i1 @allones_v4i32_and1(<4 x i32>
; KNL-LABEL: allones_v4i32_and1:
; KNL: # %bb.0:
; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; KNL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
-; KNL-NEXT: vptestmd %zmm1, %zmm0, %k0
+; KNL-NEXT: vptestmd {{.*}}(%rip){1to16}, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: andb $15, %al
; KNL-NEXT: cmpb $15, %al
@@ -2131,8 +2130,7 @@ define i1 @allzeros_v4i32_and1(<4 x i32>
; KNL-LABEL: allzeros_v4i32_and1:
; KNL: # %bb.0:
; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; KNL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
-; KNL-NEXT: vptestmd %zmm1, %zmm0, %k0
+; KNL-NEXT: vptestmd {{.*}}(%rip){1to16}, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: testb $15, %al
; KNL-NEXT: sete %al
@@ -2192,8 +2190,7 @@ define i1 @allones_v8i32_and1(<8 x i32>
; KNL-LABEL: allones_v8i32_and1:
; KNL: # %bb.0:
; KNL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; KNL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
-; KNL-NEXT: vptestmd %zmm1, %zmm0, %k0
+; KNL-NEXT: vptestmd {{.*}}(%rip){1to16}, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: cmpb $-1, %al
; KNL-NEXT: sete %al
@@ -2253,8 +2250,7 @@ define i1 @allzeros_v8i32_and1(<8 x i32>
; KNL-LABEL: allzeros_v8i32_and1:
; KNL: # %bb.0:
; KNL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; KNL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1]
-; KNL-NEXT: vptestmd %zmm1, %zmm0, %k0
+; KNL-NEXT: vptestmd {{.*}}(%rip){1to16}, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: testb %al, %al
; KNL-NEXT: sete %al
@@ -2573,8 +2569,7 @@ define i1 @allones_v4i64_and1(<4 x i64>
; KNL-LABEL: allones_v4i64_and1:
; KNL: # %bb.0:
; KNL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; KNL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1]
-; KNL-NEXT: vptestmq %zmm1, %zmm0, %k0
+; KNL-NEXT: vptestmq {{.*}}(%rip){1to8}, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: andb $15, %al
; KNL-NEXT: cmpb $15, %al
@@ -2640,8 +2635,7 @@ define i1 @allzeros_v4i64_and1(<4 x i64>
; KNL-LABEL: allzeros_v4i64_and1:
; KNL: # %bb.0:
; KNL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; KNL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1]
-; KNL-NEXT: vptestmq %zmm1, %zmm0, %k0
+; KNL-NEXT: vptestmq {{.*}}(%rip){1to8}, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: testb $15, %al
; KNL-NEXT: sete %al
@@ -3686,8 +3680,7 @@ define i1 @allones_v4i32_and4(<4 x i32>
; KNL-LABEL: allones_v4i32_and4:
; KNL: # %bb.0:
; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; KNL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4,4,4,4]
-; KNL-NEXT: vptestmd %zmm1, %zmm0, %k0
+; KNL-NEXT: vptestmd {{.*}}(%rip){1to16}, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: andb $15, %al
; KNL-NEXT: cmpb $15, %al
@@ -3730,8 +3723,7 @@ define i1 @allzeros_v4i32_and4(<4 x i32>
; KNL-LABEL: allzeros_v4i32_and4:
; KNL: # %bb.0:
; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; KNL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4,4,4,4]
-; KNL-NEXT: vptestmd %zmm1, %zmm0, %k0
+; KNL-NEXT: vptestmd {{.*}}(%rip){1to16}, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: testb $15, %al
; KNL-NEXT: sete %al
@@ -3791,8 +3783,7 @@ define i1 @allones_v8i32_and4(<8 x i32>
; KNL-LABEL: allones_v8i32_and4:
; KNL: # %bb.0:
; KNL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; KNL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4]
-; KNL-NEXT: vptestmd %zmm1, %zmm0, %k0
+; KNL-NEXT: vptestmd {{.*}}(%rip){1to16}, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: cmpb $-1, %al
; KNL-NEXT: sete %al
@@ -3852,8 +3843,7 @@ define i1 @allzeros_v8i32_and4(<8 x i32>
; KNL-LABEL: allzeros_v8i32_and4:
; KNL: # %bb.0:
; KNL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; KNL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4]
-; KNL-NEXT: vptestmd %zmm1, %zmm0, %k0
+; KNL-NEXT: vptestmd {{.*}}(%rip){1to16}, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: testb %al, %al
; KNL-NEXT: sete %al
@@ -4172,8 +4162,7 @@ define i1 @allones_v4i64_and4(<4 x i64>
; KNL-LABEL: allones_v4i64_and4:
; KNL: # %bb.0:
; KNL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; KNL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4,4,4,4]
-; KNL-NEXT: vptestmq %zmm1, %zmm0, %k0
+; KNL-NEXT: vptestmq {{.*}}(%rip){1to8}, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: andb $15, %al
; KNL-NEXT: cmpb $15, %al
@@ -4239,8 +4228,7 @@ define i1 @allzeros_v4i64_and4(<4 x i64>
; KNL-LABEL: allzeros_v4i64_and4:
; KNL: # %bb.0:
; KNL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; KNL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4,4,4,4]
-; KNL-NEXT: vptestmq %zmm1, %zmm0, %k0
+; KNL-NEXT: vptestmq {{.*}}(%rip){1to8}, %zmm0, %k0
; KNL-NEXT: kmovw %k0, %eax
; KNL-NEXT: testb $15, %al
; KNL-NEXT: sete %al
Modified: llvm/trunk/test/CodeGen/X86/prefer-avx256-mask-extend.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/prefer-avx256-mask-extend.ll?rev=358359&r1=358358&r2=358359&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/prefer-avx256-mask-extend.ll (original)
+++ llvm/trunk/test/CodeGen/X86/prefer-avx256-mask-extend.ll Sun Apr 14 11:26:11 2019
@@ -7,8 +7,8 @@
define <8 x i16> @testv8i1_sext_v8i16(<8 x i32>* %p) {
; AVX256-LABEL: testv8i1_sext_v8i16:
; AVX256: # %bb.0:
-; AVX256-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX256-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
+; AVX256-NEXT: vmovdqa (%rdi), %ymm0
+; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k1
; AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX256-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX256-NEXT: vpmovdw %ymm0, %xmm0
@@ -17,8 +17,8 @@ define <8 x i16> @testv8i1_sext_v8i16(<8
;
; AVX512VL-LABEL: testv8i1_sext_v8i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
+; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1
; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
@@ -42,9 +42,10 @@ define <8 x i16> @testv8i1_sext_v8i16(<8
define <16 x i8> @testv16i1_sext_v16i8(<8 x i32>* %p, <8 x i32>* %q) {
; AVX256-LABEL: testv16i1_sext_v16i8:
; AVX256: # %bb.0:
-; AVX256-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX256-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
-; AVX256-NEXT: vpcmpeqd (%rsi), %ymm0, %k2
+; AVX256-NEXT: vmovdqa (%rdi), %ymm0
+; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k1
+; AVX256-NEXT: vmovdqa (%rsi), %ymm0
+; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k2
; AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX256-NEXT: vmovdqa32 %ymm0, %ymm1 {%k2} {z}
; AVX256-NEXT: vpmovdw %ymm1, %xmm1
@@ -56,9 +57,10 @@ define <16 x i8> @testv16i1_sext_v16i8(<
;
; AVX512VL-LABEL: testv16i1_sext_v16i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vpcmpeqd (%rdi), %ymm0, %k0
-; AVX512VL-NEXT: vpcmpeqd (%rsi), %ymm0, %k1
+; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k0
+; AVX512VL-NEXT: vmovdqa (%rsi), %ymm0
+; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1
; AVX512VL-NEXT: kunpckbw %k0, %k1, %k1
; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
@@ -88,9 +90,10 @@ define <16 x i8> @testv16i1_sext_v16i8(<
define <16 x i16> @testv16i1_sext_v16i16(<8 x i32>* %p, <8 x i32>* %q) {
; AVX256-LABEL: testv16i1_sext_v16i16:
; AVX256: # %bb.0:
-; AVX256-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX256-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
-; AVX256-NEXT: vpcmpeqd (%rsi), %ymm0, %k2
+; AVX256-NEXT: vmovdqa (%rdi), %ymm0
+; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k1
+; AVX256-NEXT: vmovdqa (%rsi), %ymm0
+; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k2
; AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX256-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z}
; AVX256-NEXT: vpmovdw %ymm1, %xmm1
@@ -101,9 +104,10 @@ define <16 x i16> @testv16i1_sext_v16i16
;
; AVX512VL-LABEL: testv16i1_sext_v16i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vpcmpeqd (%rdi), %ymm0, %k0
-; AVX512VL-NEXT: vpcmpeqd (%rsi), %ymm0, %k1
+; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k0
+; AVX512VL-NEXT: vmovdqa (%rsi), %ymm0
+; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1
; AVX512VL-NEXT: kunpckbw %k0, %k1, %k1
; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
@@ -131,8 +135,8 @@ define <16 x i16> @testv16i1_sext_v16i16
define <8 x i16> @testv8i1_zext_v8i16(<8 x i32>* %p) {
; AVX256-LABEL: testv8i1_zext_v8i16:
; AVX256: # %bb.0:
-; AVX256-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX256-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
+; AVX256-NEXT: vmovdqa (%rdi), %ymm0
+; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k1
; AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX256-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX256-NEXT: vpmovdw %ymm0, %xmm0
@@ -142,8 +146,8 @@ define <8 x i16> @testv8i1_zext_v8i16(<8
;
; AVX512VL-LABEL: testv8i1_zext_v8i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
+; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1
; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
@@ -168,9 +172,10 @@ define <8 x i16> @testv8i1_zext_v8i16(<8
define <16 x i8> @testv16i1_zext_v16i8(<8 x i32>* %p, <8 x i32>* %q) {
; AVX256-LABEL: testv16i1_zext_v16i8:
; AVX256: # %bb.0:
-; AVX256-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX256-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
-; AVX256-NEXT: vpcmpeqd (%rsi), %ymm0, %k2
+; AVX256-NEXT: vmovdqa (%rdi), %ymm0
+; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k1
+; AVX256-NEXT: vmovdqa (%rsi), %ymm0
+; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k2
; AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX256-NEXT: vmovdqa32 %ymm0, %ymm1 {%k2} {z}
; AVX256-NEXT: vpmovdw %ymm1, %xmm1
@@ -184,9 +189,10 @@ define <16 x i8> @testv16i1_zext_v16i8(<
;
; AVX512VL-LABEL: testv16i1_zext_v16i8:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vpcmpeqd (%rdi), %ymm0, %k0
-; AVX512VL-NEXT: vpcmpeqd (%rsi), %ymm0, %k1
+; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k0
+; AVX512VL-NEXT: vmovdqa (%rsi), %ymm0
+; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1
; AVX512VL-NEXT: kunpckbw %k0, %k1, %k1
; AVX512VL-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
@@ -216,9 +222,10 @@ define <16 x i8> @testv16i1_zext_v16i8(<
define <16 x i16> @testv16i1_zext_v16i16(<8 x i32>* %p, <8 x i32>* %q) {
; AVX256-LABEL: testv16i1_zext_v16i16:
; AVX256: # %bb.0:
-; AVX256-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX256-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
-; AVX256-NEXT: vpcmpeqd (%rsi), %ymm0, %k2
+; AVX256-NEXT: vmovdqa (%rdi), %ymm0
+; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k1
+; AVX256-NEXT: vmovdqa (%rsi), %ymm0
+; AVX256-NEXT: vptestnmd %ymm0, %ymm0, %k2
; AVX256-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX256-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z}
; AVX256-NEXT: vpmovdw %ymm1, %xmm1
@@ -230,9 +237,10 @@ define <16 x i16> @testv16i1_zext_v16i16
;
; AVX512VL-LABEL: testv16i1_zext_v16i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vpcmpeqd (%rdi), %ymm0, %k0
-; AVX512VL-NEXT: vpcmpeqd (%rsi), %ymm0, %k1
+; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k0
+; AVX512VL-NEXT: vmovdqa (%rsi), %ymm0
+; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1
; AVX512VL-NEXT: kunpckbw %k0, %k1, %k1
; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0
Modified: llvm/trunk/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll?rev=358359&r1=358358&r2=358359&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll (original)
+++ llvm/trunk/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll Sun Apr 14 11:26:11 2019
@@ -11,9 +11,10 @@
define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<8 x i32>* %a, <8 x i32>* %b) {
; AVX256VL-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
; AVX256VL: # %bb.0:
-; AVX256VL-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX256VL-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
-; AVX256VL-NEXT: vpcmpeqd (%rsi), %ymm0, %k2
+; AVX256VL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX256VL-NEXT: vmovdqa (%rsi), %ymm1
+; AVX256VL-NEXT: vptestnmd %ymm0, %ymm0, %k1
+; AVX256VL-NEXT: vptestnmd %ymm1, %ymm1, %k2
; AVX256VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k2} {z}
; AVX256VL-NEXT: vpmovdw %ymm1, %xmm1
@@ -42,9 +43,10 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7
;
; AVX512VL-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
-; AVX512VL-NEXT: vpcmpeqd (%rsi), %ymm0, %k2
+; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT: vmovdqa (%rsi), %ymm1
+; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1
+; AVX512VL-NEXT: vptestnmd %ymm1, %ymm1, %k2
; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512VL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0]
@@ -57,9 +59,10 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7
;
; AVX256VLBW-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
; AVX256VLBW: # %bb.0:
-; AVX256VLBW-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX256VLBW-NEXT: vpcmpeqd (%rdi), %ymm0, %k0
-; AVX256VLBW-NEXT: vpcmpeqd (%rsi), %ymm0, %k1
+; AVX256VLBW-NEXT: vmovdqa (%rdi), %ymm0
+; AVX256VLBW-NEXT: vmovdqa (%rsi), %ymm1
+; AVX256VLBW-NEXT: vptestnmd %ymm0, %ymm0, %k0
+; AVX256VLBW-NEXT: vptestnmd %ymm1, %ymm1, %k1
; AVX256VLBW-NEXT: vpmovm2w %k1, %ymm0
; AVX256VLBW-NEXT: vpmovm2w %k0, %ymm1
; AVX256VLBW-NEXT: vmovdqa {{.*#+}} ymm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0]
@@ -71,9 +74,10 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7
;
; AVX512VLBW-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512VLBW-NEXT: vpcmpeqd (%rdi), %ymm0, %k1
-; AVX512VLBW-NEXT: vpcmpeqd (%rsi), %ymm0, %k2
+; AVX512VLBW-NEXT: vmovdqa (%rdi), %ymm0
+; AVX512VLBW-NEXT: vmovdqa (%rsi), %ymm1
+; AVX512VLBW-NEXT: vptestnmd %ymm0, %ymm0, %k1
+; AVX512VLBW-NEXT: vptestnmd %ymm1, %ymm1, %k2
; AVX512VLBW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
; AVX512VLBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0]
Modified: llvm/trunk/test/CodeGen/X86/setcc-lowering.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/setcc-lowering.ll?rev=358359&r1=358358&r2=358359&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/setcc-lowering.ll (original)
+++ llvm/trunk/test/CodeGen/X86/setcc-lowering.ll Sun Apr 14 11:26:11 2019
@@ -24,8 +24,7 @@ define <8 x i16> @pr25080(<8 x i32> %a)
; KNL-32-LABEL: pr25080:
; KNL-32: # %bb.0: # %entry
; KNL-32-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; KNL-32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [8388607,8388607,8388607,8388607,8388607,8388607,8388607,8388607]
-; KNL-32-NEXT: vptestnmd %zmm1, %zmm0, %k0
+; KNL-32-NEXT: vptestnmd {{\.LCPI.*}}{1to16}, %zmm0, %k0
; KNL-32-NEXT: movb $15, %al
; KNL-32-NEXT: kmovw %eax, %k1
; KNL-32-NEXT: korw %k1, %k0, %k1
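
For illustration only (not a test from this commit): the {1to16} operand in
the hunk above shows the splat mask constant folding straight into
vptestnmd's broadcast-memory form, with the ymm input widened to zmm on
KNL. A reduced .ll pattern of that shape would be:

  define <8 x i1> @and_splat_eq_zero(<8 x i32> %x) {
    ; the broadcastable constant can fold into vptestnmd mem{1to16}
    %a = and <8 x i32> %x, <i32 8388607, i32 8388607, i32 8388607, i32 8388607, i32 8388607, i32 8388607, i32 8388607, i32 8388607>
    %c = icmp eq <8 x i32> %a, zeroinitializer
    ret <8 x i1> %c
  }
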
Modified: llvm/trunk/test/CodeGen/X86/vector-fshl-128.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-fshl-128.ll?rev=358359&r1=358358&r2=358359&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-fshl-128.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-fshl-128.ll Sun Apr 14 11:26:11 2019
@@ -108,16 +108,14 @@ define <2 x i64> @var_funnnel_v2i64(<2 x
;
; AVX512F-LABEL: var_funnnel_v2i64:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
-; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512F-NEXT: vpsllvq %xmm4, %xmm0, %xmm5
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512F-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512F-NEXT: vpsllvq %xmm2, %xmm0, %xmm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512F-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512F-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
-; AVX512F-NEXT: vpor %xmm1, %xmm5, %xmm1
-; AVX512F-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512F-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX512F-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512F-NEXT: vmovdqa %xmm1, %xmm0
; AVX512F-NEXT: vzeroupper
@@ -125,30 +123,27 @@ define <2 x i64> @var_funnnel_v2i64(<2 x
;
; AVX512VL-LABEL: var_funnnel_v2i64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
-; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VL-NEXT: vpsllvq %xmm4, %xmm0, %xmm5
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VL-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VL-NEXT: vpsllvq %xmm2, %xmm0, %xmm3
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VL-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512VL-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
-; AVX512VL-NEXT: vpor %xmm1, %xmm5, %xmm1
-; AVX512VL-NEXT: vptestnmq %xmm3, %xmm2, %k1
+; AVX512VL-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX512VL-NEXT: vptestnmq %xmm2, %xmm2, %k1
; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1}
; AVX512VL-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v2i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
-; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512BW-NEXT: vpsllvq %xmm4, %xmm0, %xmm5
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512BW-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512BW-NEXT: vpsllvq %xmm2, %xmm0, %xmm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512BW-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512BW-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
-; AVX512BW-NEXT: vpor %xmm1, %xmm5, %xmm1
-; AVX512BW-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX512BW-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0
; AVX512BW-NEXT: vzeroupper
@@ -156,16 +151,14 @@ define <2 x i64> @var_funnnel_v2i64(<2 x
;
; AVX512VBMI2-LABEL: var_funnnel_v2i64:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
-; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VBMI2-NEXT: vpsllvq %xmm4, %xmm0, %xmm5
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VBMI2-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vpsllvq %xmm2, %xmm0, %xmm3
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VBMI2-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512VBMI2-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
-; AVX512VBMI2-NEXT: vpor %xmm1, %xmm5, %xmm1
-; AVX512VBMI2-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX512VBMI2-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VBMI2-NEXT: vzeroupper
@@ -173,14 +166,13 @@ define <2 x i64> @var_funnnel_v2i64(<2 x
;
; AVX512VLBW-LABEL: var_funnnel_v2i64:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
-; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VLBW-NEXT: vpsllvq %xmm4, %xmm0, %xmm5
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VLBW-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpsllvq %xmm2, %xmm0, %xmm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VLBW-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1
-; AVX512VLBW-NEXT: vpor %xmm1, %xmm5, %xmm1
-; AVX512VLBW-NEXT: vptestnmq %xmm3, %xmm2, %k1
+; AVX512VLBW-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX512VLBW-NEXT: vptestnmq %xmm2, %xmm2, %k1
; AVX512VLBW-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1}
; AVX512VLBW-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VLBW-NEXT: retq
@@ -358,16 +350,15 @@ define <4 x i32> @var_funnnel_v4i32(<4 x
;
; AVX512F-LABEL: var_funnnel_v4i32:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512F-NEXT: vpsllvd %xmm4, %xmm0, %xmm5
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512F-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT: vpsllvd %xmm2, %xmm0, %xmm3
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512F-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512F-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
-; AVX512F-NEXT: vpor %xmm1, %xmm5, %xmm1
-; AVX512F-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512F-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512F-NEXT: vmovdqa %xmm1, %xmm0
; AVX512F-NEXT: vzeroupper
@@ -375,30 +366,28 @@ define <4 x i32> @var_funnnel_v4i32(<4 x
;
; AVX512VL-LABEL: var_funnnel_v4i32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VL-NEXT: vpsllvd %xmm4, %xmm0, %xmm5
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VL-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2
+; AVX512VL-NEXT: vpsllvd %xmm2, %xmm0, %xmm3
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VL-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512VL-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
-; AVX512VL-NEXT: vpor %xmm1, %xmm5, %xmm1
-; AVX512VL-NEXT: vptestnmd %xmm3, %xmm2, %k1
+; AVX512VL-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX512VL-NEXT: vptestnmd %xmm2, %xmm2, %k1
; AVX512VL-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
; AVX512VL-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v4i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512BW-NEXT: vpsllvd %xmm4, %xmm0, %xmm5
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512BW-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpsllvd %xmm2, %xmm0, %xmm3
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512BW-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512BW-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
-; AVX512BW-NEXT: vpor %xmm1, %xmm5, %xmm1
-; AVX512BW-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0
; AVX512BW-NEXT: vzeroupper
@@ -406,16 +395,15 @@ define <4 x i32> @var_funnnel_v4i32(<4 x
;
; AVX512VBMI2-LABEL: var_funnnel_v4i32:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VBMI2-NEXT: vpsllvd %xmm4, %xmm0, %xmm5
-; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VBMI2-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vpsllvd %xmm2, %xmm0, %xmm3
+; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VBMI2-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512VBMI2-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
-; AVX512VBMI2-NEXT: vpor %xmm1, %xmm5, %xmm1
-; AVX512VBMI2-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX512VBMI2-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VBMI2-NEXT: vzeroupper
@@ -423,14 +411,13 @@ define <4 x i32> @var_funnnel_v4i32(<4 x
;
; AVX512VLBW-LABEL: var_funnnel_v4i32:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VLBW-NEXT: vpsllvd %xmm4, %xmm0, %xmm5
-; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VLBW-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpsllvd %xmm2, %xmm0, %xmm3
+; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VLBW-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpsrlvd %xmm4, %xmm1, %xmm1
-; AVX512VLBW-NEXT: vpor %xmm1, %xmm5, %xmm1
-; AVX512VLBW-NEXT: vptestnmd %xmm3, %xmm2, %k1
+; AVX512VLBW-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX512VLBW-NEXT: vptestnmd %xmm2, %xmm2, %k1
; AVX512VLBW-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
; AVX512VLBW-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VLBW-NEXT: retq
@@ -719,17 +706,15 @@ define <8 x i16> @var_funnnel_v8i16(<8 x
;
; AVX512BW-LABEL: var_funnnel_v8i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512BW-NEXT: vpsllvw %zmm4, %zmm0, %zmm5
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512BW-NEXT: vpsubw %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT: vpsubw %xmm2, %xmm4, %xmm4
; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
-; AVX512BW-NEXT: vpor %xmm1, %xmm5, %xmm1
-; AVX512BW-NEXT: vptestnmw %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0
; AVX512BW-NEXT: vzeroupper
@@ -737,17 +722,15 @@ define <8 x i16> @var_funnnel_v8i16(<8 x
;
; AVX512VBMI2-LABEL: var_funnnel_v8i16:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
-; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm0, %zmm5
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512VBMI2-NEXT: vpsubw %xmm4, %xmm6, %xmm4
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vpsllvw %zmm2, %zmm0, %zmm3
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512VBMI2-NEXT: vpsubw %xmm2, %xmm4, %xmm4
; AVX512VBMI2-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
-; AVX512VBMI2-NEXT: vpor %xmm1, %xmm5, %xmm1
-; AVX512VBMI2-NEXT: vptestnmw %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX512VBMI2-NEXT: vptestnmw %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VBMI2-NEXT: vzeroupper
@@ -755,14 +738,13 @@ define <8 x i16> @var_funnnel_v8i16(<8 x
;
; AVX512VLBW-LABEL: var_funnnel_v8i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
-; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VLBW-NEXT: vpsllvw %xmm4, %xmm0, %xmm5
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512VLBW-NEXT: vpsubw %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpsllvw %xmm2, %xmm0, %xmm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT: vpsubw %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpsrlvw %xmm4, %xmm1, %xmm1
-; AVX512VLBW-NEXT: vpor %xmm1, %xmm5, %xmm1
-; AVX512VLBW-NEXT: vptestnmw %xmm3, %xmm2, %k1
+; AVX512VLBW-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX512VLBW-NEXT: vptestnmw %xmm2, %xmm2, %k1
; AVX512VLBW-NEXT: vmovdqu16 %xmm0, %xmm1 {%k1}
; AVX512VLBW-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VLBW-NEXT: retq
@@ -1040,21 +1022,19 @@ define <16 x i8> @var_funnnel_v16i8(<16
;
; AVX512BW-LABEL: var_funnnel_v16i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512BW-NEXT: vpand %xmm4, %xmm2, %xmm5
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
-; AVX512BW-NEXT: vpsllvw %zmm6, %zmm3, %zmm3
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512BW-NEXT: vpsllvw %zmm4, %zmm3, %zmm3
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT: vpsubb %xmm5, %xmm6, %xmm5
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
-; AVX512BW-NEXT: vpsrlvw %zmm5, %zmm1, %zmm1
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
+; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
; AVX512BW-NEXT: vpor %ymm1, %ymm3, %ymm1
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
-; AVX512BW-NEXT: vptestnmb %zmm4, %zmm2, %k1
+; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0
; AVX512BW-NEXT: vzeroupper
@@ -1062,21 +1042,19 @@ define <16 x i8> @var_funnnel_v16i8(<16
;
; AVX512VBMI2-LABEL: var_funnnel_v16i8:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VBMI2-NEXT: vpand %xmm4, %xmm2, %xmm5
-; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
-; AVX512VBMI2-NEXT: vpsllvw %zmm6, %zmm3, %zmm3
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm3, %zmm3
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VBMI2-NEXT: vpsubb %xmm5, %xmm6, %xmm5
-; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
-; AVX512VBMI2-NEXT: vpsrlvw %zmm5, %zmm1, %zmm1
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
+; AVX512VBMI2-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
; AVX512VBMI2-NEXT: vpor %ymm1, %ymm3, %ymm1
; AVX512VBMI2-NEXT: vpmovwb %zmm1, %ymm1
-; AVX512VBMI2-NEXT: vptestnmb %zmm4, %zmm2, %k1
+; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VBMI2-NEXT: vzeroupper
@@ -1084,19 +1062,18 @@ define <16 x i8> @var_funnnel_v16i8(<16
;
; AVX512VLBW-LABEL: var_funnnel_v16i8:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
-; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512VLBW-NEXT: vpsllvw %ymm5, %ymm6, %ymm5
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLBW-NEXT: vpsubb %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512VLBW-NEXT: vpsllvw %ymm4, %ymm3, %ymm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512VLBW-NEXT: vpsrlvw %ymm4, %ymm1, %ymm1
-; AVX512VLBW-NEXT: vpor %ymm1, %ymm5, %ymm1
+; AVX512VLBW-NEXT: vpor %ymm1, %ymm3, %ymm1
; AVX512VLBW-NEXT: vpmovwb %ymm1, %xmm1
-; AVX512VLBW-NEXT: vptestnmb %xmm3, %xmm2, %k1
+; AVX512VLBW-NEXT: vptestnmb %xmm2, %xmm2, %k1
; AVX512VLBW-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1}
; AVX512VLBW-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VLBW-NEXT: vzeroupper
@@ -1104,19 +1081,18 @@ define <16 x i8> @var_funnnel_v16i8(<16
;
; AVX512VLVBMI2-LABEL: var_funnnel_v16i8:
; AVX512VLVBMI2: # %bb.0:
-; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLVBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
-; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512VLVBMI2-NEXT: vpsllvw %ymm5, %ymm6, %ymm5
-; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLVBMI2-NEXT: vpsubb %xmm4, %xmm6, %xmm4
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512VLVBMI2-NEXT: vpsllvw %ymm4, %ymm3, %ymm3
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512VLVBMI2-NEXT: vpsrlvw %ymm4, %ymm1, %ymm1
-; AVX512VLVBMI2-NEXT: vpor %ymm1, %ymm5, %ymm1
+; AVX512VLVBMI2-NEXT: vpor %ymm1, %ymm3, %ymm1
; AVX512VLVBMI2-NEXT: vpmovwb %ymm1, %xmm1
-; AVX512VLVBMI2-NEXT: vptestnmb %xmm3, %xmm2, %k1
+; AVX512VLVBMI2-NEXT: vptestnmb %xmm2, %xmm2, %k1
; AVX512VLVBMI2-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1}
; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VLVBMI2-NEXT: vzeroupper
@@ -1276,14 +1252,13 @@ define <2 x i64> @splatvar_funnnel_v2i64
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vpbroadcastq %xmm2, %xmm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
-; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512F-NEXT: vpsllq %xmm4, %xmm0, %xmm5
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512F-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512F-NEXT: vpsllq %xmm2, %xmm0, %xmm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512F-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512F-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
-; AVX512F-NEXT: vpor %xmm1, %xmm5, %xmm1
-; AVX512F-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512F-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX512F-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512F-NEXT: vmovdqa %xmm1, %xmm0
; AVX512F-NEXT: vzeroupper
@@ -1292,14 +1267,13 @@ define <2 x i64> @splatvar_funnnel_v2i64
; AVX512VL-LABEL: splatvar_funnnel_v2i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastq %xmm2, %xmm2
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
-; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VL-NEXT: vpsllq %xmm4, %xmm0, %xmm5
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VL-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VL-NEXT: vpsllq %xmm2, %xmm0, %xmm3
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VL-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512VL-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
-; AVX512VL-NEXT: vpor %xmm1, %xmm5, %xmm1
-; AVX512VL-NEXT: vptestnmq %xmm3, %xmm2, %k1
+; AVX512VL-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX512VL-NEXT: vptestnmq %xmm2, %xmm2, %k1
; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1}
; AVX512VL-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VL-NEXT: retq
@@ -1308,14 +1282,13 @@ define <2 x i64> @splatvar_funnnel_v2i64
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT: vpbroadcastq %xmm2, %xmm2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
-; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512BW-NEXT: vpsllq %xmm4, %xmm0, %xmm5
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512BW-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512BW-NEXT: vpsllq %xmm2, %xmm0, %xmm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512BW-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512BW-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
-; AVX512BW-NEXT: vpor %xmm1, %xmm5, %xmm1
-; AVX512BW-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX512BW-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0
; AVX512BW-NEXT: vzeroupper
@@ -1325,14 +1298,13 @@ define <2 x i64> @splatvar_funnnel_v2i64
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VBMI2-NEXT: vpbroadcastq %xmm2, %xmm2
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
-; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VBMI2-NEXT: vpsllq %xmm4, %xmm0, %xmm5
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VBMI2-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vpsllq %xmm2, %xmm0, %xmm3
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VBMI2-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512VBMI2-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
-; AVX512VBMI2-NEXT: vpor %xmm1, %xmm5, %xmm1
-; AVX512VBMI2-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX512VBMI2-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VBMI2-NEXT: vzeroupper
@@ -1341,14 +1313,13 @@ define <2 x i64> @splatvar_funnnel_v2i64
; AVX512VLBW-LABEL: splatvar_funnnel_v2i64:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastq %xmm2, %xmm2
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
-; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VLBW-NEXT: vpsllq %xmm4, %xmm0, %xmm5
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VLBW-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpsllq %xmm2, %xmm0, %xmm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VLBW-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpsrlq %xmm4, %xmm1, %xmm1
-; AVX512VLBW-NEXT: vpor %xmm1, %xmm5, %xmm1
-; AVX512VLBW-NEXT: vptestnmq %xmm3, %xmm2, %k1
+; AVX512VLBW-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX512VLBW-NEXT: vptestnmq %xmm2, %xmm2, %k1
; AVX512VLBW-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1}
; AVX512VLBW-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VLBW-NEXT: retq
@@ -1497,15 +1468,15 @@ define <4 x i32> @splatvar_funnnel_v4i32
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vpbroadcastd %xmm2, %xmm2
; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512F-NEXT: vpslld %xmm5, %xmm0, %xmm5
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512F-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512F-NEXT: vpslld %xmm3, %xmm0, %xmm3
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512F-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX512F-NEXT: vpsrld %xmm4, %xmm1, %xmm1
-; AVX512F-NEXT: vpor %xmm1, %xmm5, %xmm1
-; AVX512F-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512F-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512F-NEXT: vmovdqa %xmm1, %xmm0
; AVX512F-NEXT: vzeroupper
@@ -1514,16 +1485,15 @@ define <4 x i32> @splatvar_funnnel_v4i32
; AVX512VL-LABEL: splatvar_funnnel_v4i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastd %xmm2, %xmm2
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512VL-NEXT: vpslld %xmm5, %xmm0, %xmm5
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VL-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2
+; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512VL-NEXT: vpslld %xmm3, %xmm0, %xmm3
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VL-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX512VL-NEXT: vpsrld %xmm4, %xmm1, %xmm1
-; AVX512VL-NEXT: vpor %xmm1, %xmm5, %xmm1
-; AVX512VL-NEXT: vptestnmd %xmm3, %xmm2, %k1
+; AVX512VL-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX512VL-NEXT: vptestnmd %xmm2, %xmm2, %k1
; AVX512VL-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
; AVX512VL-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VL-NEXT: retq
@@ -1533,15 +1503,15 @@ define <4 x i32> @splatvar_funnnel_v4i32
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT: vpbroadcastd %xmm2, %xmm2
; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512BW-NEXT: vpslld %xmm5, %xmm0, %xmm5
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512BW-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512BW-NEXT: vpslld %xmm3, %xmm0, %xmm3
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512BW-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX512BW-NEXT: vpsrld %xmm4, %xmm1, %xmm1
-; AVX512BW-NEXT: vpor %xmm1, %xmm5, %xmm1
-; AVX512BW-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0
; AVX512BW-NEXT: vzeroupper
@@ -1552,15 +1522,15 @@ define <4 x i32> @splatvar_funnnel_v4i32
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VBMI2-NEXT: vpbroadcastd %xmm2, %xmm2
; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512VBMI2-NEXT: vpslld %xmm5, %xmm0, %xmm5
-; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VBMI2-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512VBMI2-NEXT: vpslld %xmm3, %xmm0, %xmm3
+; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VBMI2-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX512VBMI2-NEXT: vpsrld %xmm4, %xmm1, %xmm1
-; AVX512VBMI2-NEXT: vpor %xmm1, %xmm5, %xmm1
-; AVX512VBMI2-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX512VBMI2-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VBMI2-NEXT: vzeroupper
@@ -1569,16 +1539,15 @@ define <4 x i32> @splatvar_funnnel_v4i32
; AVX512VLBW-LABEL: splatvar_funnnel_v4i32:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %xmm2
-; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512VLBW-NEXT: vpslld %xmm5, %xmm0, %xmm5
-; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VLBW-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512VLBW-NEXT: vpslld %xmm3, %xmm0, %xmm3
+; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VLBW-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX512VLBW-NEXT: vpsrld %xmm4, %xmm1, %xmm1
-; AVX512VLBW-NEXT: vpor %xmm1, %xmm5, %xmm1
-; AVX512VLBW-NEXT: vptestnmd %xmm3, %xmm2, %k1
+; AVX512VLBW-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX512VLBW-NEXT: vptestnmd %xmm2, %xmm2, %k1
; AVX512VLBW-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1}
; AVX512VLBW-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VLBW-NEXT: retq
@@ -1759,16 +1728,15 @@ define <8 x i16> @splatvar_funnnel_v8i16
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT: vpbroadcastw %xmm2, %xmm2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512BW-NEXT: vpsllw %xmm5, %xmm0, %xmm5
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512BW-NEXT: vpsubw %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512BW-NEXT: vpsllw %xmm3, %xmm0, %xmm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT: vpsubw %xmm2, %xmm4, %xmm4
; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
; AVX512BW-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
-; AVX512BW-NEXT: vpor %xmm1, %xmm5, %xmm1
-; AVX512BW-NEXT: vptestnmw %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0
; AVX512BW-NEXT: vzeroupper
@@ -1778,16 +1746,15 @@ define <8 x i16> @splatvar_funnnel_v8i16
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VBMI2-NEXT: vpbroadcastw %xmm2, %xmm2
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
-; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512VBMI2-NEXT: vpsllw %xmm5, %xmm0, %xmm5
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512VBMI2-NEXT: vpsubw %xmm4, %xmm6, %xmm4
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512VBMI2-NEXT: vpsllw %xmm3, %xmm0, %xmm3
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512VBMI2-NEXT: vpsubw %xmm2, %xmm4, %xmm4
; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
; AVX512VBMI2-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
-; AVX512VBMI2-NEXT: vpor %xmm1, %xmm5, %xmm1
-; AVX512VBMI2-NEXT: vptestnmw %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX512VBMI2-NEXT: vptestnmw %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VBMI2-NEXT: vzeroupper
@@ -1796,16 +1763,15 @@ define <8 x i16> @splatvar_funnnel_v8i16
; AVX512VLBW-LABEL: splatvar_funnnel_v8i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %xmm2
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
-; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512VLBW-NEXT: vpsllw %xmm5, %xmm0, %xmm5
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512VLBW-NEXT: vpsubw %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512VLBW-NEXT: vpsllw %xmm3, %xmm0, %xmm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT: vpsubw %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
; AVX512VLBW-NEXT: vpsrlw %xmm4, %xmm1, %xmm1
-; AVX512VLBW-NEXT: vpor %xmm1, %xmm5, %xmm1
-; AVX512VLBW-NEXT: vptestnmw %xmm3, %xmm2, %k1
+; AVX512VLBW-NEXT: vpor %xmm1, %xmm3, %xmm1
+; AVX512VLBW-NEXT: vptestnmw %xmm2, %xmm2, %k1
; AVX512VLBW-NEXT: vmovdqu16 %xmm0, %xmm1 {%k1}
; AVX512VLBW-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VLBW-NEXT: retq
@@ -2031,18 +1997,17 @@ define <16 x i8> @splatvar_funnnel_v16i8
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT: vpbroadcastb %xmm2, %xmm2
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512BW-NEXT: vpand %xmm4, %xmm2, %xmm5
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
-; AVX512BW-NEXT: vpsllvw %zmm6, %zmm3, %zmm3
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512BW-NEXT: vpsllvw %zmm4, %zmm3, %zmm3
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT: vpsubb %xmm5, %xmm6, %xmm5
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
-; AVX512BW-NEXT: vpsrlvw %zmm5, %zmm1, %zmm1
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
+; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
; AVX512BW-NEXT: vpor %ymm1, %ymm3, %ymm1
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
-; AVX512BW-NEXT: vptestnmb %zmm4, %zmm2, %k1
+; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa %xmm1, %xmm0
; AVX512BW-NEXT: vzeroupper
@@ -2053,18 +2018,17 @@ define <16 x i8> @splatvar_funnnel_v16i8
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %xmm2
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VBMI2-NEXT: vpand %xmm4, %xmm2, %xmm5
-; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
-; AVX512VBMI2-NEXT: vpsllvw %zmm6, %zmm3, %zmm3
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm3, %zmm3
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VBMI2-NEXT: vpsubb %xmm5, %xmm6, %xmm5
-; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
-; AVX512VBMI2-NEXT: vpsrlvw %zmm5, %zmm1, %zmm1
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
+; AVX512VBMI2-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
; AVX512VBMI2-NEXT: vpor %ymm1, %ymm3, %ymm1
; AVX512VBMI2-NEXT: vpmovwb %zmm1, %ymm1
-; AVX512VBMI2-NEXT: vptestnmb %zmm4, %zmm2, %k1
+; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VBMI2-NEXT: vzeroupper
@@ -2073,19 +2037,18 @@ define <16 x i8> @splatvar_funnnel_v16i8
; AVX512VLBW-LABEL: splatvar_funnnel_v16i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %xmm2
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
-; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512VLBW-NEXT: vpsllvw %ymm5, %ymm6, %ymm5
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLBW-NEXT: vpsubb %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512VLBW-NEXT: vpsllvw %ymm4, %ymm3, %ymm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512VLBW-NEXT: vpsrlvw %ymm4, %ymm1, %ymm1
-; AVX512VLBW-NEXT: vpor %ymm1, %ymm5, %ymm1
+; AVX512VLBW-NEXT: vpor %ymm1, %ymm3, %ymm1
; AVX512VLBW-NEXT: vpmovwb %ymm1, %xmm1
-; AVX512VLBW-NEXT: vptestnmb %xmm3, %xmm2, %k1
+; AVX512VLBW-NEXT: vptestnmb %xmm2, %xmm2, %k1
; AVX512VLBW-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1}
; AVX512VLBW-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VLBW-NEXT: vzeroupper
@@ -2094,19 +2057,18 @@ define <16 x i8> @splatvar_funnnel_v16i8
; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i8:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %xmm2
-; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLVBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
-; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512VLVBMI2-NEXT: vpsllvw %ymm5, %ymm6, %ymm5
-; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLVBMI2-NEXT: vpsubb %xmm4, %xmm6, %xmm4
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512VLVBMI2-NEXT: vpsllvw %ymm4, %ymm3, %ymm3
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512VLVBMI2-NEXT: vpsrlvw %ymm4, %ymm1, %ymm1
-; AVX512VLVBMI2-NEXT: vpor %ymm1, %ymm5, %ymm1
+; AVX512VLVBMI2-NEXT: vpor %ymm1, %ymm3, %ymm1
; AVX512VLVBMI2-NEXT: vpmovwb %ymm1, %xmm1
-; AVX512VLVBMI2-NEXT: vptestnmb %xmm3, %xmm2, %k1
+; AVX512VLVBMI2-NEXT: vptestnmb %xmm2, %xmm2, %k1
; AVX512VLVBMI2-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1}
; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0
; AVX512VLVBMI2-NEXT: vzeroupper
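
For illustration only (not a test from this commit): in the funnel-shift
hunks above the shift amount is now masked in place and vptestnm then tests
the masked value against itself, since testnm(x, x) is x == 0 per element.
A reduced .ll shape for the zero-amount select would be:

  define <2 x i64> @select_on_zero_amt(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) {
    %m = and <2 x i64> %amt, <i64 63, i64 63>   ; materialized vpand
    %z = icmp eq <2 x i64> %m, zeroinitializer  ; vptestnmq %m, %m -> k1
    %r = select <2 x i1> %z, <2 x i64> %x, <2 x i64> %y
    ret <2 x i64> %r
  }
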
Modified: llvm/trunk/test/CodeGen/X86/vector-fshl-256.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-fshl-256.ll?rev=358359&r1=358358&r2=358359&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-fshl-256.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-fshl-256.ll Sun Apr 14 11:26:11 2019
@@ -71,76 +71,71 @@ define <4 x i64> @var_funnnel_v4i64(<4 x
;
; AVX512F-LABEL: var_funnnel_v4i64:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512F-NEXT: vpsllvq %ymm4, %ymm0, %ymm5
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm6 = [64,64,64,64]
-; AVX512F-NEXT: vpsubq %ymm4, %ymm6, %ymm4
+; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpsllvq %ymm2, %ymm0, %ymm3
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm4 = [64,64,64,64]
+; AVX512F-NEXT: vpsubq %ymm2, %ymm4, %ymm4
; AVX512F-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm1, %ymm5, %ymm1
-; AVX512F-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512F-NEXT: vmovdqa %ymm1, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v4i64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VL-NEXT: vpsllvq %ymm4, %ymm0, %ymm5
-; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm6 = [64,64,64,64]
-; AVX512VL-NEXT: vpsubq %ymm4, %ymm6, %ymm4
+; AVX512VL-NEXT: vpandq {{.*}}(%rip){1to4}, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsllvq %ymm2, %ymm0, %ymm3
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm4 = [64,64,64,64]
+; AVX512VL-NEXT: vpsubq %ymm2, %ymm4, %ymm4
; AVX512VL-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1
-; AVX512VL-NEXT: vpor %ymm1, %ymm5, %ymm1
-; AVX512VL-NEXT: vptestnmq %ymm3, %ymm2, %k1
+; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512VL-NEXT: vptestnmq %ymm2, %ymm2, %k1
; AVX512VL-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1}
; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v4i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512BW-NEXT: vpsllvq %ymm4, %ymm0, %ymm5
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm6 = [64,64,64,64]
-; AVX512BW-NEXT: vpsubq %ymm4, %ymm6, %ymm4
+; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512BW-NEXT: vpsllvq %ymm2, %ymm0, %ymm3
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm4 = [64,64,64,64]
+; AVX512BW-NEXT: vpsubq %ymm2, %ymm4, %ymm4
; AVX512BW-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1
-; AVX512BW-NEXT: vpor %ymm1, %ymm5, %ymm1
-; AVX512BW-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512BW-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa %ymm1, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_funnnel_v4i64:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512VBMI2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VBMI2-NEXT: vpsllvq %ymm4, %ymm0, %ymm5
-; AVX512VBMI2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [64,64,64,64]
-; AVX512VBMI2-NEXT: vpsubq %ymm4, %ymm6, %ymm4
+; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512VBMI2-NEXT: vpsllvq %ymm2, %ymm0, %ymm3
+; AVX512VBMI2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [64,64,64,64]
+; AVX512VBMI2-NEXT: vpsubq %ymm2, %ymm4, %ymm4
; AVX512VBMI2-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1
-; AVX512VBMI2-NEXT: vpor %ymm1, %ymm5, %ymm1
-; AVX512VBMI2-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512VBMI2-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512VBMI2-NEXT: vmovdqa %ymm1, %ymm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v4i64:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VLBW-NEXT: vpsllvq %ymm4, %ymm0, %ymm5
-; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} ymm6 = [64,64,64,64]
-; AVX512VLBW-NEXT: vpsubq %ymm4, %ymm6, %ymm4
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip){1to4}, %ymm2, %ymm2
+; AVX512VLBW-NEXT: vpsllvq %ymm2, %ymm0, %ymm3
+; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} ymm4 = [64,64,64,64]
+; AVX512VLBW-NEXT: vpsubq %ymm2, %ymm4, %ymm4
; AVX512VLBW-NEXT: vpsrlvq %ymm4, %ymm1, %ymm1
-; AVX512VLBW-NEXT: vpor %ymm1, %ymm5, %ymm1
-; AVX512VLBW-NEXT: vptestnmq %ymm3, %ymm2, %k1
+; AVX512VLBW-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512VLBW-NEXT: vptestnmq %ymm2, %ymm2, %k1
; AVX512VLBW-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1}
; AVX512VLBW-NEXT: vmovdqa %ymm1, %ymm0
; AVX512VLBW-NEXT: retq
@@ -257,76 +252,71 @@ define <8 x i32> @var_funnnel_v8i32(<8 x
;
; AVX512F-LABEL: var_funnnel_v8i32:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512F-NEXT: vpsllvd %ymm4, %ymm0, %ymm5
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm6 = [32,32,32,32,32,32,32,32]
-; AVX512F-NEXT: vpsubd %ymm4, %ymm6, %ymm4
+; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpsllvd %ymm2, %ymm0, %ymm3
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm4 = [32,32,32,32,32,32,32,32]
+; AVX512F-NEXT: vpsubd %ymm2, %ymm4, %ymm4
; AVX512F-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm1, %ymm5, %ymm1
-; AVX512F-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512F-NEXT: vmovdqa %ymm1, %ymm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v8i32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VL-NEXT: vpsllvd %ymm4, %ymm0, %ymm5
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm6 = [32,32,32,32,32,32,32,32]
-; AVX512VL-NEXT: vpsubd %ymm4, %ymm6, %ymm4
+; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to8}, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsllvd %ymm2, %ymm0, %ymm3
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm4 = [32,32,32,32,32,32,32,32]
+; AVX512VL-NEXT: vpsubd %ymm2, %ymm4, %ymm4
; AVX512VL-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1
-; AVX512VL-NEXT: vpor %ymm1, %ymm5, %ymm1
-; AVX512VL-NEXT: vptestnmd %ymm3, %ymm2, %k1
+; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512VL-NEXT: vptestnmd %ymm2, %ymm2, %k1
; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v8i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512BW-NEXT: vpsllvd %ymm4, %ymm0, %ymm5
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm6 = [32,32,32,32,32,32,32,32]
-; AVX512BW-NEXT: vpsubd %ymm4, %ymm6, %ymm4
+; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512BW-NEXT: vpsllvd %ymm2, %ymm0, %ymm3
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm4 = [32,32,32,32,32,32,32,32]
+; AVX512BW-NEXT: vpsubd %ymm2, %ymm4, %ymm4
; AVX512BW-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1
-; AVX512BW-NEXT: vpor %ymm1, %ymm5, %ymm1
-; AVX512BW-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa %ymm1, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_funnnel_v8i32:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VBMI2-NEXT: vpsllvd %ymm4, %ymm0, %ymm5
-; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} ymm6 = [32,32,32,32,32,32,32,32]
-; AVX512VBMI2-NEXT: vpsubd %ymm4, %ymm6, %ymm4
+; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512VBMI2-NEXT: vpsllvd %ymm2, %ymm0, %ymm3
+; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [32,32,32,32,32,32,32,32]
+; AVX512VBMI2-NEXT: vpsubd %ymm2, %ymm4, %ymm4
; AVX512VBMI2-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1
-; AVX512VBMI2-NEXT: vpor %ymm1, %ymm5, %ymm1
-; AVX512VBMI2-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512VBMI2-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512VBMI2-NEXT: vmovdqa %ymm1, %ymm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v8i32:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VLBW-NEXT: vpsllvd %ymm4, %ymm0, %ymm5
-; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} ymm6 = [32,32,32,32,32,32,32,32]
-; AVX512VLBW-NEXT: vpsubd %ymm4, %ymm6, %ymm4
+; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to8}, %ymm2, %ymm2
+; AVX512VLBW-NEXT: vpsllvd %ymm2, %ymm0, %ymm3
+; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} ymm4 = [32,32,32,32,32,32,32,32]
+; AVX512VLBW-NEXT: vpsubd %ymm2, %ymm4, %ymm4
; AVX512VLBW-NEXT: vpsrlvd %ymm4, %ymm1, %ymm1
-; AVX512VLBW-NEXT: vpor %ymm1, %ymm5, %ymm1
-; AVX512VLBW-NEXT: vptestnmd %ymm3, %ymm2, %k1
+; AVX512VLBW-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512VLBW-NEXT: vptestnmd %ymm2, %ymm2, %k1
; AVX512VLBW-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
; AVX512VLBW-NEXT: vmovdqa %ymm1, %ymm0
; AVX512VLBW-NEXT: retq
@@ -514,48 +504,43 @@ define <16 x i16> @var_funnnel_v16i16(<1
;
; AVX512BW-LABEL: var_funnnel_v16i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512BW-NEXT: vpsllvw %zmm4, %zmm0, %zmm5
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512BW-NEXT: vpsubw %ymm4, %ymm6, %ymm4
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT: vpsubw %ymm2, %ymm4, %ymm4
; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
-; AVX512BW-NEXT: vpor %ymm1, %ymm5, %ymm1
-; AVX512BW-NEXT: vptestnmw %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa %ymm1, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_funnnel_v16i16:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm0, %zmm5
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VBMI2-NEXT: vpsubw %ymm4, %ymm6, %ymm4
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VBMI2-NEXT: vpsllvw %zmm2, %zmm0, %zmm3
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VBMI2-NEXT: vpsubw %ymm2, %ymm4, %ymm4
; AVX512VBMI2-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
-; AVX512VBMI2-NEXT: vpor %ymm1, %ymm5, %ymm1
-; AVX512VBMI2-NEXT: vptestnmw %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512VBMI2-NEXT: vptestnmw %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
; AVX512VBMI2-NEXT: vmovdqa %ymm1, %ymm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v16i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VLBW-NEXT: vpsllvw %ymm4, %ymm0, %ymm5
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VLBW-NEXT: vpsubw %ymm4, %ymm6, %ymm4
+; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VLBW-NEXT: vpsllvw %ymm2, %ymm0, %ymm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT: vpsubw %ymm2, %ymm4, %ymm4
; AVX512VLBW-NEXT: vpsrlvw %ymm4, %ymm1, %ymm1
-; AVX512VLBW-NEXT: vpor %ymm1, %ymm5, %ymm1
-; AVX512VLBW-NEXT: vptestnmw %ymm3, %ymm2, %k1
+; AVX512VLBW-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512VLBW-NEXT: vptestnmw %ymm2, %ymm2, %k1
; AVX512VLBW-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1}
; AVX512VLBW-NEXT: vmovdqa %ymm1, %ymm0
; AVX512VLBW-NEXT: retq
@@ -792,80 +777,74 @@ define <32 x i8> @var_funnnel_v32i8(<32
;
; AVX512BW-LABEL: var_funnnel_v32i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm6 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512BW-NEXT: vpsllvw %zmm5, %zmm6, %zmm5
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT: vpsubb %ymm4, %ymm6, %ymm4
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512BW-NEXT: vpsllvw %zmm4, %zmm3, %zmm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT: vpsubb %ymm2, %ymm4, %ymm4
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
-; AVX512BW-NEXT: vporq %zmm1, %zmm5, %zmm1
+; AVX512BW-NEXT: vporq %zmm1, %zmm3, %zmm1
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
-; AVX512BW-NEXT: vptestnmb %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa %ymm1, %ymm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_funnnel_v32i8:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
-; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm6 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512VBMI2-NEXT: vpsllvw %zmm5, %zmm6, %zmm5
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VBMI2-NEXT: vpsubb %ymm4, %ymm6, %ymm4
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm3, %zmm3
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VBMI2-NEXT: vpsubb %ymm2, %ymm4, %ymm4
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512VBMI2-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
-; AVX512VBMI2-NEXT: vporq %zmm1, %zmm5, %zmm1
+; AVX512VBMI2-NEXT: vporq %zmm1, %zmm3, %zmm1
; AVX512VBMI2-NEXT: vpmovwb %zmm1, %ymm1
-; AVX512VBMI2-NEXT: vptestnmb %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
; AVX512VBMI2-NEXT: vmovdqa %ymm1, %ymm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v32i8:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
-; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm6 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512VLBW-NEXT: vpsllvw %zmm5, %zmm6, %zmm5
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLBW-NEXT: vpsubb %ymm4, %ymm6, %ymm4
+; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512VLBW-NEXT: vpsllvw %zmm4, %zmm3, %zmm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLBW-NEXT: vpsubb %ymm2, %ymm4, %ymm4
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512VLBW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
-; AVX512VLBW-NEXT: vporq %zmm1, %zmm5, %zmm1
+; AVX512VLBW-NEXT: vporq %zmm1, %zmm3, %zmm1
; AVX512VLBW-NEXT: vpmovwb %zmm1, %ymm1
-; AVX512VLBW-NEXT: vptestnmb %ymm3, %ymm2, %k1
+; AVX512VLBW-NEXT: vptestnmb %ymm2, %ymm2, %k1
; AVX512VLBW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
; AVX512VLBW-NEXT: vmovdqa %ymm1, %ymm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: var_funnnel_v32i8:
; AVX512VLVBMI2: # %bb.0:
-; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLVBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
-; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm6 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512VLVBMI2-NEXT: vpsllvw %zmm5, %zmm6, %zmm5
-; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLVBMI2-NEXT: vpsubb %ymm4, %ymm6, %ymm4
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512VLVBMI2-NEXT: vpsllvw %zmm4, %zmm3, %zmm3
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLVBMI2-NEXT: vpsubb %ymm2, %ymm4, %ymm4
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512VLVBMI2-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
-; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm5, %zmm1
+; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm3, %zmm1
; AVX512VLVBMI2-NEXT: vpmovwb %zmm1, %ymm1
-; AVX512VLVBMI2-NEXT: vptestnmb %ymm3, %ymm2, %k1
+; AVX512VLVBMI2-NEXT: vptestnmb %ymm2, %ymm2, %k1
; AVX512VLVBMI2-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
; AVX512VLVBMI2-NEXT: vmovdqa %ymm1, %ymm0
; AVX512VLVBMI2-NEXT: retq
@@ -970,13 +949,13 @@ define <4 x i64> @splatvar_funnnel_v4i64
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vpbroadcastq %xmm2, %ymm2
; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512F-NEXT: vpsllq %xmm4, %ymm0, %ymm5
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512F-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpsllq %xmm2, %ymm0, %ymm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512F-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512F-NEXT: vpsrlq %xmm4, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm1, %ymm5, %ymm1
-; AVX512F-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512F-NEXT: vmovdqa %ymm1, %ymm0
; AVX512F-NEXT: retq
@@ -984,14 +963,13 @@ define <4 x i64> @splatvar_funnnel_v4i64
; AVX512VL-LABEL: splatvar_funnnel_v4i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastq %xmm2, %ymm2
-; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VL-NEXT: vpsllq %xmm4, %ymm0, %ymm5
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VL-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VL-NEXT: vpandq {{.*}}(%rip){1to4}, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsllq %xmm2, %ymm0, %ymm3
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VL-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512VL-NEXT: vpsrlq %xmm4, %ymm1, %ymm1
-; AVX512VL-NEXT: vpor %ymm1, %ymm5, %ymm1
-; AVX512VL-NEXT: vptestnmq %ymm3, %ymm2, %k1
+; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512VL-NEXT: vptestnmq %ymm2, %ymm2, %k1
; AVX512VL-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1}
; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0
; AVX512VL-NEXT: retq
@@ -1001,13 +979,13 @@ define <4 x i64> @splatvar_funnnel_v4i64
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpbroadcastq %xmm2, %ymm2
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512BW-NEXT: vpsllq %xmm4, %ymm0, %ymm5
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512BW-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512BW-NEXT: vpsllq %xmm2, %ymm0, %ymm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512BW-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512BW-NEXT: vpsrlq %xmm4, %ymm1, %ymm1
-; AVX512BW-NEXT: vpor %ymm1, %ymm5, %ymm1
-; AVX512BW-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512BW-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa %ymm1, %ymm0
; AVX512BW-NEXT: retq
@@ -1017,13 +995,13 @@ define <4 x i64> @splatvar_funnnel_v4i64
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512VBMI2-NEXT: vpbroadcastq %xmm2, %ymm2
; AVX512VBMI2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VBMI2-NEXT: vpsllq %xmm4, %ymm0, %ymm5
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VBMI2-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512VBMI2-NEXT: vpsllq %xmm2, %ymm0, %ymm3
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VBMI2-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512VBMI2-NEXT: vpsrlq %xmm4, %ymm1, %ymm1
-; AVX512VBMI2-NEXT: vpor %ymm1, %ymm5, %ymm1
-; AVX512VBMI2-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512VBMI2-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512VBMI2-NEXT: vmovdqa %ymm1, %ymm0
; AVX512VBMI2-NEXT: retq
@@ -1031,14 +1009,13 @@ define <4 x i64> @splatvar_funnnel_v4i64
; AVX512VLBW-LABEL: splatvar_funnnel_v4i64:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastq %xmm2, %ymm2
-; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VLBW-NEXT: vpsllq %xmm4, %ymm0, %ymm5
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VLBW-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip){1to4}, %ymm2, %ymm2
+; AVX512VLBW-NEXT: vpsllq %xmm2, %ymm0, %ymm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VLBW-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpsrlq %xmm4, %ymm1, %ymm1
-; AVX512VLBW-NEXT: vpor %ymm1, %ymm5, %ymm1
-; AVX512VLBW-NEXT: vptestnmq %ymm3, %ymm2, %k1
+; AVX512VLBW-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512VLBW-NEXT: vptestnmq %ymm2, %ymm2, %k1
; AVX512VLBW-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1}
; AVX512VLBW-NEXT: vmovdqa %ymm1, %ymm0
; AVX512VLBW-NEXT: retq
@@ -1144,15 +1121,15 @@ define <8 x i32> @splatvar_funnnel_v8i32
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vpbroadcastd %xmm2, %ymm2
; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512F-NEXT: vpslld %xmm5, %ymm0, %ymm5
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512F-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512F-NEXT: vpslld %xmm3, %ymm0, %ymm3
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512F-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX512F-NEXT: vpsrld %xmm4, %ymm1, %ymm1
-; AVX512F-NEXT: vpor %ymm1, %ymm5, %ymm1
-; AVX512F-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512F-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512F-NEXT: vmovdqa %ymm1, %ymm0
; AVX512F-NEXT: retq
@@ -1160,16 +1137,15 @@ define <8 x i32> @splatvar_funnnel_v8i32
; AVX512VL-LABEL: splatvar_funnnel_v8i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastd %xmm2, %ymm2
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512VL-NEXT: vpslld %xmm5, %ymm0, %ymm5
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VL-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to8}, %ymm2, %ymm2
+; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512VL-NEXT: vpslld %xmm3, %ymm0, %ymm3
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VL-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX512VL-NEXT: vpsrld %xmm4, %ymm1, %ymm1
-; AVX512VL-NEXT: vpor %ymm1, %ymm5, %ymm1
-; AVX512VL-NEXT: vptestnmd %ymm3, %ymm2, %k1
+; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512VL-NEXT: vptestnmd %ymm2, %ymm2, %k1
; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0
; AVX512VL-NEXT: retq
@@ -1179,15 +1155,15 @@ define <8 x i32> @splatvar_funnnel_v8i32
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpbroadcastd %xmm2, %ymm2
; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512BW-NEXT: vpslld %xmm5, %ymm0, %ymm5
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512BW-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512BW-NEXT: vpslld %xmm3, %ymm0, %ymm3
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512BW-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX512BW-NEXT: vpsrld %xmm4, %ymm1, %ymm1
-; AVX512BW-NEXT: vpor %ymm1, %ymm5, %ymm1
-; AVX512BW-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa %ymm1, %ymm0
; AVX512BW-NEXT: retq
@@ -1197,15 +1173,15 @@ define <8 x i32> @splatvar_funnnel_v8i32
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512VBMI2-NEXT: vpbroadcastd %xmm2, %ymm2
; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512VBMI2-NEXT: vpslld %xmm5, %ymm0, %ymm5
-; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VBMI2-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512VBMI2-NEXT: vpslld %xmm3, %ymm0, %ymm3
+; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VBMI2-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX512VBMI2-NEXT: vpsrld %xmm4, %ymm1, %ymm1
-; AVX512VBMI2-NEXT: vpor %ymm1, %ymm5, %ymm1
-; AVX512VBMI2-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512VBMI2-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512VBMI2-NEXT: vmovdqa %ymm1, %ymm0
; AVX512VBMI2-NEXT: retq
@@ -1213,16 +1189,15 @@ define <8 x i32> @splatvar_funnnel_v8i32
; AVX512VLBW-LABEL: splatvar_funnnel_v8i32:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %ymm2
-; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512VLBW-NEXT: vpslld %xmm5, %ymm0, %ymm5
-; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VLBW-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to8}, %ymm2, %ymm2
+; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512VLBW-NEXT: vpslld %xmm3, %ymm0, %ymm3
+; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VLBW-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX512VLBW-NEXT: vpsrld %xmm4, %ymm1, %ymm1
-; AVX512VLBW-NEXT: vpor %ymm1, %ymm5, %ymm1
-; AVX512VLBW-NEXT: vptestnmd %ymm3, %ymm2, %k1
+; AVX512VLBW-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512VLBW-NEXT: vptestnmd %ymm2, %ymm2, %k1
; AVX512VLBW-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1}
; AVX512VLBW-NEXT: vmovdqa %ymm1, %ymm0
; AVX512VLBW-NEXT: retq
@@ -1366,16 +1341,15 @@ define <16 x i16> @splatvar_funnnel_v16i
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpbroadcastw %xmm2, %ymm2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512BW-NEXT: vpsllw %xmm5, %ymm0, %ymm5
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512BW-NEXT: vpsubw %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512BW-NEXT: vpsllw %xmm3, %ymm0, %ymm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT: vpsubw %xmm2, %xmm4, %xmm4
; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
; AVX512BW-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
-; AVX512BW-NEXT: vpor %ymm1, %ymm5, %ymm1
-; AVX512BW-NEXT: vptestnmw %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa %ymm1, %ymm0
; AVX512BW-NEXT: retq
@@ -1384,16 +1358,15 @@ define <16 x i16> @splatvar_funnnel_v16i
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512VBMI2-NEXT: vpbroadcastw %xmm2, %ymm2
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512VBMI2-NEXT: vpsllw %xmm5, %ymm0, %ymm5
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512VBMI2-NEXT: vpsubw %xmm4, %xmm6, %xmm4
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512VBMI2-NEXT: vpsllw %xmm3, %ymm0, %ymm3
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512VBMI2-NEXT: vpsubw %xmm2, %xmm4, %xmm4
; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
; AVX512VBMI2-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
-; AVX512VBMI2-NEXT: vpor %ymm1, %ymm5, %ymm1
-; AVX512VBMI2-NEXT: vptestnmw %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512VBMI2-NEXT: vptestnmw %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
; AVX512VBMI2-NEXT: vmovdqa %ymm1, %ymm0
; AVX512VBMI2-NEXT: retq
@@ -1401,16 +1374,15 @@ define <16 x i16> @splatvar_funnnel_v16i
; AVX512VLBW-LABEL: splatvar_funnnel_v16i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %ymm2
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512VLBW-NEXT: vpsllw %xmm5, %ymm0, %ymm5
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512VLBW-NEXT: vpsubw %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512VLBW-NEXT: vpsllw %xmm3, %ymm0, %ymm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT: vpsubw %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
; AVX512VLBW-NEXT: vpsrlw %xmm4, %ymm1, %ymm1
-; AVX512VLBW-NEXT: vpor %ymm1, %ymm5, %ymm1
-; AVX512VLBW-NEXT: vptestnmw %ymm3, %ymm2, %k1
+; AVX512VLBW-NEXT: vpor %ymm1, %ymm3, %ymm1
+; AVX512VLBW-NEXT: vptestnmw %ymm2, %ymm2, %k1
; AVX512VLBW-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1}
; AVX512VLBW-NEXT: vmovdqa %ymm1, %ymm0
; AVX512VLBW-NEXT: retq
@@ -1589,19 +1561,18 @@ define <32 x i8> @splatvar_funnnel_v32i8
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT: vpbroadcastb %xmm2, %ymm2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm6 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512BW-NEXT: vpsllvw %zmm5, %zmm6, %zmm5
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT: vpsubb %ymm4, %ymm6, %ymm4
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512BW-NEXT: vpsllvw %zmm4, %zmm3, %zmm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT: vpsubb %ymm2, %ymm4, %ymm4
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
-; AVX512BW-NEXT: vporq %zmm1, %zmm5, %zmm1
+; AVX512BW-NEXT: vporq %zmm1, %zmm3, %zmm1
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
-; AVX512BW-NEXT: vptestnmb %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa %ymm1, %ymm0
; AVX512BW-NEXT: retq
@@ -1610,19 +1581,18 @@ define <32 x i8> @splatvar_funnnel_v32i8
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %ymm2
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
-; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm6 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512VBMI2-NEXT: vpsllvw %zmm5, %zmm6, %zmm5
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VBMI2-NEXT: vpsubb %ymm4, %ymm6, %ymm4
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm3, %zmm3
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VBMI2-NEXT: vpsubb %ymm2, %ymm4, %ymm4
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512VBMI2-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
-; AVX512VBMI2-NEXT: vporq %zmm1, %zmm5, %zmm1
+; AVX512VBMI2-NEXT: vporq %zmm1, %zmm3, %zmm1
; AVX512VBMI2-NEXT: vpmovwb %zmm1, %ymm1
-; AVX512VBMI2-NEXT: vptestnmb %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
; AVX512VBMI2-NEXT: vmovdqa %ymm1, %ymm0
; AVX512VBMI2-NEXT: retq
@@ -1630,19 +1600,18 @@ define <32 x i8> @splatvar_funnnel_v32i8
; AVX512VLBW-LABEL: splatvar_funnnel_v32i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %ymm2
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
-; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm6 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512VLBW-NEXT: vpsllvw %zmm5, %zmm6, %zmm5
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLBW-NEXT: vpsubb %ymm4, %ymm6, %ymm4
+; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512VLBW-NEXT: vpsllvw %zmm4, %zmm3, %zmm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLBW-NEXT: vpsubb %ymm2, %ymm4, %ymm4
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512VLBW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
-; AVX512VLBW-NEXT: vporq %zmm1, %zmm5, %zmm1
+; AVX512VLBW-NEXT: vporq %zmm1, %zmm3, %zmm1
; AVX512VLBW-NEXT: vpmovwb %zmm1, %ymm1
-; AVX512VLBW-NEXT: vptestnmb %ymm3, %ymm2, %k1
+; AVX512VLBW-NEXT: vptestnmb %ymm2, %ymm2, %k1
; AVX512VLBW-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
; AVX512VLBW-NEXT: vmovdqa %ymm1, %ymm0
; AVX512VLBW-NEXT: retq
@@ -1650,19 +1619,18 @@ define <32 x i8> @splatvar_funnnel_v32i8
; AVX512VLVBMI2-LABEL: splatvar_funnnel_v32i8:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %ymm2
-; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLVBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
-; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm6 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512VLVBMI2-NEXT: vpsllvw %zmm5, %zmm6, %zmm5
-; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLVBMI2-NEXT: vpsubb %ymm4, %ymm6, %ymm4
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512VLVBMI2-NEXT: vpsllvw %zmm4, %zmm3, %zmm3
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLVBMI2-NEXT: vpsubb %ymm2, %ymm4, %ymm4
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512VLVBMI2-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
-; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm5, %zmm1
+; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm3, %zmm1
; AVX512VLVBMI2-NEXT: vpmovwb %zmm1, %ymm1
-; AVX512VLVBMI2-NEXT: vptestnmb %ymm3, %ymm2, %k1
+; AVX512VLVBMI2-NEXT: vptestnmb %ymm2, %ymm2, %k1
; AVX512VLVBMI2-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
; AVX512VLVBMI2-NEXT: vmovdqa %ymm1, %ymm0
; AVX512VLVBMI2-NEXT: retq
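
(Aside, not part of the patch: a minimal hand-written IR sketch of the shape these funnel-shift tests reduce to. The shift amount is masked to the element width, and that same masked value is compared against zero; that is why the updated assembly above reuses one register for both the vpand result and both vptestnm operands. The function and value names below are hypothetical, not taken from the test files.)

define <4 x i64> @vptestnm_sketch(<4 x i64> %x, <4 x i64> %amt) {
  ; Mask the per-element shift amount to [0,63].
  %m = and <4 x i64> %amt, <i64 63, i64 63, i64 63, i64 63>
  ; Equality-with-zero on an AND result is the pattern that
  ; selects to vptestnmq (testn: mask bit set when AND is zero).
  %z = icmp eq <4 x i64> %m, zeroinitializer
  ; A masked select keyed on the compare, as in the tests' amt==0 handling.
  %r = select <4 x i1> %z, <4 x i64> %x, <4 x i64> zeroinitializer
  ret <4 x i64> %r
}
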
Modified: llvm/trunk/test/CodeGen/X86/vector-fshl-512.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-fshl-512.ll?rev=358359&r1=358358&r2=358359&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-fshl-512.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-fshl-512.ll Sun Apr 14 11:26:11 2019
@@ -18,42 +18,39 @@ declare <64 x i8> @llvm.fshl.v64i8(<64 x
define <8 x i64> @var_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt) nounwind {
; AVX512F-LABEL: var_funnnel_v8i64:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
-; AVX512F-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512F-NEXT: vpsllvq %zmm4, %zmm0, %zmm5
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm6 = [64,64,64,64,64,64,64,64]
-; AVX512F-NEXT: vpsubq %zmm4, %zmm6, %zmm4
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllvq %zmm2, %zmm0, %zmm3
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm4 = [64,64,64,64,64,64,64,64]
+; AVX512F-NEXT: vpsubq %zmm2, %zmm4, %zmm4
; AVX512F-NEXT: vpsrlvq %zmm4, %zmm1, %zmm1
-; AVX512F-NEXT: vporq %zmm1, %zmm5, %zmm1
-; AVX512F-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512F-NEXT: vporq %zmm1, %zmm3, %zmm1
+; AVX512F-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v8i64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
-; AVX512VL-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512VL-NEXT: vpsllvq %zmm4, %zmm0, %zmm5
-; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm6 = [64,64,64,64,64,64,64,64]
-; AVX512VL-NEXT: vpsubq %zmm4, %zmm6, %zmm4
+; AVX512VL-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512VL-NEXT: vpsllvq %zmm2, %zmm0, %zmm3
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm4 = [64,64,64,64,64,64,64,64]
+; AVX512VL-NEXT: vpsubq %zmm2, %zmm4, %zmm4
; AVX512VL-NEXT: vpsrlvq %zmm4, %zmm1, %zmm1
-; AVX512VL-NEXT: vporq %zmm1, %zmm5, %zmm1
-; AVX512VL-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512VL-NEXT: vporq %zmm1, %zmm3, %zmm1
+; AVX512VL-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512VL-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512VL-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
-; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512BW-NEXT: vpsllvq %zmm4, %zmm0, %zmm5
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm6 = [64,64,64,64,64,64,64,64]
-; AVX512BW-NEXT: vpsubq %zmm4, %zmm6, %zmm4
+; AVX512BW-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512BW-NEXT: vpsllvq %zmm2, %zmm0, %zmm3
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm4 = [64,64,64,64,64,64,64,64]
+; AVX512BW-NEXT: vpsubq %zmm2, %zmm4, %zmm4
; AVX512BW-NEXT: vpsrlvq %zmm4, %zmm1, %zmm1
-; AVX512BW-NEXT: vporq %zmm1, %zmm5, %zmm1
-; AVX512BW-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vporq %zmm1, %zmm3, %zmm1
+; AVX512BW-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
@@ -65,14 +62,13 @@ define <8 x i64> @var_funnnel_v8i64(<8 x
;
; AVX512VLBW-LABEL: var_funnnel_v8i64:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
-; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512VLBW-NEXT: vpsllvq %zmm4, %zmm0, %zmm5
-; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} zmm6 = [64,64,64,64,64,64,64,64]
-; AVX512VLBW-NEXT: vpsubq %zmm4, %zmm6, %zmm4
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512VLBW-NEXT: vpsllvq %zmm2, %zmm0, %zmm3
+; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} zmm4 = [64,64,64,64,64,64,64,64]
+; AVX512VLBW-NEXT: vpsubq %zmm2, %zmm4, %zmm4
; AVX512VLBW-NEXT: vpsrlvq %zmm4, %zmm1, %zmm1
-; AVX512VLBW-NEXT: vporq %zmm1, %zmm5, %zmm1
-; AVX512VLBW-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512VLBW-NEXT: vporq %zmm1, %zmm3, %zmm1
+; AVX512VLBW-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512VLBW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512VLBW-NEXT: retq
@@ -88,42 +84,39 @@ define <8 x i64> @var_funnnel_v8i64(<8 x
define <16 x i32> @var_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> %amt) nounwind {
; AVX512F-LABEL: var_funnnel_v16i32:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
-; AVX512F-NEXT: vpandd %zmm3, %zmm2, %zmm4
-; AVX512F-NEXT: vpsllvd %zmm4, %zmm0, %zmm5
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm6 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX512F-NEXT: vpsubd %zmm4, %zmm6, %zmm4
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllvd %zmm2, %zmm0, %zmm3
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm4 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512F-NEXT: vpsubd %zmm2, %zmm4, %zmm4
; AVX512F-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
-; AVX512F-NEXT: vpord %zmm1, %zmm5, %zmm1
-; AVX512F-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512F-NEXT: vpord %zmm1, %zmm3, %zmm1
+; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v16i32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
-; AVX512VL-NEXT: vpandd %zmm3, %zmm2, %zmm4
-; AVX512VL-NEXT: vpsllvd %zmm4, %zmm0, %zmm5
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} zmm6 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX512VL-NEXT: vpsubd %zmm4, %zmm6, %zmm4
+; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; AVX512VL-NEXT: vpsllvd %zmm2, %zmm0, %zmm3
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} zmm4 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512VL-NEXT: vpsubd %zmm2, %zmm4, %zmm4
; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
-; AVX512VL-NEXT: vpord %zmm1, %zmm5, %zmm1
-; AVX512VL-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512VL-NEXT: vpord %zmm1, %zmm3, %zmm1
+; AVX512VL-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512VL-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512VL-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
-; AVX512BW-NEXT: vpandd %zmm3, %zmm2, %zmm4
-; AVX512BW-NEXT: vpsllvd %zmm4, %zmm0, %zmm5
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm6 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX512BW-NEXT: vpsubd %zmm4, %zmm6, %zmm4
+; AVX512BW-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; AVX512BW-NEXT: vpsllvd %zmm2, %zmm0, %zmm3
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm4 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512BW-NEXT: vpsubd %zmm2, %zmm4, %zmm4
; AVX512BW-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
-; AVX512BW-NEXT: vpord %zmm1, %zmm5, %zmm1
-; AVX512BW-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vpord %zmm1, %zmm3, %zmm1
+; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
@@ -135,14 +128,13 @@ define <16 x i32> @var_funnnel_v16i32(<1
;
; AVX512VLBW-LABEL: var_funnnel_v16i32:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
-; AVX512VLBW-NEXT: vpandd %zmm3, %zmm2, %zmm4
-; AVX512VLBW-NEXT: vpsllvd %zmm4, %zmm0, %zmm5
-; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} zmm6 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX512VLBW-NEXT: vpsubd %zmm4, %zmm6, %zmm4
+; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; AVX512VLBW-NEXT: vpsllvd %zmm2, %zmm0, %zmm3
+; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} zmm4 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512VLBW-NEXT: vpsubd %zmm2, %zmm4, %zmm4
; AVX512VLBW-NEXT: vpsrlvd %zmm4, %zmm1, %zmm1
-; AVX512VLBW-NEXT: vpord %zmm1, %zmm5, %zmm1
-; AVX512VLBW-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512VLBW-NEXT: vpord %zmm1, %zmm3, %zmm1
+; AVX512VLBW-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512VLBW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512VLBW-NEXT: retq
@@ -220,14 +212,13 @@ define <32 x i16> @var_funnnel_v32i16(<3
;
; AVX512BW-LABEL: var_funnnel_v32i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512BW-NEXT: vpsllvw %zmm4, %zmm0, %zmm5
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512BW-NEXT: vpsubw %zmm4, %zmm6, %zmm4
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm3
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT: vpsubw %zmm2, %zmm4, %zmm4
; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
-; AVX512BW-NEXT: vporq %zmm1, %zmm5, %zmm1
-; AVX512BW-NEXT: vptestnmw %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vporq %zmm1, %zmm3, %zmm1
+; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
@@ -239,14 +230,13 @@ define <32 x i16> @var_funnnel_v32i16(<3
;
; AVX512VLBW-LABEL: var_funnnel_v32i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512VLBW-NEXT: vpsllvw %zmm4, %zmm0, %zmm5
-; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VLBW-NEXT: vpsubw %zmm4, %zmm6, %zmm4
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLBW-NEXT: vpsllvw %zmm2, %zmm0, %zmm3
+; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT: vpsubw %zmm2, %zmm4, %zmm4
; AVX512VLBW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1
-; AVX512VLBW-NEXT: vporq %zmm1, %zmm5, %zmm1
-; AVX512VLBW-NEXT: vptestnmw %zmm3, %zmm2, %k1
+; AVX512VLBW-NEXT: vporq %zmm1, %zmm3, %zmm1
+; AVX512VLBW-NEXT: vptestnmw %zmm2, %zmm2, %k1
; AVX512VLBW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512VLBW-NEXT: retq
@@ -398,160 +388,156 @@ define <64 x i8> @var_funnnel_v64i8(<64
;
; AVX512BW-LABEL: var_funnnel_v64i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT: vpsubb %zmm4, %zmm5, %zmm5
-; AVX512BW-NEXT: vpsllw $5, %zmm5, %zmm5
-; AVX512BW-NEXT: vpaddb %zmm5, %zmm5, %zmm6
-; AVX512BW-NEXT: vpmovb2m %zmm6, %k1
-; AVX512BW-NEXT: vpmovb2m %zmm5, %k2
-; AVX512BW-NEXT: vpsrlw $4, %zmm1, %zmm5
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT: vpsubb %zmm2, %zmm3, %zmm3
+; AVX512BW-NEXT: vpsllw $5, %zmm3, %zmm3
+; AVX512BW-NEXT: vpaddb %zmm3, %zmm3, %zmm4
+; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT: vpmovb2m %zmm3, %k2
+; AVX512BW-NEXT: vpsrlw $4, %zmm1, %zmm3
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm1 {%k2}
+; AVX512BW-NEXT: vpsrlw $2, %zmm1, %zmm3
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm1 {%k1}
+; AVX512BW-NEXT: vpsrlw $1, %zmm1, %zmm3
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512BW-NEXT: vpaddb %zmm4, %zmm4, %zmm4
+; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm1 {%k1}
+; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm3
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512BW-NEXT: vpsllw $5, %zmm2, %zmm4
+; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k1}
+; AVX512BW-NEXT: vpsllw $2, %zmm3, %zmm5
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
-; AVX512BW-NEXT: vmovdqu8 %zmm5, %zmm1 {%k2}
-; AVX512BW-NEXT: vpsrlw $2, %zmm1, %zmm5
-; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
-; AVX512BW-NEXT: vmovdqu8 %zmm5, %zmm1 {%k1}
-; AVX512BW-NEXT: vpsrlw $1, %zmm1, %zmm5
-; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
-; AVX512BW-NEXT: vpaddb %zmm6, %zmm6, %zmm6
-; AVX512BW-NEXT: vpmovb2m %zmm6, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm5, %zmm1 {%k1}
-; AVX512BW-NEXT: vpsllw $5, %zmm4, %zmm4
-; AVX512BW-NEXT: vpaddb %zmm4, %zmm4, %zmm5
-; AVX512BW-NEXT: vpmovb2m %zmm5, %k1
-; AVX512BW-NEXT: vpmovb2m %zmm4, %k2
-; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm4
-; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
-; AVX512BW-NEXT: vpblendmb %zmm4, %zmm0, %zmm4 {%k2}
-; AVX512BW-NEXT: vpsllw $2, %zmm4, %zmm6
-; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm6, %zmm6
-; AVX512BW-NEXT: vmovdqu8 %zmm6, %zmm4 {%k1}
-; AVX512BW-NEXT: vpaddb %zmm5, %zmm5, %zmm5
-; AVX512BW-NEXT: vpmovb2m %zmm5, %k1
-; AVX512BW-NEXT: vpaddb %zmm4, %zmm4, %zmm4 {%k1}
-; AVX512BW-NEXT: vporq %zmm1, %zmm4, %zmm1
-; AVX512BW-NEXT: vptestnmb %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vpaddb %zmm4, %zmm4, %zmm4
+; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
+; AVX512BW-NEXT: vpaddb %zmm4, %zmm4, %zmm4
+; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
+; AVX512BW-NEXT: vporq %zmm1, %zmm3, %zmm1
+; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_funnnel_v64i8:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VBMI2-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VBMI2-NEXT: vpsubb %zmm4, %zmm5, %zmm5
-; AVX512VBMI2-NEXT: vpsllw $5, %zmm5, %zmm5
-; AVX512VBMI2-NEXT: vpaddb %zmm5, %zmm5, %zmm6
-; AVX512VBMI2-NEXT: vpmovb2m %zmm6, %k1
-; AVX512VBMI2-NEXT: vpmovb2m %zmm5, %k2
-; AVX512VBMI2-NEXT: vpsrlw $4, %zmm1, %zmm5
-; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
-; AVX512VBMI2-NEXT: vmovdqu8 %zmm5, %zmm1 {%k2}
-; AVX512VBMI2-NEXT: vpsrlw $2, %zmm1, %zmm5
-; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
-; AVX512VBMI2-NEXT: vmovdqu8 %zmm5, %zmm1 {%k1}
-; AVX512VBMI2-NEXT: vpsrlw $1, %zmm1, %zmm5
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VBMI2-NEXT: vpsubb %zmm2, %zmm3, %zmm3
+; AVX512VBMI2-NEXT: vpsllw $5, %zmm3, %zmm3
+; AVX512VBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm4
+; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VBMI2-NEXT: vpmovb2m %zmm3, %k2
+; AVX512VBMI2-NEXT: vpsrlw $4, %zmm1, %zmm3
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VBMI2-NEXT: vmovdqu8 %zmm3, %zmm1 {%k2}
+; AVX512VBMI2-NEXT: vpsrlw $2, %zmm1, %zmm3
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VBMI2-NEXT: vmovdqu8 %zmm3, %zmm1 {%k1}
+; AVX512VBMI2-NEXT: vpsrlw $1, %zmm1, %zmm3
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4
+; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VBMI2-NEXT: vmovdqu8 %zmm3, %zmm1 {%k1}
+; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm3
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VBMI2-NEXT: vpsllw $5, %zmm2, %zmm4
+; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VBMI2-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k1}
+; AVX512VBMI2-NEXT: vpsllw $2, %zmm3, %zmm5
; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
-; AVX512VBMI2-NEXT: vpaddb %zmm6, %zmm6, %zmm6
-; AVX512VBMI2-NEXT: vpmovb2m %zmm6, %k1
-; AVX512VBMI2-NEXT: vmovdqu8 %zmm5, %zmm1 {%k1}
-; AVX512VBMI2-NEXT: vpsllw $5, %zmm4, %zmm4
-; AVX512VBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm5
-; AVX512VBMI2-NEXT: vpmovb2m %zmm5, %k1
-; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k2
-; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm4
-; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
-; AVX512VBMI2-NEXT: vpblendmb %zmm4, %zmm0, %zmm4 {%k2}
-; AVX512VBMI2-NEXT: vpsllw $2, %zmm4, %zmm6
-; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm6, %zmm6
-; AVX512VBMI2-NEXT: vmovdqu8 %zmm6, %zmm4 {%k1}
-; AVX512VBMI2-NEXT: vpaddb %zmm5, %zmm5, %zmm5
-; AVX512VBMI2-NEXT: vpmovb2m %zmm5, %k1
-; AVX512VBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4 {%k1}
-; AVX512VBMI2-NEXT: vporq %zmm1, %zmm4, %zmm1
-; AVX512VBMI2-NEXT: vptestnmb %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4
+; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VBMI2-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
+; AVX512VBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4
+; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
+; AVX512VBMI2-NEXT: vporq %zmm1, %zmm3, %zmm1
+; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
; AVX512VBMI2-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v64i8:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLBW-NEXT: vpsubb %zmm4, %zmm5, %zmm5
-; AVX512VLBW-NEXT: vpsllw $5, %zmm5, %zmm5
-; AVX512VLBW-NEXT: vpaddb %zmm5, %zmm5, %zmm6
-; AVX512VLBW-NEXT: vpmovb2m %zmm6, %k1
-; AVX512VLBW-NEXT: vpmovb2m %zmm5, %k2
-; AVX512VLBW-NEXT: vpsrlw $4, %zmm1, %zmm5
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLBW-NEXT: vpsubb %zmm2, %zmm3, %zmm3
+; AVX512VLBW-NEXT: vpsllw $5, %zmm3, %zmm3
+; AVX512VLBW-NEXT: vpaddb %zmm3, %zmm3, %zmm4
+; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VLBW-NEXT: vpmovb2m %zmm3, %k2
+; AVX512VLBW-NEXT: vpsrlw $4, %zmm1, %zmm3
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VLBW-NEXT: vmovdqu8 %zmm3, %zmm1 {%k2}
+; AVX512VLBW-NEXT: vpsrlw $2, %zmm1, %zmm3
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VLBW-NEXT: vmovdqu8 %zmm3, %zmm1 {%k1}
+; AVX512VLBW-NEXT: vpsrlw $1, %zmm1, %zmm3
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VLBW-NEXT: vpaddb %zmm4, %zmm4, %zmm4
+; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VLBW-NEXT: vmovdqu8 %zmm3, %zmm1 {%k1}
+; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm3
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VLBW-NEXT: vpsllw $5, %zmm2, %zmm4
+; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VLBW-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k1}
+; AVX512VLBW-NEXT: vpsllw $2, %zmm3, %zmm5
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
-; AVX512VLBW-NEXT: vmovdqu8 %zmm5, %zmm1 {%k2}
-; AVX512VLBW-NEXT: vpsrlw $2, %zmm1, %zmm5
-; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
-; AVX512VLBW-NEXT: vmovdqu8 %zmm5, %zmm1 {%k1}
-; AVX512VLBW-NEXT: vpsrlw $1, %zmm1, %zmm5
-; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
-; AVX512VLBW-NEXT: vpaddb %zmm6, %zmm6, %zmm6
-; AVX512VLBW-NEXT: vpmovb2m %zmm6, %k1
-; AVX512VLBW-NEXT: vmovdqu8 %zmm5, %zmm1 {%k1}
-; AVX512VLBW-NEXT: vpsllw $5, %zmm4, %zmm4
-; AVX512VLBW-NEXT: vpaddb %zmm4, %zmm4, %zmm5
-; AVX512VLBW-NEXT: vpmovb2m %zmm5, %k1
-; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k2
-; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm4
-; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
-; AVX512VLBW-NEXT: vpblendmb %zmm4, %zmm0, %zmm4 {%k2}
-; AVX512VLBW-NEXT: vpsllw $2, %zmm4, %zmm6
-; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm6, %zmm6
-; AVX512VLBW-NEXT: vmovdqu8 %zmm6, %zmm4 {%k1}
-; AVX512VLBW-NEXT: vpaddb %zmm5, %zmm5, %zmm5
-; AVX512VLBW-NEXT: vpmovb2m %zmm5, %k1
-; AVX512VLBW-NEXT: vpaddb %zmm4, %zmm4, %zmm4 {%k1}
-; AVX512VLBW-NEXT: vporq %zmm1, %zmm4, %zmm1
-; AVX512VLBW-NEXT: vptestnmb %zmm3, %zmm2, %k1
+; AVX512VLBW-NEXT: vpaddb %zmm4, %zmm4, %zmm4
+; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VLBW-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
+; AVX512VLBW-NEXT: vpaddb %zmm4, %zmm4, %zmm4
+; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VLBW-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
+; AVX512VLBW-NEXT: vporq %zmm1, %zmm3, %zmm1
+; AVX512VLBW-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512VLBW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: var_funnnel_v64i8:
; AVX512VLVBMI2: # %bb.0:
-; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLVBMI2-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLVBMI2-NEXT: vpsubb %zmm4, %zmm5, %zmm5
-; AVX512VLVBMI2-NEXT: vpsllw $5, %zmm5, %zmm5
-; AVX512VLVBMI2-NEXT: vpaddb %zmm5, %zmm5, %zmm6
-; AVX512VLVBMI2-NEXT: vpmovb2m %zmm6, %k1
-; AVX512VLVBMI2-NEXT: vpmovb2m %zmm5, %k2
-; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm1, %zmm5
-; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
-; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm5, %zmm1 {%k2}
-; AVX512VLVBMI2-NEXT: vpsrlw $2, %zmm1, %zmm5
-; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
-; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm5, %zmm1 {%k1}
-; AVX512VLVBMI2-NEXT: vpsrlw $1, %zmm1, %zmm5
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLVBMI2-NEXT: vpsubb %zmm2, %zmm3, %zmm3
+; AVX512VLVBMI2-NEXT: vpsllw $5, %zmm3, %zmm3
+; AVX512VLVBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm4
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm3, %k2
+; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm1, %zmm3
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm3, %zmm1 {%k2}
+; AVX512VLVBMI2-NEXT: vpsrlw $2, %zmm1, %zmm3
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm3, %zmm1 {%k1}
+; AVX512VLVBMI2-NEXT: vpsrlw $1, %zmm1, %zmm3
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VLVBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm3, %zmm1 {%k1}
+; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm3
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VLVBMI2-NEXT: vpsllw $5, %zmm2, %zmm4
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VLVBMI2-NEXT: vpblendmb %zmm3, %zmm0, %zmm3 {%k1}
+; AVX512VLVBMI2-NEXT: vpsllw $2, %zmm3, %zmm5
; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
-; AVX512VLVBMI2-NEXT: vpaddb %zmm6, %zmm6, %zmm6
-; AVX512VLVBMI2-NEXT: vpmovb2m %zmm6, %k1
-; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm5, %zmm1 {%k1}
-; AVX512VLVBMI2-NEXT: vpsllw $5, %zmm4, %zmm4
-; AVX512VLVBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm5
-; AVX512VLVBMI2-NEXT: vpmovb2m %zmm5, %k1
-; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k2
-; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm4
-; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
-; AVX512VLVBMI2-NEXT: vpblendmb %zmm4, %zmm0, %zmm4 {%k2}
-; AVX512VLVBMI2-NEXT: vpsllw $2, %zmm4, %zmm6
-; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm6, %zmm6
-; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm6, %zmm4 {%k1}
-; AVX512VLVBMI2-NEXT: vpaddb %zmm5, %zmm5, %zmm5
-; AVX512VLVBMI2-NEXT: vpmovb2m %zmm5, %k1
-; AVX512VLVBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4 {%k1}
-; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm4, %zmm1
-; AVX512VLVBMI2-NEXT: vptestnmb %zmm3, %zmm2, %k1
+; AVX512VLVBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
+; AVX512VLVBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VLVBMI2-NEXT: vpaddb %zmm3, %zmm3, %zmm3 {%k1}
+; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm3, %zmm1
+; AVX512VLVBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
; AVX512VLVBMI2-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512VLVBMI2-NEXT: retq
@@ -567,14 +553,13 @@ define <8 x i64> @splatvar_funnnel_v8i64
; AVX512F-LABEL: splatvar_funnnel_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastq %xmm2, %zmm2
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
-; AVX512F-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512F-NEXT: vpsllq %xmm4, %zmm0, %zmm5
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512F-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq %xmm2, %zmm0, %zmm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512F-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512F-NEXT: vpsrlq %xmm4, %zmm1, %zmm1
-; AVX512F-NEXT: vporq %zmm1, %zmm5, %zmm1
-; AVX512F-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512F-NEXT: vporq %zmm1, %zmm3, %zmm1
+; AVX512F-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-NEXT: retq
@@ -582,14 +567,13 @@ define <8 x i64> @splatvar_funnnel_v8i64
; AVX512VL-LABEL: splatvar_funnnel_v8i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastq %xmm2, %zmm2
-; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
-; AVX512VL-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512VL-NEXT: vpsllq %xmm4, %zmm0, %zmm5
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VL-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VL-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512VL-NEXT: vpsllq %xmm2, %zmm0, %zmm3
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VL-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512VL-NEXT: vpsrlq %xmm4, %zmm1, %zmm1
-; AVX512VL-NEXT: vporq %zmm1, %zmm5, %zmm1
-; AVX512VL-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512VL-NEXT: vporq %zmm1, %zmm3, %zmm1
+; AVX512VL-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512VL-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512VL-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512VL-NEXT: retq
@@ -597,14 +581,13 @@ define <8 x i64> @splatvar_funnnel_v8i64
; AVX512BW-LABEL: splatvar_funnnel_v8i64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastq %xmm2, %zmm2
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
-; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512BW-NEXT: vpsllq %xmm4, %zmm0, %zmm5
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512BW-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512BW-NEXT: vpsllq %xmm2, %zmm0, %zmm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512BW-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512BW-NEXT: vpsrlq %xmm4, %zmm1, %zmm1
-; AVX512BW-NEXT: vporq %zmm1, %zmm5, %zmm1
-; AVX512BW-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vporq %zmm1, %zmm3, %zmm1
+; AVX512BW-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
@@ -618,14 +601,13 @@ define <8 x i64> @splatvar_funnnel_v8i64
; AVX512VLBW-LABEL: splatvar_funnnel_v8i64:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastq %xmm2, %zmm2
-; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
-; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512VLBW-NEXT: vpsllq %xmm4, %zmm0, %zmm5
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VLBW-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512VLBW-NEXT: vpsllq %xmm2, %zmm0, %zmm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VLBW-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpsrlq %xmm4, %zmm1, %zmm1
-; AVX512VLBW-NEXT: vporq %zmm1, %zmm5, %zmm1
-; AVX512VLBW-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512VLBW-NEXT: vporq %zmm1, %zmm3, %zmm1
+; AVX512VLBW-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512VLBW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512VLBW-NEXT: retq
@@ -644,16 +626,15 @@ define <16 x i32> @splatvar_funnnel_v16i
; AVX512F-LABEL: splatvar_funnnel_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastd %xmm2, %zmm2
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
-; AVX512F-NEXT: vpandd %zmm3, %zmm2, %zmm4
-; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512F-NEXT: vpslld %xmm5, %zmm0, %zmm5
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512F-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512F-NEXT: vpslld %xmm3, %zmm0, %zmm3
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512F-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX512F-NEXT: vpsrld %xmm4, %zmm1, %zmm1
-; AVX512F-NEXT: vpord %zmm1, %zmm5, %zmm1
-; AVX512F-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512F-NEXT: vpord %zmm1, %zmm3, %zmm1
+; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-NEXT: retq
@@ -661,16 +642,15 @@ define <16 x i32> @splatvar_funnnel_v16i
; AVX512VL-LABEL: splatvar_funnnel_v16i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastd %xmm2, %zmm2
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
-; AVX512VL-NEXT: vpandd %zmm3, %zmm2, %zmm4
-; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512VL-NEXT: vpslld %xmm5, %zmm0, %zmm5
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VL-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512VL-NEXT: vpslld %xmm3, %zmm0, %zmm3
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VL-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX512VL-NEXT: vpsrld %xmm4, %zmm1, %zmm1
-; AVX512VL-NEXT: vpord %zmm1, %zmm5, %zmm1
-; AVX512VL-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512VL-NEXT: vpord %zmm1, %zmm3, %zmm1
+; AVX512VL-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512VL-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512VL-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512VL-NEXT: retq
@@ -678,16 +658,15 @@ define <16 x i32> @splatvar_funnnel_v16i
; AVX512BW-LABEL: splatvar_funnnel_v16i32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastd %xmm2, %zmm2
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
-; AVX512BW-NEXT: vpandd %zmm3, %zmm2, %zmm4
-; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512BW-NEXT: vpslld %xmm5, %zmm0, %zmm5
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512BW-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512BW-NEXT: vpslld %xmm3, %zmm0, %zmm3
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512BW-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX512BW-NEXT: vpsrld %xmm4, %zmm1, %zmm1
-; AVX512BW-NEXT: vpord %zmm1, %zmm5, %zmm1
-; AVX512BW-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vpord %zmm1, %zmm3, %zmm1
+; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
@@ -701,16 +680,15 @@ define <16 x i32> @splatvar_funnnel_v16i
; AVX512VLBW-LABEL: splatvar_funnnel_v16i32:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %zmm2
-; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
-; AVX512VLBW-NEXT: vpandd %zmm3, %zmm2, %zmm4
-; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512VLBW-NEXT: vpslld %xmm5, %zmm0, %zmm5
-; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VLBW-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512VLBW-NEXT: vpslld %xmm3, %zmm0, %zmm3
+; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VLBW-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX512VLBW-NEXT: vpsrld %xmm4, %zmm1, %zmm1
-; AVX512VLBW-NEXT: vpord %zmm1, %zmm5, %zmm1
-; AVX512VLBW-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512VLBW-NEXT: vpord %zmm1, %zmm3, %zmm1
+; AVX512VLBW-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512VLBW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512VLBW-NEXT: retq
@@ -769,16 +747,15 @@ define <32 x i16> @splatvar_funnnel_v32i
; AVX512BW-LABEL: splatvar_funnnel_v32i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastw %xmm2, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512BW-NEXT: vpsllw %xmm5, %zmm0, %zmm5
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512BW-NEXT: vpsubw %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512BW-NEXT: vpsllw %xmm3, %zmm0, %zmm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT: vpsubw %xmm2, %xmm4, %xmm4
; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
; AVX512BW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1
-; AVX512BW-NEXT: vporq %zmm1, %zmm5, %zmm1
-; AVX512BW-NEXT: vptestnmw %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vporq %zmm1, %zmm3, %zmm1
+; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
@@ -792,16 +769,15 @@ define <32 x i16> @splatvar_funnnel_v32i
; AVX512VLBW-LABEL: splatvar_funnnel_v32i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %zmm2
-; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512VLBW-NEXT: vpsllw %xmm5, %zmm0, %zmm5
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512VLBW-NEXT: vpsubw %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512VLBW-NEXT: vpsllw %xmm3, %zmm0, %zmm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT: vpsubw %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
; AVX512VLBW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1
-; AVX512VLBW-NEXT: vporq %zmm1, %zmm5, %zmm1
-; AVX512VLBW-NEXT: vptestnmw %zmm3, %zmm2, %k1
+; AVX512VLBW-NEXT: vporq %zmm1, %zmm3, %zmm1
+; AVX512VLBW-NEXT: vptestnmw %zmm2, %zmm2, %k1
; AVX512VLBW-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512VLBW-NEXT: retq
@@ -880,24 +856,23 @@ define <64 x i8> @splatvar_funnnel_v64i8
; AVX512BW-LABEL: splatvar_funnnel_v64i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastb %xmm2, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT: vpsllw %xmm5, %zmm0, %zmm6
-; AVX512BW-NEXT: vpternlogd $255, %zmm7, %zmm7, %zmm7
-; AVX512BW-NEXT: vpsllw %xmm5, %zmm7, %zmm5
-; AVX512BW-NEXT: vpbroadcastb %xmm5, %zmm5
-; AVX512BW-NEXT: vpandq %zmm5, %zmm6, %zmm5
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT: vpsubb %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpsllw %xmm3, %zmm0, %zmm4
+; AVX512BW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5
+; AVX512BW-NEXT: vpsllw %xmm3, %zmm5, %zmm3
+; AVX512BW-NEXT: vpbroadcastb %xmm3, %zmm3
+; AVX512BW-NEXT: vpandq %zmm3, %zmm4, %zmm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1
-; AVX512BW-NEXT: vpsrlw %xmm4, %zmm7, %zmm4
+; AVX512BW-NEXT: vpsrlw %xmm4, %zmm5, %zmm4
; AVX512BW-NEXT: vpsrlw $8, %zmm4, %zmm4
; AVX512BW-NEXT: vpbroadcastb %xmm4, %zmm4
; AVX512BW-NEXT: vpandq %zmm4, %zmm1, %zmm1
-; AVX512BW-NEXT: vporq %zmm1, %zmm5, %zmm1
-; AVX512BW-NEXT: vptestnmb %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vporq %zmm1, %zmm3, %zmm1
+; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: retq
@@ -905,24 +880,23 @@ define <64 x i8> @splatvar_funnnel_v64i8
; AVX512VBMI2-LABEL: splatvar_funnnel_v64i8:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %zmm2
-; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VBMI2-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VBMI2-NEXT: vpsllw %xmm5, %zmm0, %zmm6
-; AVX512VBMI2-NEXT: vpternlogd $255, %zmm7, %zmm7, %zmm7
-; AVX512VBMI2-NEXT: vpsllw %xmm5, %zmm7, %zmm5
-; AVX512VBMI2-NEXT: vpbroadcastb %xmm5, %zmm5
-; AVX512VBMI2-NEXT: vpandq %zmm5, %zmm6, %zmm5
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VBMI2-NEXT: vpsubb %xmm4, %xmm6, %xmm4
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMI2-NEXT: vpsllw %xmm3, %zmm0, %zmm4
+; AVX512VBMI2-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5
+; AVX512VBMI2-NEXT: vpsllw %xmm3, %zmm5, %zmm3
+; AVX512VBMI2-NEXT: vpbroadcastb %xmm3, %zmm3
+; AVX512VBMI2-NEXT: vpandq %zmm3, %zmm4, %zmm3
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VBMI2-NEXT: vpsrlw %xmm4, %zmm1, %zmm1
-; AVX512VBMI2-NEXT: vpsrlw %xmm4, %zmm7, %zmm4
+; AVX512VBMI2-NEXT: vpsrlw %xmm4, %zmm5, %zmm4
; AVX512VBMI2-NEXT: vpsrlw $8, %zmm4, %zmm4
; AVX512VBMI2-NEXT: vpbroadcastb %xmm4, %zmm4
; AVX512VBMI2-NEXT: vpandq %zmm4, %zmm1, %zmm1
-; AVX512VBMI2-NEXT: vporq %zmm1, %zmm5, %zmm1
-; AVX512VBMI2-NEXT: vptestnmb %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT: vporq %zmm1, %zmm3, %zmm1
+; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
; AVX512VBMI2-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512VBMI2-NEXT: retq
@@ -930,24 +904,23 @@ define <64 x i8> @splatvar_funnnel_v64i8
; AVX512VLBW-LABEL: splatvar_funnnel_v64i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %zmm2
-; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VLBW-NEXT: vpsllw %xmm5, %zmm0, %zmm6
-; AVX512VLBW-NEXT: vpternlogd $255, %zmm7, %zmm7, %zmm7
-; AVX512VLBW-NEXT: vpsllw %xmm5, %zmm7, %zmm5
-; AVX512VLBW-NEXT: vpbroadcastb %xmm5, %zmm5
-; AVX512VLBW-NEXT: vpandq %zmm5, %zmm6, %zmm5
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLBW-NEXT: vpsubb %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLBW-NEXT: vpsllw %xmm3, %zmm0, %zmm4
+; AVX512VLBW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5
+; AVX512VLBW-NEXT: vpsllw %xmm3, %zmm5, %zmm3
+; AVX512VLBW-NEXT: vpbroadcastb %xmm3, %zmm3
+; AVX512VLBW-NEXT: vpandq %zmm3, %zmm4, %zmm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VLBW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1
-; AVX512VLBW-NEXT: vpsrlw %xmm4, %zmm7, %zmm4
+; AVX512VLBW-NEXT: vpsrlw %xmm4, %zmm5, %zmm4
; AVX512VLBW-NEXT: vpsrlw $8, %zmm4, %zmm4
; AVX512VLBW-NEXT: vpbroadcastb %xmm4, %zmm4
; AVX512VLBW-NEXT: vpandq %zmm4, %zmm1, %zmm1
-; AVX512VLBW-NEXT: vporq %zmm1, %zmm5, %zmm1
-; AVX512VLBW-NEXT: vptestnmb %zmm3, %zmm2, %k1
+; AVX512VLBW-NEXT: vporq %zmm1, %zmm3, %zmm1
+; AVX512VLBW-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512VLBW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512VLBW-NEXT: retq
@@ -955,24 +928,23 @@ define <64 x i8> @splatvar_funnnel_v64i8
; AVX512VLVBMI2-LABEL: splatvar_funnnel_v64i8:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %zmm2
-; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLVBMI2-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VLVBMI2-NEXT: vpsllw %xmm5, %zmm0, %zmm6
-; AVX512VLVBMI2-NEXT: vpternlogd $255, %zmm7, %zmm7, %zmm7
-; AVX512VLVBMI2-NEXT: vpsllw %xmm5, %zmm7, %zmm5
-; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm5, %zmm5
-; AVX512VLVBMI2-NEXT: vpandq %zmm5, %zmm6, %zmm5
-; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLVBMI2-NEXT: vpsubb %xmm4, %xmm6, %xmm4
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLVBMI2-NEXT: vpsllw %xmm3, %zmm0, %zmm4
+; AVX512VLVBMI2-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5
+; AVX512VLVBMI2-NEXT: vpsllw %xmm3, %zmm5, %zmm3
+; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm3, %zmm3
+; AVX512VLVBMI2-NEXT: vpandq %zmm3, %zmm4, %zmm3
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VLVBMI2-NEXT: vpsrlw %xmm4, %zmm1, %zmm1
-; AVX512VLVBMI2-NEXT: vpsrlw %xmm4, %zmm7, %zmm4
+; AVX512VLVBMI2-NEXT: vpsrlw %xmm4, %zmm5, %zmm4
; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm4, %zmm4
; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm4, %zmm4
; AVX512VLVBMI2-NEXT: vpandq %zmm4, %zmm1, %zmm1
-; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm5, %zmm1
-; AVX512VLVBMI2-NEXT: vptestnmb %zmm3, %zmm2, %k1
+; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm3, %zmm1
+; AVX512VLVBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1}
; AVX512VLVBMI2-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512VLVBMI2-NEXT: retq
Modified: llvm/trunk/test/CodeGen/X86/vector-fshr-128.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-fshr-128.ll?rev=358359&r1=358358&r2=358359&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-fshr-128.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-fshr-128.ll Sun Apr 14 11:26:11 2019
@@ -110,16 +110,14 @@ define <2 x i64> @var_funnnel_v2i64(<2 x
;
; AVX512F-LABEL: var_funnnel_v2i64:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
-; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512F-NEXT: vpsrlvq %xmm4, %xmm1, %xmm5
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512F-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512F-NEXT: vpsrlvq %xmm2, %xmm1, %xmm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512F-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512F-NEXT: vpsllvq %xmm4, %xmm0, %xmm0
-; AVX512F-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX512F-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512F-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX512F-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
@@ -127,29 +125,26 @@ define <2 x i64> @var_funnnel_v2i64(<2 x
;
; AVX512VL-LABEL: var_funnnel_v2i64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
-; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VL-NEXT: vpsrlvq %xmm4, %xmm1, %xmm5
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VL-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VL-NEXT: vpsrlvq %xmm2, %xmm1, %xmm3
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VL-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512VL-NEXT: vpsllvq %xmm4, %xmm0, %xmm0
-; AVX512VL-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX512VL-NEXT: vptestnmq %xmm3, %xmm2, %k1
+; AVX512VL-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX512VL-NEXT: vptestnmq %xmm2, %xmm2, %k1
; AVX512VL-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v2i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
-; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512BW-NEXT: vpsrlvq %xmm4, %xmm1, %xmm5
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512BW-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512BW-NEXT: vpsrlvq %xmm2, %xmm1, %xmm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512BW-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512BW-NEXT: vpsllvq %xmm4, %xmm0, %xmm0
-; AVX512BW-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX512BW-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX512BW-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
@@ -157,16 +152,14 @@ define <2 x i64> @var_funnnel_v2i64(<2 x
;
; AVX512VBMI2-LABEL: var_funnnel_v2i64:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
-; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VBMI2-NEXT: vpsrlvq %xmm4, %xmm1, %xmm5
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VBMI2-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vpsrlvq %xmm2, %xmm1, %xmm3
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VBMI2-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512VBMI2-NEXT: vpsllvq %xmm4, %xmm0, %xmm0
-; AVX512VBMI2-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX512VBMI2-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX512VBMI2-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI2-NEXT: vzeroupper
@@ -174,14 +167,13 @@ define <2 x i64> @var_funnnel_v2i64(<2 x
;
; AVX512VLBW-LABEL: var_funnnel_v2i64:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
-; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VLBW-NEXT: vpsrlvq %xmm4, %xmm1, %xmm5
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VLBW-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpsrlvq %xmm2, %xmm1, %xmm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VLBW-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpsllvq %xmm4, %xmm0, %xmm0
-; AVX512VLBW-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX512VLBW-NEXT: vptestnmq %xmm3, %xmm2, %k1
+; AVX512VLBW-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX512VLBW-NEXT: vptestnmq %xmm2, %xmm2, %k1
; AVX512VLBW-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1}
; AVX512VLBW-NEXT: retq
;
@@ -363,16 +355,15 @@ define <4 x i32> @var_funnnel_v4i32(<4 x
;
; AVX512F-LABEL: var_funnnel_v4i32:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512F-NEXT: vpsrlvd %xmm4, %xmm1, %xmm5
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512F-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT: vpsrlvd %xmm2, %xmm1, %xmm3
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512F-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512F-NEXT: vpsllvd %xmm4, %xmm0, %xmm0
-; AVX512F-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX512F-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512F-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
@@ -380,29 +371,27 @@ define <4 x i32> @var_funnnel_v4i32(<4 x
;
; AVX512VL-LABEL: var_funnnel_v4i32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VL-NEXT: vpsrlvd %xmm4, %xmm1, %xmm5
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VL-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2
+; AVX512VL-NEXT: vpsrlvd %xmm2, %xmm1, %xmm3
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VL-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512VL-NEXT: vpsllvd %xmm4, %xmm0, %xmm0
-; AVX512VL-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX512VL-NEXT: vptestnmd %xmm3, %xmm2, %k1
+; AVX512VL-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX512VL-NEXT: vptestnmd %xmm2, %xmm2, %k1
; AVX512VL-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v4i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512BW-NEXT: vpsrlvd %xmm4, %xmm1, %xmm5
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512BW-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpsrlvd %xmm2, %xmm1, %xmm3
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512BW-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512BW-NEXT: vpsllvd %xmm4, %xmm0, %xmm0
-; AVX512BW-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX512BW-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
@@ -410,16 +399,15 @@ define <4 x i32> @var_funnnel_v4i32(<4 x
;
; AVX512VBMI2-LABEL: var_funnnel_v4i32:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VBMI2-NEXT: vpsrlvd %xmm4, %xmm1, %xmm5
-; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VBMI2-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vpsrlvd %xmm2, %xmm1, %xmm3
+; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VBMI2-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512VBMI2-NEXT: vpsllvd %xmm4, %xmm0, %xmm0
-; AVX512VBMI2-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX512VBMI2-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX512VBMI2-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI2-NEXT: vzeroupper
@@ -427,14 +415,13 @@ define <4 x i32> @var_funnnel_v4i32(<4 x
;
; AVX512VLBW-LABEL: var_funnnel_v4i32:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VLBW-NEXT: vpsrlvd %xmm4, %xmm1, %xmm5
-; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VLBW-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpsrlvd %xmm2, %xmm1, %xmm3
+; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VLBW-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpsllvd %xmm4, %xmm0, %xmm0
-; AVX512VLBW-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX512VLBW-NEXT: vptestnmd %xmm3, %xmm2, %k1
+; AVX512VLBW-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX512VLBW-NEXT: vptestnmd %xmm2, %xmm2, %k1
; AVX512VLBW-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1}
; AVX512VLBW-NEXT: retq
;
@@ -727,17 +714,15 @@ define <8 x i16> @var_funnnel_v8i16(<8 x
;
; AVX512BW-LABEL: var_funnnel_v8i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm5
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512BW-NEXT: vpsubw %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT: vpsubw %xmm2, %xmm4, %xmm4
; AVX512BW-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
-; AVX512BW-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX512BW-NEXT: vptestnmw %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
@@ -745,17 +730,15 @@ define <8 x i16> @var_funnnel_v8i16(<8 x
;
; AVX512VBMI2-LABEL: var_funnnel_v8i16:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
-; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VBMI2-NEXT: vpsrlvw %zmm4, %zmm1, %zmm5
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512VBMI2-NEXT: vpsubw %xmm4, %xmm6, %xmm4
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vpsrlvw %zmm2, %zmm1, %zmm3
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512VBMI2-NEXT: vpsubw %xmm2, %xmm4, %xmm4
; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
-; AVX512VBMI2-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX512VBMI2-NEXT: vptestnmw %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX512VBMI2-NEXT: vptestnmw %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI2-NEXT: vzeroupper
@@ -763,14 +746,13 @@ define <8 x i16> @var_funnnel_v8i16(<8 x
;
; AVX512VLBW-LABEL: var_funnnel_v8i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
-; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VLBW-NEXT: vpsrlvw %xmm4, %xmm1, %xmm5
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512VLBW-NEXT: vpsubw %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpsrlvw %xmm2, %xmm1, %xmm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT: vpsubw %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpsllvw %xmm4, %xmm0, %xmm0
-; AVX512VLBW-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX512VLBW-NEXT: vptestnmw %xmm3, %xmm2, %k1
+; AVX512VLBW-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX512VLBW-NEXT: vptestnmw %xmm2, %xmm2, %k1
; AVX512VLBW-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1}
; AVX512VLBW-NEXT: retq
;
@@ -1056,21 +1038,19 @@ define <16 x i8> @var_funnnel_v16i8(<16
;
; AVX512BW-LABEL: var_funnnel_v16i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512BW-NEXT: vpand %xmm4, %xmm2, %xmm5
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
-; AVX512BW-NEXT: vpsrlvw %zmm6, %zmm3, %zmm3
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm3, %zmm3
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT: vpsubb %xmm5, %xmm6, %xmm5
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
-; AVX512BW-NEXT: vpsllvw %zmm5, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
+; AVX512BW-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
; AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: vptestnmb %zmm4, %zmm2, %k1
+; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
@@ -1078,21 +1058,19 @@ define <16 x i8> @var_funnnel_v16i8(<16
;
; AVX512VBMI2-LABEL: var_funnnel_v16i8:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VBMI2-NEXT: vpand %xmm4, %xmm2, %xmm5
-; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
-; AVX512VBMI2-NEXT: vpsrlvw %zmm6, %zmm3, %zmm3
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512VBMI2-NEXT: vpsrlvw %zmm4, %zmm3, %zmm3
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VBMI2-NEXT: vpsubb %xmm5, %xmm6, %xmm5
-; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
-; AVX512VBMI2-NEXT: vpsllvw %zmm5, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
+; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
; AVX512VBMI2-NEXT: vpor %ymm3, %ymm0, %ymm0
; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512VBMI2-NEXT: vptestnmb %zmm4, %zmm2, %k1
+; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI2-NEXT: vzeroupper
@@ -1100,38 +1078,36 @@ define <16 x i8> @var_funnnel_v16i8(<16
;
; AVX512VLBW-LABEL: var_funnnel_v16i8:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
-; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512VLBW-NEXT: vpsrlvw %ymm5, %ymm6, %ymm5
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLBW-NEXT: vpsubb %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512VLBW-NEXT: vpsrlvw %ymm4, %ymm3, %ymm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512VLBW-NEXT: vpsllvw %ymm4, %ymm0, %ymm0
-; AVX512VLBW-NEXT: vpor %ymm5, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vpor %ymm3, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0
-; AVX512VLBW-NEXT: vptestnmb %xmm3, %xmm2, %k1
+; AVX512VLBW-NEXT: vptestnmb %xmm2, %xmm2, %k1
; AVX512VLBW-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
; AVX512VLBW-NEXT: vzeroupper
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: var_funnnel_v16i8:
; AVX512VLVBMI2: # %bb.0:
-; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLVBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
-; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512VLVBMI2-NEXT: vpsrlvw %ymm5, %ymm6, %ymm5
-; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLVBMI2-NEXT: vpsubb %xmm4, %xmm6, %xmm4
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512VLVBMI2-NEXT: vpsrlvw %ymm4, %ymm3, %ymm3
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512VLVBMI2-NEXT: vpsllvw %ymm4, %ymm0, %ymm0
-; AVX512VLVBMI2-NEXT: vpor %ymm5, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: vpor %ymm3, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0
-; AVX512VLVBMI2-NEXT: vptestnmb %xmm3, %xmm2, %k1
+; AVX512VLVBMI2-NEXT: vptestnmb %xmm2, %xmm2, %k1
; AVX512VLVBMI2-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
; AVX512VLVBMI2-NEXT: vzeroupper
; AVX512VLVBMI2-NEXT: retq
@@ -1294,14 +1270,13 @@ define <2 x i64> @splatvar_funnnel_v2i64
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT: vpbroadcastq %xmm2, %xmm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
-; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512F-NEXT: vpsrlq %xmm4, %xmm1, %xmm5
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512F-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512F-NEXT: vpsrlq %xmm2, %xmm1, %xmm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512F-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512F-NEXT: vpsllq %xmm4, %xmm0, %xmm0
-; AVX512F-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX512F-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512F-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX512F-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
@@ -1310,14 +1285,13 @@ define <2 x i64> @splatvar_funnnel_v2i64
; AVX512VL-LABEL: splatvar_funnnel_v2i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastq %xmm2, %xmm2
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
-; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VL-NEXT: vpsrlq %xmm4, %xmm1, %xmm5
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VL-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VL-NEXT: vpsrlq %xmm2, %xmm1, %xmm3
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VL-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512VL-NEXT: vpsllq %xmm4, %xmm0, %xmm0
-; AVX512VL-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX512VL-NEXT: vptestnmq %xmm3, %xmm2, %k1
+; AVX512VL-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX512VL-NEXT: vptestnmq %xmm2, %xmm2, %k1
; AVX512VL-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT: retq
;
@@ -1325,14 +1299,13 @@ define <2 x i64> @splatvar_funnnel_v2i64
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: vpbroadcastq %xmm2, %xmm2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
-; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512BW-NEXT: vpsrlq %xmm4, %xmm1, %xmm5
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512BW-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512BW-NEXT: vpsrlq %xmm2, %xmm1, %xmm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512BW-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512BW-NEXT: vpsllq %xmm4, %xmm0, %xmm0
-; AVX512BW-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX512BW-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX512BW-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
@@ -1342,14 +1315,13 @@ define <2 x i64> @splatvar_funnnel_v2i64
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512VBMI2-NEXT: vpbroadcastq %xmm2, %xmm2
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
-; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VBMI2-NEXT: vpsrlq %xmm4, %xmm1, %xmm5
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VBMI2-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vpsrlq %xmm2, %xmm1, %xmm3
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VBMI2-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512VBMI2-NEXT: vpsllq %xmm4, %xmm0, %xmm0
-; AVX512VBMI2-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX512VBMI2-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX512VBMI2-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI2-NEXT: vzeroupper
@@ -1358,14 +1330,13 @@ define <2 x i64> @splatvar_funnnel_v2i64
; AVX512VLBW-LABEL: splatvar_funnnel_v2i64:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastq %xmm2, %xmm2
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63]
-; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VLBW-NEXT: vpsrlq %xmm4, %xmm1, %xmm5
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VLBW-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpsrlq %xmm2, %xmm1, %xmm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VLBW-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpsllq %xmm4, %xmm0, %xmm0
-; AVX512VLBW-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX512VLBW-NEXT: vptestnmq %xmm3, %xmm2, %k1
+; AVX512VLBW-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX512VLBW-NEXT: vptestnmq %xmm2, %xmm2, %k1
; AVX512VLBW-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1}
; AVX512VLBW-NEXT: retq
;
@@ -1516,15 +1487,15 @@ define <4 x i32> @splatvar_funnnel_v4i32
; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512F-NEXT: vpbroadcastd %xmm2, %xmm2
; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512F-NEXT: vpsrld %xmm5, %xmm1, %xmm5
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512F-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512F-NEXT: vpsrld %xmm3, %xmm1, %xmm3
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512F-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX512F-NEXT: vpslld %xmm4, %xmm0, %xmm0
-; AVX512F-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX512F-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512F-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
@@ -1533,16 +1504,15 @@ define <4 x i32> @splatvar_funnnel_v4i32
; AVX512VL-LABEL: splatvar_funnnel_v4i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastd %xmm2, %xmm2
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512VL-NEXT: vpsrld %xmm5, %xmm1, %xmm5
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VL-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2
+; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512VL-NEXT: vpsrld %xmm3, %xmm1, %xmm3
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VL-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX512VL-NEXT: vpslld %xmm4, %xmm0, %xmm0
-; AVX512VL-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX512VL-NEXT: vptestnmd %xmm3, %xmm2, %k1
+; AVX512VL-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX512VL-NEXT: vptestnmd %xmm2, %xmm2, %k1
; AVX512VL-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT: retq
;
@@ -1551,15 +1521,15 @@ define <4 x i32> @splatvar_funnnel_v4i32
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: vpbroadcastd %xmm2, %xmm2
; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512BW-NEXT: vpsrld %xmm5, %xmm1, %xmm5
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512BW-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512BW-NEXT: vpsrld %xmm3, %xmm1, %xmm3
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512BW-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX512BW-NEXT: vpslld %xmm4, %xmm0, %xmm0
-; AVX512BW-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX512BW-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
@@ -1570,15 +1540,15 @@ define <4 x i32> @splatvar_funnnel_v4i32
; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512VBMI2-NEXT: vpbroadcastd %xmm2, %xmm2
; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512VBMI2-NEXT: vpsrld %xmm5, %xmm1, %xmm5
-; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VBMI2-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512VBMI2-NEXT: vpsrld %xmm3, %xmm1, %xmm3
+; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VBMI2-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX512VBMI2-NEXT: vpslld %xmm4, %xmm0, %xmm0
-; AVX512VBMI2-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX512VBMI2-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX512VBMI2-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI2-NEXT: vzeroupper
@@ -1587,16 +1557,15 @@ define <4 x i32> @splatvar_funnnel_v4i32
; AVX512VLBW-LABEL: splatvar_funnnel_v4i32:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %xmm2
-; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
-; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512VLBW-NEXT: vpsrld %xmm5, %xmm1, %xmm5
-; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VLBW-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512VLBW-NEXT: vpsrld %xmm3, %xmm1, %xmm3
+; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VLBW-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX512VLBW-NEXT: vpslld %xmm4, %xmm0, %xmm0
-; AVX512VLBW-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX512VLBW-NEXT: vptestnmd %xmm3, %xmm2, %k1
+; AVX512VLBW-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX512VLBW-NEXT: vptestnmd %xmm2, %xmm2, %k1
; AVX512VLBW-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1}
; AVX512VLBW-NEXT: retq
;
@@ -1779,16 +1748,15 @@ define <8 x i16> @splatvar_funnnel_v8i16
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: vpbroadcastw %xmm2, %xmm2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512BW-NEXT: vpsrlw %xmm5, %xmm1, %xmm5
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512BW-NEXT: vpsubw %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512BW-NEXT: vpsrlw %xmm3, %xmm1, %xmm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT: vpsubw %xmm2, %xmm4, %xmm4
; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
; AVX512BW-NEXT: vpsllw %xmm4, %xmm0, %xmm0
-; AVX512BW-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX512BW-NEXT: vptestnmw %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
@@ -1798,16 +1766,15 @@ define <8 x i16> @splatvar_funnnel_v8i16
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512VBMI2-NEXT: vpbroadcastw %xmm2, %xmm2
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
-; AVX512VBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512VBMI2-NEXT: vpsrlw %xmm5, %xmm1, %xmm5
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512VBMI2-NEXT: vpsubw %xmm4, %xmm6, %xmm4
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512VBMI2-NEXT: vpsrlw %xmm3, %xmm1, %xmm3
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512VBMI2-NEXT: vpsubw %xmm2, %xmm4, %xmm4
; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
; AVX512VBMI2-NEXT: vpsllw %xmm4, %xmm0, %xmm0
-; AVX512VBMI2-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX512VBMI2-NEXT: vptestnmw %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX512VBMI2-NEXT: vptestnmw %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI2-NEXT: vzeroupper
@@ -1816,16 +1783,15 @@ define <8 x i16> @splatvar_funnnel_v8i16
; AVX512VLBW-LABEL: splatvar_funnnel_v8i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %xmm2
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
-; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512VLBW-NEXT: vpsrlw %xmm5, %xmm1, %xmm5
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512VLBW-NEXT: vpsubw %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512VLBW-NEXT: vpsrlw %xmm3, %xmm1, %xmm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT: vpsubw %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
; AVX512VLBW-NEXT: vpsllw %xmm4, %xmm0, %xmm0
-; AVX512VLBW-NEXT: vpor %xmm5, %xmm0, %xmm0
-; AVX512VLBW-NEXT: vptestnmw %xmm3, %xmm2, %k1
+; AVX512VLBW-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX512VLBW-NEXT: vptestnmw %xmm2, %xmm2, %k1
; AVX512VLBW-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1}
; AVX512VLBW-NEXT: retq
;
@@ -2053,18 +2019,17 @@ define <16 x i8> @splatvar_funnnel_v16i8
; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT: vpbroadcastb %xmm2, %xmm2
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512BW-NEXT: vpand %xmm4, %xmm2, %xmm5
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
-; AVX512BW-NEXT: vpsrlvw %zmm6, %zmm3, %zmm3
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm3, %zmm3
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT: vpsubb %xmm5, %xmm6, %xmm5
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
-; AVX512BW-NEXT: vpsllvw %zmm5, %zmm0, %zmm0
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
+; AVX512BW-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
; AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: vptestnmb %zmm4, %zmm2, %k1
+; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
@@ -2075,18 +2040,17 @@ define <16 x i8> @splatvar_funnnel_v16i8
; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %xmm2
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VBMI2-NEXT: vpand %xmm4, %xmm2, %xmm5
-; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
-; AVX512VBMI2-NEXT: vpsrlvw %zmm6, %zmm3, %zmm3
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512VBMI2-NEXT: vpsrlvw %zmm4, %zmm3, %zmm3
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VBMI2-NEXT: vpsubb %xmm5, %xmm6, %xmm5
-; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
-; AVX512VBMI2-NEXT: vpsllvw %zmm5, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
+; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
; AVX512VBMI2-NEXT: vpor %ymm3, %ymm0, %ymm0
; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512VBMI2-NEXT: vptestnmb %zmm4, %zmm2, %k1
+; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI2-NEXT: vzeroupper
@@ -2095,19 +2059,18 @@ define <16 x i8> @splatvar_funnnel_v16i8
; AVX512VLBW-LABEL: splatvar_funnnel_v16i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %xmm2
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
-; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512VLBW-NEXT: vpsrlvw %ymm5, %ymm6, %ymm5
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLBW-NEXT: vpsubb %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512VLBW-NEXT: vpsrlvw %ymm4, %ymm3, %ymm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512VLBW-NEXT: vpsllvw %ymm4, %ymm0, %ymm0
-; AVX512VLBW-NEXT: vpor %ymm5, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vpor %ymm3, %ymm0, %ymm0
; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0
-; AVX512VLBW-NEXT: vptestnmb %xmm3, %xmm2, %k1
+; AVX512VLBW-NEXT: vptestnmb %xmm2, %xmm2, %k1
; AVX512VLBW-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
; AVX512VLBW-NEXT: vzeroupper
; AVX512VLBW-NEXT: retq
@@ -2115,19 +2078,18 @@ define <16 x i8> @splatvar_funnnel_v16i8
; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i8:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %xmm2
-; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLVBMI2-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
-; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512VLVBMI2-NEXT: vpsrlvw %ymm5, %ymm6, %ymm5
-; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLVBMI2-NEXT: vpsubb %xmm4, %xmm6, %xmm4
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
+; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512VLVBMI2-NEXT: vpsrlvw %ymm4, %ymm3, %ymm3
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512VLVBMI2-NEXT: vpsllvw %ymm4, %ymm0, %ymm0
-; AVX512VLVBMI2-NEXT: vpor %ymm5, %ymm0, %ymm0
+; AVX512VLVBMI2-NEXT: vpor %ymm3, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT: vpmovwb %ymm0, %xmm0
-; AVX512VLVBMI2-NEXT: vptestnmb %xmm3, %xmm2, %k1
+; AVX512VLVBMI2-NEXT: vptestnmb %xmm2, %xmm2, %k1
; AVX512VLVBMI2-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
; AVX512VLVBMI2-NEXT: vzeroupper
; AVX512VLVBMI2-NEXT: retq
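[Editorial note: the hunks above all show the same register pattern. Previously the AND with
the shift-amount mask wrote a fresh register and the VPTESTNM combined the original amount
with the mask constant (e.g. vptestnmw %zmm3, %zmm2). With the stricter one-use checks the
AND is no longer folded, so the masked amount overwrites its source register and VPTESTNM
tests it against itself (e.g. vptestnmw %zmm2, %zmm2), computing the same amount-is-zero
predicate with two fewer live registers. For reference, the tests in these files compile
vector funnel-shift intrinsics; a minimal sketch of one such test, assuming the usual shape
of these generated files (exact attributes may differ in the checked-in version), is:

define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) nounwind {
  ; fshr concatenates %x:%y and shifts right by %amt modulo 16, returning the low half.
  %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
  ret <8 x i16> %res
}
declare <8 x i16> @llvm.fshr.v8i16(<8 x i16>, <8 x i16>, <8 x i16>)

fshr returns its second operand unchanged when the masked amount is zero, which is why each
sequence ends with a vptestnm feeding a masked move of that operand.]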
Modified: llvm/trunk/test/CodeGen/X86/vector-fshr-256.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-fshr-256.ll?rev=358359&r1=358358&r2=358359&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-fshr-256.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-fshr-256.ll Sun Apr 14 11:26:11 2019
@@ -71,75 +71,70 @@ define <4 x i64> @var_funnnel_v4i64(<4 x
;
; AVX512F-LABEL: var_funnnel_v4i64:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512F-NEXT: vpsrlvq %ymm4, %ymm1, %ymm5
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm6 = [64,64,64,64]
-; AVX512F-NEXT: vpsubq %ymm4, %ymm6, %ymm4
+; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpsrlvq %ymm2, %ymm1, %ymm3
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm4 = [64,64,64,64]
+; AVX512F-NEXT: vpsubq %ymm2, %ymm4, %ymm4
; AVX512F-NEXT: vpsllvq %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm5, %ymm0, %ymm0
-; AVX512F-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v4i64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VL-NEXT: vpsrlvq %ymm4, %ymm1, %ymm5
-; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm6 = [64,64,64,64]
-; AVX512VL-NEXT: vpsubq %ymm4, %ymm6, %ymm4
+; AVX512VL-NEXT: vpandq {{.*}}(%rip){1to4}, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsrlvq %ymm2, %ymm1, %ymm3
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm4 = [64,64,64,64]
+; AVX512VL-NEXT: vpsubq %ymm2, %ymm4, %ymm4
; AVX512VL-NEXT: vpsllvq %ymm4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm5, %ymm0, %ymm0
-; AVX512VL-NEXT: vptestnmq %ymm3, %ymm2, %k1
+; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vptestnmq %ymm2, %ymm2, %k1
; AVX512VL-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1}
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v4i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512BW-NEXT: vpsrlvq %ymm4, %ymm1, %ymm5
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm6 = [64,64,64,64]
-; AVX512BW-NEXT: vpsubq %ymm4, %ymm6, %ymm4
+; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512BW-NEXT: vpsrlvq %ymm2, %ymm1, %ymm3
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm4 = [64,64,64,64]
+; AVX512BW-NEXT: vpsubq %ymm2, %ymm4, %ymm4
; AVX512BW-NEXT: vpsllvq %ymm4, %ymm0, %ymm0
-; AVX512BW-NEXT: vpor %ymm5, %ymm0, %ymm0
-; AVX512BW-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512BW-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_funnnel_v4i64:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512VBMI2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VBMI2-NEXT: vpsrlvq %ymm4, %ymm1, %ymm5
-; AVX512VBMI2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [64,64,64,64]
-; AVX512VBMI2-NEXT: vpsubq %ymm4, %ymm6, %ymm4
+; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512VBMI2-NEXT: vpsrlvq %ymm2, %ymm1, %ymm3
+; AVX512VBMI2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [64,64,64,64]
+; AVX512VBMI2-NEXT: vpsubq %ymm2, %ymm4, %ymm4
; AVX512VBMI2-NEXT: vpsllvq %ymm4, %ymm0, %ymm0
-; AVX512VBMI2-NEXT: vpor %ymm5, %ymm0, %ymm0
-; AVX512VBMI2-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512VBMI2-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v4i64:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VLBW-NEXT: vpsrlvq %ymm4, %ymm1, %ymm5
-; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} ymm6 = [64,64,64,64]
-; AVX512VLBW-NEXT: vpsubq %ymm4, %ymm6, %ymm4
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip){1to4}, %ymm2, %ymm2
+; AVX512VLBW-NEXT: vpsrlvq %ymm2, %ymm1, %ymm3
+; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} ymm4 = [64,64,64,64]
+; AVX512VLBW-NEXT: vpsubq %ymm2, %ymm4, %ymm4
; AVX512VLBW-NEXT: vpsllvq %ymm4, %ymm0, %ymm0
-; AVX512VLBW-NEXT: vpor %ymm5, %ymm0, %ymm0
-; AVX512VLBW-NEXT: vptestnmq %ymm3, %ymm2, %k1
+; AVX512VLBW-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vptestnmq %ymm2, %ymm2, %k1
; AVX512VLBW-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1}
; AVX512VLBW-NEXT: retq
;
@@ -258,75 +253,70 @@ define <8 x i32> @var_funnnel_v8i32(<8 x
;
; AVX512F-LABEL: var_funnnel_v8i32:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512F-NEXT: vpsrlvd %ymm4, %ymm1, %ymm5
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm6 = [32,32,32,32,32,32,32,32]
-; AVX512F-NEXT: vpsubd %ymm4, %ymm6, %ymm4
+; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpsrlvd %ymm2, %ymm1, %ymm3
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm4 = [32,32,32,32,32,32,32,32]
+; AVX512F-NEXT: vpsubd %ymm2, %ymm4, %ymm4
; AVX512F-NEXT: vpsllvd %ymm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm5, %ymm0, %ymm0
-; AVX512F-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v8i32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VL-NEXT: vpsrlvd %ymm4, %ymm1, %ymm5
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm6 = [32,32,32,32,32,32,32,32]
-; AVX512VL-NEXT: vpsubd %ymm4, %ymm6, %ymm4
+; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to8}, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsrlvd %ymm2, %ymm1, %ymm3
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm4 = [32,32,32,32,32,32,32,32]
+; AVX512VL-NEXT: vpsubd %ymm2, %ymm4, %ymm4
; AVX512VL-NEXT: vpsllvd %ymm4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm5, %ymm0, %ymm0
-; AVX512VL-NEXT: vptestnmd %ymm3, %ymm2, %k1
+; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vptestnmd %ymm2, %ymm2, %k1
; AVX512VL-NEXT: vmovdqa32 %ymm1, %ymm0 {%k1}
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v8i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512BW-NEXT: vpsrlvd %ymm4, %ymm1, %ymm5
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm6 = [32,32,32,32,32,32,32,32]
-; AVX512BW-NEXT: vpsubd %ymm4, %ymm6, %ymm4
+; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512BW-NEXT: vpsrlvd %ymm2, %ymm1, %ymm3
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm4 = [32,32,32,32,32,32,32,32]
+; AVX512BW-NEXT: vpsubd %ymm2, %ymm4, %ymm4
; AVX512BW-NEXT: vpsllvd %ymm4, %ymm0, %ymm0
-; AVX512BW-NEXT: vpor %ymm5, %ymm0, %ymm0
-; AVX512BW-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_funnnel_v8i32:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VBMI2-NEXT: vpsrlvd %ymm4, %ymm1, %ymm5
-; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} ymm6 = [32,32,32,32,32,32,32,32]
-; AVX512VBMI2-NEXT: vpsubd %ymm4, %ymm6, %ymm4
+; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512VBMI2-NEXT: vpsrlvd %ymm2, %ymm1, %ymm3
+; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [32,32,32,32,32,32,32,32]
+; AVX512VBMI2-NEXT: vpsubd %ymm2, %ymm4, %ymm4
; AVX512VBMI2-NEXT: vpsllvd %ymm4, %ymm0, %ymm0
-; AVX512VBMI2-NEXT: vpor %ymm5, %ymm0, %ymm0
-; AVX512VBMI2-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512VBMI2-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v8i32:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VLBW-NEXT: vpsrlvd %ymm4, %ymm1, %ymm5
-; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} ymm6 = [32,32,32,32,32,32,32,32]
-; AVX512VLBW-NEXT: vpsubd %ymm4, %ymm6, %ymm4
+; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to8}, %ymm2, %ymm2
+; AVX512VLBW-NEXT: vpsrlvd %ymm2, %ymm1, %ymm3
+; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} ymm4 = [32,32,32,32,32,32,32,32]
+; AVX512VLBW-NEXT: vpsubd %ymm2, %ymm4, %ymm4
; AVX512VLBW-NEXT: vpsllvd %ymm4, %ymm0, %ymm0
-; AVX512VLBW-NEXT: vpor %ymm5, %ymm0, %ymm0
-; AVX512VLBW-NEXT: vptestnmd %ymm3, %ymm2, %k1
+; AVX512VLBW-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vptestnmd %ymm2, %ymm2, %k1
; AVX512VLBW-NEXT: vmovdqa32 %ymm1, %ymm0 {%k1}
; AVX512VLBW-NEXT: retq
;
@@ -516,48 +506,43 @@ define <16 x i16> @var_funnnel_v16i16(<1
;
; AVX512BW-LABEL: var_funnnel_v16i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm5
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512BW-NEXT: vpsubw %ymm4, %ymm6, %ymm4
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT: vpsubw %ymm2, %ymm4, %ymm4
; AVX512BW-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
-; AVX512BW-NEXT: vpor %ymm5, %ymm0, %ymm0
-; AVX512BW-NEXT: vptestnmw %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_funnnel_v16i16:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VBMI2-NEXT: vpsrlvw %zmm4, %zmm1, %zmm5
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VBMI2-NEXT: vpsubw %ymm4, %ymm6, %ymm4
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VBMI2-NEXT: vpsrlvw %zmm2, %zmm1, %zmm3
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VBMI2-NEXT: vpsubw %ymm2, %ymm4, %ymm4
; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
-; AVX512VBMI2-NEXT: vpor %ymm5, %ymm0, %ymm0
-; AVX512VBMI2-NEXT: vptestnmw %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512VBMI2-NEXT: vptestnmw %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v16i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VLBW-NEXT: vpsrlvw %ymm4, %ymm1, %ymm5
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VLBW-NEXT: vpsubw %ymm4, %ymm6, %ymm4
+; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VLBW-NEXT: vpsrlvw %ymm2, %ymm1, %ymm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT: vpsubw %ymm2, %ymm4, %ymm4
; AVX512VLBW-NEXT: vpsllvw %ymm4, %ymm0, %ymm0
-; AVX512VLBW-NEXT: vpor %ymm5, %ymm0, %ymm0
-; AVX512VLBW-NEXT: vptestnmw %ymm3, %ymm2, %k1
+; AVX512VLBW-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vptestnmw %ymm2, %ymm2, %k1
; AVX512VLBW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1}
; AVX512VLBW-NEXT: retq
;
@@ -796,79 +781,73 @@ define <32 x i8> @var_funnnel_v32i8(<32
;
; AVX512BW-LABEL: var_funnnel_v32i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm6 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512BW-NEXT: vpsrlvw %zmm5, %zmm6, %zmm5
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT: vpsubb %ymm4, %ymm6, %ymm4
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm3, %zmm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT: vpsubb %ymm2, %ymm4, %ymm4
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512BW-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
-; AVX512BW-NEXT: vporq %zmm5, %zmm0, %zmm0
+; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: vptestnmb %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_funnnel_v32i8:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
-; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm6 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512VBMI2-NEXT: vpsrlvw %zmm5, %zmm6, %zmm5
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VBMI2-NEXT: vpsubb %ymm4, %ymm6, %ymm4
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512VBMI2-NEXT: vpsrlvw %zmm4, %zmm3, %zmm3
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VBMI2-NEXT: vpsubb %ymm2, %ymm4, %ymm4
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
-; AVX512VBMI2-NEXT: vporq %zmm5, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vporq %zmm3, %zmm0, %zmm0
; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512VBMI2-NEXT: vptestnmb %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v32i8:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
-; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm6 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512VLBW-NEXT: vpsrlvw %zmm5, %zmm6, %zmm5
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLBW-NEXT: vpsubb %ymm4, %ymm6, %ymm4
+; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512VLBW-NEXT: vpsrlvw %zmm4, %zmm3, %zmm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLBW-NEXT: vpsubb %ymm2, %ymm4, %ymm4
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512VLBW-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vporq %zmm5, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512VLBW-NEXT: vptestnmb %ymm3, %ymm2, %k1
+; AVX512VLBW-NEXT: vptestnmb %ymm2, %ymm2, %k1
; AVX512VLBW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1}
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: var_funnnel_v32i8:
; AVX512VLVBMI2: # %bb.0:
-; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLVBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
-; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm6 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512VLVBMI2-NEXT: vpsrlvw %zmm5, %zmm6, %zmm5
-; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLVBMI2-NEXT: vpsubb %ymm4, %ymm6, %ymm4
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512VLVBMI2-NEXT: vpsrlvw %zmm4, %zmm3, %zmm3
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLVBMI2-NEXT: vpsubb %ymm2, %ymm4, %ymm4
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512VLVBMI2-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
-; AVX512VLVBMI2-NEXT: vporq %zmm5, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vporq %zmm3, %zmm0, %zmm0
; AVX512VLVBMI2-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512VLVBMI2-NEXT: vptestnmb %ymm3, %ymm2, %k1
+; AVX512VLVBMI2-NEXT: vptestnmb %ymm2, %ymm2, %k1
; AVX512VLVBMI2-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1}
; AVX512VLVBMI2-NEXT: retq
;
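The recurring change in the checks above is the heart of the stricter one-use rule: the masked shift amount is now computed in place (vpand ..., %ymm2, %ymm2) and vptestnmb tests that result against itself, rather than keeping the [7,7,...] splat live in %ymm3 to re-test the unmasked amount. The IR behind each of these check blocks is just a funnel-shift intrinsic call; a minimal reconstruction of the shape (not copied verbatim from the test file):

declare <32 x i8> @llvm.fshl.v32i8(<32 x i8>, <32 x i8>, <32 x i8>)

define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt) nounwind {
  ; Lowering reduces %amt modulo 8 and must select %y when the reduced
  ; amount is zero; that zero test is what becomes the vptestnmb above.
  %res = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %amt)
  ret <32 x i8> %res
}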
@@ -974,13 +953,13 @@ define <4 x i64> @splatvar_funnnel_v4i64
; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT: vpbroadcastq %xmm2, %ymm2
; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512F-NEXT: vpsrlq %xmm4, %ymm1, %ymm5
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512F-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpsrlq %xmm2, %ymm1, %ymm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512F-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512F-NEXT: vpsllq %xmm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm5, %ymm0, %ymm0
-; AVX512F-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
@@ -988,14 +967,13 @@ define <4 x i64> @splatvar_funnnel_v4i64
; AVX512VL-LABEL: splatvar_funnnel_v4i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastq %xmm2, %ymm2
-; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VL-NEXT: vpsrlq %xmm4, %ymm1, %ymm5
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VL-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VL-NEXT: vpandq {{.*}}(%rip){1to4}, %ymm2, %ymm2
+; AVX512VL-NEXT: vpsrlq %xmm2, %ymm1, %ymm3
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VL-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512VL-NEXT: vpsllq %xmm4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm5, %ymm0, %ymm0
-; AVX512VL-NEXT: vptestnmq %ymm3, %ymm2, %k1
+; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vptestnmq %ymm2, %ymm2, %k1
; AVX512VL-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1}
; AVX512VL-NEXT: retq
;
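With VLX the [63,63,63,63] splat no longer occupies a register at all: it folds into vpandq as an embedded broadcast, {{.*}}(%rip){1to4}, and vptestnmq then tests the masked amount against itself because the AND result is still needed by the shifts. When the AND instead feeds only the compare, the new custom matcher should be able to fold the broadcast straight into the test; a sketch under that assumption (ptestnm_only_use is a made-up reproducer, not an in-tree test, and I haven't rerun llc on it):

define <4 x i64> @ptestnm_only_use(<4 x i64> %amt) nounwind {
  ; %m has a single use, so with AVX512VL the AND+compare should merge
  ; into one vptestnmq with the 63-splat as a {1to4} memory operand.
  %m = and <4 x i64> %amt, <i64 63, i64 63, i64 63, i64 63>
  %z = icmp eq <4 x i64> %m, zeroinitializer
  %r = sext <4 x i1> %z to <4 x i64>
  ret <4 x i64> %r
}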
@@ -1004,13 +982,13 @@ define <4 x i64> @splatvar_funnnel_v4i64
; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512BW-NEXT: vpbroadcastq %xmm2, %ymm2
; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512BW-NEXT: vpsrlq %xmm4, %ymm1, %ymm5
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512BW-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512BW-NEXT: vpsrlq %xmm2, %ymm1, %ymm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512BW-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512BW-NEXT: vpsllq %xmm4, %ymm0, %ymm0
-; AVX512BW-NEXT: vpor %ymm5, %ymm0, %ymm0
-; AVX512BW-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512BW-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512BW-NEXT: retq
@@ -1020,13 +998,13 @@ define <4 x i64> @splatvar_funnnel_v4i64
; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512VBMI2-NEXT: vpbroadcastq %xmm2, %ymm2
; AVX512VBMI2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VBMI2-NEXT: vpsrlq %xmm4, %ymm1, %ymm5
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VBMI2-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512VBMI2-NEXT: vpsrlq %xmm2, %ymm1, %ymm3
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VBMI2-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512VBMI2-NEXT: vpsllq %xmm4, %ymm0, %ymm0
-; AVX512VBMI2-NEXT: vpor %ymm5, %ymm0, %ymm0
-; AVX512VBMI2-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512VBMI2-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512VBMI2-NEXT: retq
@@ -1034,14 +1012,13 @@ define <4 x i64> @splatvar_funnnel_v4i64
; AVX512VLBW-LABEL: splatvar_funnnel_v4i64:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastq %xmm2, %ymm2
-; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [63,63,63,63]
-; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VLBW-NEXT: vpsrlq %xmm4, %ymm1, %ymm5
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VLBW-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip){1to4}, %ymm2, %ymm2
+; AVX512VLBW-NEXT: vpsrlq %xmm2, %ymm1, %ymm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VLBW-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpsllq %xmm4, %ymm0, %ymm0
-; AVX512VLBW-NEXT: vpor %ymm5, %ymm0, %ymm0
-; AVX512VLBW-NEXT: vptestnmq %ymm3, %ymm2, %k1
+; AVX512VLBW-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vptestnmq %ymm2, %ymm2, %k1
; AVX512VLBW-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1}
; AVX512VLBW-NEXT: retq
;
@@ -1147,15 +1124,15 @@ define <8 x i32> @splatvar_funnnel_v8i32
; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT: vpbroadcastd %xmm2, %ymm2
; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512F-NEXT: vpsrld %xmm5, %ymm1, %ymm5
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512F-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512F-NEXT: vpsrld %xmm3, %ymm1, %ymm3
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512F-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX512F-NEXT: vpslld %xmm4, %ymm0, %ymm0
-; AVX512F-NEXT: vpor %ymm5, %ymm0, %ymm0
-; AVX512F-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
@@ -1163,16 +1140,15 @@ define <8 x i32> @splatvar_funnnel_v8i32
; AVX512VL-LABEL: splatvar_funnnel_v8i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastd %xmm2, %ymm2
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512VL-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512VL-NEXT: vpsrld %xmm5, %ymm1, %ymm5
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VL-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to8}, %ymm2, %ymm2
+; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512VL-NEXT: vpsrld %xmm3, %ymm1, %ymm3
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VL-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX512VL-NEXT: vpslld %xmm4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpor %ymm5, %ymm0, %ymm0
-; AVX512VL-NEXT: vptestnmd %ymm3, %ymm2, %k1
+; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vptestnmd %ymm2, %ymm2, %k1
; AVX512VL-NEXT: vmovdqa32 %ymm1, %ymm0 {%k1}
; AVX512VL-NEXT: retq
;
@@ -1181,15 +1157,15 @@ define <8 x i32> @splatvar_funnnel_v8i32
; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512BW-NEXT: vpbroadcastd %xmm2, %ymm2
; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512BW-NEXT: vpsrld %xmm5, %ymm1, %ymm5
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512BW-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512BW-NEXT: vpsrld %xmm3, %ymm1, %ymm3
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512BW-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX512BW-NEXT: vpslld %xmm4, %ymm0, %ymm0
-; AVX512BW-NEXT: vpor %ymm5, %ymm0, %ymm0
-; AVX512BW-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512BW-NEXT: retq
@@ -1199,15 +1175,15 @@ define <8 x i32> @splatvar_funnnel_v8i32
; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512VBMI2-NEXT: vpbroadcastd %xmm2, %ymm2
; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512VBMI2-NEXT: vpsrld %xmm5, %ymm1, %ymm5
-; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VBMI2-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm2
+; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512VBMI2-NEXT: vpsrld %xmm3, %ymm1, %ymm3
+; AVX512VBMI2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VBMI2-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512VBMI2-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX512VBMI2-NEXT: vpslld %xmm4, %ymm0, %ymm0
-; AVX512VBMI2-NEXT: vpor %ymm5, %ymm0, %ymm0
-; AVX512VBMI2-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512VBMI2-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512VBMI2-NEXT: retq
@@ -1215,16 +1191,15 @@ define <8 x i32> @splatvar_funnnel_v8i32
; AVX512VLBW-LABEL: splatvar_funnnel_v8i32:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %ymm2
-; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31]
-; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512VLBW-NEXT: vpsrld %xmm5, %ymm1, %ymm5
-; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VLBW-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to8}, %ymm2, %ymm2
+; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512VLBW-NEXT: vpsrld %xmm3, %ymm1, %ymm3
+; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VLBW-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX512VLBW-NEXT: vpslld %xmm4, %ymm0, %ymm0
-; AVX512VLBW-NEXT: vpor %ymm5, %ymm0, %ymm0
-; AVX512VLBW-NEXT: vptestnmd %ymm3, %ymm2, %k1
+; AVX512VLBW-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vptestnmd %ymm2, %ymm2, %k1
; AVX512VLBW-NEXT: vmovdqa32 %ymm1, %ymm0 {%k1}
; AVX512VLBW-NEXT: retq
;
@@ -1368,16 +1343,15 @@ define <16 x i16> @splatvar_funnnel_v16i
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512BW-NEXT: vpbroadcastw %xmm2, %ymm2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512BW-NEXT: vpsrlw %xmm5, %ymm1, %ymm5
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512BW-NEXT: vpsubw %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512BW-NEXT: vpsrlw %xmm3, %ymm1, %ymm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT: vpsubw %xmm2, %xmm4, %xmm4
; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
; AVX512BW-NEXT: vpsllw %xmm4, %ymm0, %ymm0
-; AVX512BW-NEXT: vpor %ymm5, %ymm0, %ymm0
-; AVX512BW-NEXT: vptestnmw %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512BW-NEXT: retq
@@ -1386,16 +1360,15 @@ define <16 x i16> @splatvar_funnnel_v16i
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512VBMI2-NEXT: vpbroadcastw %xmm2, %ymm2
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512VBMI2-NEXT: vpsrlw %xmm5, %ymm1, %ymm5
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512VBMI2-NEXT: vpsubw %xmm4, %xmm6, %xmm4
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512VBMI2-NEXT: vpsrlw %xmm3, %ymm1, %ymm3
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512VBMI2-NEXT: vpsubw %xmm2, %xmm4, %xmm4
; AVX512VBMI2-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
; AVX512VBMI2-NEXT: vpsllw %xmm4, %ymm0, %ymm0
-; AVX512VBMI2-NEXT: vpor %ymm5, %ymm0, %ymm0
-; AVX512VBMI2-NEXT: vptestnmw %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512VBMI2-NEXT: vptestnmw %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512VBMI2-NEXT: retq
@@ -1403,16 +1376,15 @@ define <16 x i16> @splatvar_funnnel_v16i
; AVX512VLBW-LABEL: splatvar_funnnel_v16i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %ymm2
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512VLBW-NEXT: vpsrlw %xmm5, %ymm1, %ymm5
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512VLBW-NEXT: vpsubw %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512VLBW-NEXT: vpsrlw %xmm3, %ymm1, %ymm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT: vpsubw %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
; AVX512VLBW-NEXT: vpsllw %xmm4, %ymm0, %ymm0
-; AVX512VLBW-NEXT: vpor %ymm5, %ymm0, %ymm0
-; AVX512VLBW-NEXT: vptestnmw %ymm3, %ymm2, %k1
+; AVX512VLBW-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512VLBW-NEXT: vptestnmw %ymm2, %ymm2, %k1
; AVX512VLBW-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1}
; AVX512VLBW-NEXT: retq
;
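The splatvar variants broadcast one element of the amount to every lane (the vpbroadcastw above), which lets the lowering use the scalar-count shift forms vpsrlw/vpsllw with an xmm operand instead of per-lane variable shifts. A loose reconstruction of the test input:

declare <16 x i16> @llvm.fshl.v16i16(<16 x i16>, <16 x i16>, <16 x i16>)

define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %amt) nounwind {
  ; Splat lane 0 of %amt, then funnel-shift every lane by that amount.
  %splat = shufflevector <16 x i16> %amt, <16 x i16> undef, <16 x i32> zeroinitializer
  %res = call <16 x i16> @llvm.fshl.v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %splat)
  ret <16 x i16> %res
}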
@@ -1590,19 +1562,18 @@ define <32 x i8> @splatvar_funnnel_v32i8
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512BW-NEXT: vpbroadcastb %xmm2, %ymm2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm6 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512BW-NEXT: vpsrlvw %zmm5, %zmm6, %zmm5
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT: vpsubb %ymm4, %ymm6, %ymm4
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm3, %zmm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT: vpsubb %ymm2, %ymm4, %ymm4
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512BW-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
-; AVX512BW-NEXT: vporq %zmm5, %zmm0, %zmm0
+; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: vptestnmb %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512BW-NEXT: retq
@@ -1611,19 +1582,18 @@ define <32 x i8> @splatvar_funnnel_v32i8
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %ymm2
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
-; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm6 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512VBMI2-NEXT: vpsrlvw %zmm5, %zmm6, %zmm5
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VBMI2-NEXT: vpsubb %ymm4, %ymm6, %ymm4
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512VBMI2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512VBMI2-NEXT: vpsrlvw %zmm4, %zmm3, %zmm3
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VBMI2-NEXT: vpsubb %ymm2, %ymm4, %ymm4
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
; AVX512VBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512VBMI2-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
-; AVX512VBMI2-NEXT: vporq %zmm5, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vporq %zmm3, %zmm0, %zmm0
; AVX512VBMI2-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512VBMI2-NEXT: vptestnmb %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512VBMI2-NEXT: retq
@@ -1631,38 +1601,36 @@ define <32 x i8> @splatvar_funnnel_v32i8
; AVX512VLBW-LABEL: splatvar_funnnel_v32i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %ymm2
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLBW-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
-; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm6 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512VLBW-NEXT: vpsrlvw %zmm5, %zmm6, %zmm5
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLBW-NEXT: vpsubb %ymm4, %ymm6, %ymm4
+; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512VLBW-NEXT: vpsrlvw %zmm4, %zmm3, %zmm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLBW-NEXT: vpsubb %ymm2, %ymm4, %ymm4
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
; AVX512VLBW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512VLBW-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vporq %zmm5, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512VLBW-NEXT: vptestnmb %ymm3, %ymm2, %k1
+; AVX512VLBW-NEXT: vptestnmb %ymm2, %ymm2, %k1
; AVX512VLBW-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1}
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatvar_funnnel_v32i8:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %ymm2
-; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLVBMI2-NEXT: vpand %ymm3, %ymm2, %ymm4
-; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm5 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
-; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm6 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512VLVBMI2-NEXT: vpsrlvw %zmm5, %zmm6, %zmm5
-; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLVBMI2-NEXT: vpsubb %ymm4, %ymm6, %ymm4
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm3 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512VLVBMI2-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2
+; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512VLVBMI2-NEXT: vpsrlvw %zmm4, %zmm3, %zmm3
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLVBMI2-NEXT: vpsubb %ymm2, %ymm4, %ymm4
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm4 = ymm4[0],zero,ymm4[1],zero,ymm4[2],zero,ymm4[3],zero,ymm4[4],zero,ymm4[5],zero,ymm4[6],zero,ymm4[7],zero,ymm4[8],zero,ymm4[9],zero,ymm4[10],zero,ymm4[11],zero,ymm4[12],zero,ymm4[13],zero,ymm4[14],zero,ymm4[15],zero,ymm4[16],zero,ymm4[17],zero,ymm4[18],zero,ymm4[19],zero,ymm4[20],zero,ymm4[21],zero,ymm4[22],zero,ymm4[23],zero,ymm4[24],zero,ymm4[25],zero,ymm4[26],zero,ymm4[27],zero,ymm4[28],zero,ymm4[29],zero,ymm4[30],zero,ymm4[31],zero
; AVX512VLVBMI2-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512VLVBMI2-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
-; AVX512VLVBMI2-NEXT: vporq %zmm5, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vporq %zmm3, %zmm0, %zmm0
; AVX512VLVBMI2-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512VLVBMI2-NEXT: vptestnmb %ymm3, %ymm2, %k1
+; AVX512VLVBMI2-NEXT: vptestnmb %ymm2, %ymm2, %k1
; AVX512VLVBMI2-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1}
; AVX512VLVBMI2-NEXT: retq
;
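vector-fshr-512.ll below picks up the same pattern for the right-funnel intrinsics at 512 bits, where the win is purely register pressure (everything is already zmm-width). Per the first hunk's context line, the function under test is the plain intrinsic call; the declare is reconstructed to match:

declare <8 x i64> @llvm.fshr.v8i64(<8 x i64>, <8 x i64>, <8 x i64>)

define <8 x i64> @var_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt) nounwind {
  %res = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt)
  ret <8 x i64> %res
}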
Modified: llvm/trunk/test/CodeGen/X86/vector-fshr-512.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-fshr-512.ll?rev=358359&r1=358358&r2=358359&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-fshr-512.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-fshr-512.ll Sun Apr 14 11:26:11 2019
@@ -18,40 +18,37 @@ declare <64 x i8> @llvm.fshr.v64i8(<64 x
define <8 x i64> @var_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt) nounwind {
; AVX512F-LABEL: var_funnnel_v8i64:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
-; AVX512F-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512F-NEXT: vpsrlvq %zmm4, %zmm1, %zmm5
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm6 = [64,64,64,64,64,64,64,64]
-; AVX512F-NEXT: vpsubq %zmm4, %zmm6, %zmm4
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlvq %zmm2, %zmm1, %zmm3
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm4 = [64,64,64,64,64,64,64,64]
+; AVX512F-NEXT: vpsubq %zmm2, %zmm4, %zmm4
; AVX512F-NEXT: vpsllvq %zmm4, %zmm0, %zmm0
-; AVX512F-NEXT: vporq %zmm5, %zmm0, %zmm0
-; AVX512F-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512F-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512F-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v8i64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
-; AVX512VL-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512VL-NEXT: vpsrlvq %zmm4, %zmm1, %zmm5
-; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm6 = [64,64,64,64,64,64,64,64]
-; AVX512VL-NEXT: vpsubq %zmm4, %zmm6, %zmm4
+; AVX512VL-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512VL-NEXT: vpsrlvq %zmm2, %zmm1, %zmm3
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm4 = [64,64,64,64,64,64,64,64]
+; AVX512VL-NEXT: vpsubq %zmm2, %zmm4, %zmm4
; AVX512VL-NEXT: vpsllvq %zmm4, %zmm0, %zmm0
-; AVX512VL-NEXT: vporq %zmm5, %zmm0, %zmm0
-; AVX512VL-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512VL-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512VL-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512VL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v8i64:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
-; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512BW-NEXT: vpsrlvq %zmm4, %zmm1, %zmm5
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm6 = [64,64,64,64,64,64,64,64]
-; AVX512BW-NEXT: vpsubq %zmm4, %zmm6, %zmm4
+; AVX512BW-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512BW-NEXT: vpsrlvq %zmm2, %zmm1, %zmm3
+; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm4 = [64,64,64,64,64,64,64,64]
+; AVX512BW-NEXT: vpsubq %zmm2, %zmm4, %zmm4
; AVX512BW-NEXT: vpsllvq %zmm4, %zmm0, %zmm0
-; AVX512BW-NEXT: vporq %zmm5, %zmm0, %zmm0
-; AVX512BW-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: retq
;
@@ -63,14 +60,13 @@ define <8 x i64> @var_funnnel_v8i64(<8 x
;
; AVX512VLBW-LABEL: var_funnnel_v8i64:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
-; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512VLBW-NEXT: vpsrlvq %zmm4, %zmm1, %zmm5
-; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} zmm6 = [64,64,64,64,64,64,64,64]
-; AVX512VLBW-NEXT: vpsubq %zmm4, %zmm6, %zmm4
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512VLBW-NEXT: vpsrlvq %zmm2, %zmm1, %zmm3
+; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} zmm4 = [64,64,64,64,64,64,64,64]
+; AVX512VLBW-NEXT: vpsubq %zmm2, %zmm4, %zmm4
; AVX512VLBW-NEXT: vpsllvq %zmm4, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vporq %zmm5, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512VLBW-NEXT: retq
;
@@ -86,40 +82,37 @@ define <8 x i64> @var_funnnel_v8i64(<8 x
define <16 x i32> @var_funnnel_v16i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> %amt) nounwind {
; AVX512F-LABEL: var_funnnel_v16i32:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
-; AVX512F-NEXT: vpandd %zmm3, %zmm2, %zmm4
-; AVX512F-NEXT: vpsrlvd %zmm4, %zmm1, %zmm5
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm6 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX512F-NEXT: vpsubd %zmm4, %zmm6, %zmm4
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlvd %zmm2, %zmm1, %zmm3
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm4 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512F-NEXT: vpsubd %zmm2, %zmm4, %zmm4
; AVX512F-NEXT: vpsllvd %zmm4, %zmm0, %zmm0
-; AVX512F-NEXT: vpord %zmm5, %zmm0, %zmm0
-; AVX512F-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512F-NEXT: vpord %zmm3, %zmm0, %zmm0
+; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: var_funnnel_v16i32:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
-; AVX512VL-NEXT: vpandd %zmm3, %zmm2, %zmm4
-; AVX512VL-NEXT: vpsrlvd %zmm4, %zmm1, %zmm5
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} zmm6 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX512VL-NEXT: vpsubd %zmm4, %zmm6, %zmm4
+; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; AVX512VL-NEXT: vpsrlvd %zmm2, %zmm1, %zmm3
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} zmm4 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512VL-NEXT: vpsubd %zmm2, %zmm4, %zmm4
; AVX512VL-NEXT: vpsllvd %zmm4, %zmm0, %zmm0
-; AVX512VL-NEXT: vpord %zmm5, %zmm0, %zmm0
-; AVX512VL-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512VL-NEXT: vpord %zmm3, %zmm0, %zmm0
+; AVX512VL-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512VL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: var_funnnel_v16i32:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
-; AVX512BW-NEXT: vpandd %zmm3, %zmm2, %zmm4
-; AVX512BW-NEXT: vpsrlvd %zmm4, %zmm1, %zmm5
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm6 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX512BW-NEXT: vpsubd %zmm4, %zmm6, %zmm4
+; AVX512BW-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; AVX512BW-NEXT: vpsrlvd %zmm2, %zmm1, %zmm3
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm4 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512BW-NEXT: vpsubd %zmm2, %zmm4, %zmm4
; AVX512BW-NEXT: vpsllvd %zmm4, %zmm0, %zmm0
-; AVX512BW-NEXT: vpord %zmm5, %zmm0, %zmm0
-; AVX512BW-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vpord %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: retq
;
@@ -131,14 +124,13 @@ define <16 x i32> @var_funnnel_v16i32(<1
;
; AVX512VLBW-LABEL: var_funnnel_v16i32:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
-; AVX512VLBW-NEXT: vpandd %zmm3, %zmm2, %zmm4
-; AVX512VLBW-NEXT: vpsrlvd %zmm4, %zmm1, %zmm5
-; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} zmm6 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX512VLBW-NEXT: vpsubd %zmm4, %zmm6, %zmm4
+; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; AVX512VLBW-NEXT: vpsrlvd %zmm2, %zmm1, %zmm3
+; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} zmm4 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512VLBW-NEXT: vpsubd %zmm2, %zmm4, %zmm4
; AVX512VLBW-NEXT: vpsllvd %zmm4, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vpord %zmm5, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512VLBW-NEXT: vpord %zmm3, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512VLBW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512VLBW-NEXT: retq
;
@@ -216,14 +208,13 @@ define <32 x i16> @var_funnnel_v32i16(<3
;
; AVX512BW-LABEL: var_funnnel_v32i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm5
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512BW-NEXT: vpsubw %zmm4, %zmm6, %zmm4
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm3
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT: vpsubw %zmm2, %zmm4, %zmm4
; AVX512BW-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
-; AVX512BW-NEXT: vporq %zmm5, %zmm0, %zmm0
-; AVX512BW-NEXT: vptestnmw %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: retq
;
@@ -235,14 +226,13 @@ define <32 x i16> @var_funnnel_v32i16(<3
;
; AVX512VLBW-LABEL: var_funnnel_v32i16:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512VLBW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm5
-; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VLBW-NEXT: vpsubw %zmm4, %zmm6, %zmm4
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLBW-NEXT: vpsrlvw %zmm2, %zmm1, %zmm3
+; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT: vpsubw %zmm2, %zmm4, %zmm4
; AVX512VLBW-NEXT: vpsllvw %zmm4, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vporq %zmm5, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vptestnmw %zmm3, %zmm2, %k1
+; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vptestnmw %zmm2, %zmm2, %k1
; AVX512VLBW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
; AVX512VLBW-NEXT: retq
;
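Note that the vXi16 cases keep a full 64-byte constant load (vpandq {{.*}}(%rip), %zmm2) instead of an embedded broadcast: EVEX {1toN} broadcasts only exist for 32- and 64-bit elements. The zero test still collapses to testing the masked amount against itself. The guard, written out as IR (my paraphrase of the DAG shape, not the test input; guard_v32i16 is a made-up name):

define <32 x i16> @guard_v32i16(<32 x i16> %amt, <32 x i16> %y, <32 x i16> %shifted) nounwind {
  ; No 16-bit embedded broadcast exists, so the 15-splat is a full
  ; constant-pool vector; here the AND's only user is the compare, so it
  ; should fold into vptestnmw as a (full, non-broadcast) memory operand.
  %m = and <32 x i16> %amt, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %z = icmp eq <32 x i16> %m, zeroinitializer
  %r = select <32 x i1> %z, <32 x i16> %y, <32 x i16> %shifted
  ret <32 x i16> %r
}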
@@ -394,28 +384,27 @@ define <64 x i8> @var_funnnel_v64i8(<64
;
; AVX512BW-LABEL: var_funnnel_v64i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512BW-NEXT: vpsllw $5, %zmm4, %zmm5
-; AVX512BW-NEXT: vpaddb %zmm5, %zmm5, %zmm6
-; AVX512BW-NEXT: vpmovb2m %zmm6, %k1
-; AVX512BW-NEXT: vpmovb2m %zmm5, %k2
-; AVX512BW-NEXT: vpsrlw $4, %zmm1, %zmm5
+; AVX512BW-NEXT: vpsrlw $4, %zmm1, %zmm3
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT: vpsllw $5, %zmm2, %zmm4
+; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT: vpblendmb %zmm3, %zmm1, %zmm3 {%k1}
+; AVX512BW-NEXT: vpsrlw $2, %zmm3, %zmm5
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
+; AVX512BW-NEXT: vpaddb %zmm4, %zmm4, %zmm4
+; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
+; AVX512BW-NEXT: vpsrlw $1, %zmm3, %zmm5
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
-; AVX512BW-NEXT: vpblendmb %zmm5, %zmm1, %zmm5 {%k2}
-; AVX512BW-NEXT: vpsrlw $2, %zmm5, %zmm7
-; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm7, %zmm7
-; AVX512BW-NEXT: vmovdqu8 %zmm7, %zmm5 {%k1}
-; AVX512BW-NEXT: vpsrlw $1, %zmm5, %zmm7
-; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm7, %zmm7
-; AVX512BW-NEXT: vpaddb %zmm6, %zmm6, %zmm6
-; AVX512BW-NEXT: vpmovb2m %zmm6, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm7, %zmm5 {%k1}
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT: vpsubb %zmm4, %zmm6, %zmm4
+; AVX512BW-NEXT: vpaddb %zmm4, %zmm4, %zmm4
+; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512BW-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT: vpsubb %zmm2, %zmm4, %zmm4
; AVX512BW-NEXT: vpsllw $5, %zmm4, %zmm4
-; AVX512BW-NEXT: vpaddb %zmm4, %zmm4, %zmm6
-; AVX512BW-NEXT: vpmovb2m %zmm6, %k1
+; AVX512BW-NEXT: vpaddb %zmm4, %zmm4, %zmm5
+; AVX512BW-NEXT: vpmovb2m %zmm5, %k1
; AVX512BW-NEXT: vpmovb2m %zmm4, %k2
; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm4
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
@@ -423,38 +412,37 @@ define <64 x i8> @var_funnnel_v64i8(<64
; AVX512BW-NEXT: vpsllw $2, %zmm0, %zmm4
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
; AVX512BW-NEXT: vmovdqu8 %zmm4, %zmm0 {%k1}
-; AVX512BW-NEXT: vpaddb %zmm6, %zmm6, %zmm4
+; AVX512BW-NEXT: vpaddb %zmm5, %zmm5, %zmm4
; AVX512BW-NEXT: vpmovb2m %zmm4, %k1
; AVX512BW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
-; AVX512BW-NEXT: vporq %zmm5, %zmm0, %zmm0
-; AVX512BW-NEXT: vptestnmb %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: var_funnnel_v64i8:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VBMI2-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512VBMI2-NEXT: vpsllw $5, %zmm4, %zmm5
-; AVX512VBMI2-NEXT: vpaddb %zmm5, %zmm5, %zmm6
-; AVX512VBMI2-NEXT: vpmovb2m %zmm6, %k1
-; AVX512VBMI2-NEXT: vpmovb2m %zmm5, %k2
-; AVX512VBMI2-NEXT: vpsrlw $4, %zmm1, %zmm5
+; AVX512VBMI2-NEXT: vpsrlw $4, %zmm1, %zmm3
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VBMI2-NEXT: vpsllw $5, %zmm2, %zmm4
+; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VBMI2-NEXT: vpblendmb %zmm3, %zmm1, %zmm3 {%k1}
+; AVX512VBMI2-NEXT: vpsrlw $2, %zmm3, %zmm5
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
+; AVX512VBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4
+; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VBMI2-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
+; AVX512VBMI2-NEXT: vpsrlw $1, %zmm3, %zmm5
; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
-; AVX512VBMI2-NEXT: vpblendmb %zmm5, %zmm1, %zmm5 {%k2}
-; AVX512VBMI2-NEXT: vpsrlw $2, %zmm5, %zmm7
-; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm7, %zmm7
-; AVX512VBMI2-NEXT: vmovdqu8 %zmm7, %zmm5 {%k1}
-; AVX512VBMI2-NEXT: vpsrlw $1, %zmm5, %zmm7
-; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm7, %zmm7
-; AVX512VBMI2-NEXT: vpaddb %zmm6, %zmm6, %zmm6
-; AVX512VBMI2-NEXT: vpmovb2m %zmm6, %k1
-; AVX512VBMI2-NEXT: vmovdqu8 %zmm7, %zmm5 {%k1}
-; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VBMI2-NEXT: vpsubb %zmm4, %zmm6, %zmm4
+; AVX512VBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4
+; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VBMI2-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
+; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VBMI2-NEXT: vpsubb %zmm2, %zmm4, %zmm4
; AVX512VBMI2-NEXT: vpsllw $5, %zmm4, %zmm4
-; AVX512VBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm6
-; AVX512VBMI2-NEXT: vpmovb2m %zmm6, %k1
+; AVX512VBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm5
+; AVX512VBMI2-NEXT: vpmovb2m %zmm5, %k1
; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k2
; AVX512VBMI2-NEXT: vpsllw $4, %zmm0, %zmm4
; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
@@ -462,38 +450,37 @@ define <64 x i8> @var_funnnel_v64i8(<64
; AVX512VBMI2-NEXT: vpsllw $2, %zmm0, %zmm4
; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
; AVX512VBMI2-NEXT: vmovdqu8 %zmm4, %zmm0 {%k1}
-; AVX512VBMI2-NEXT: vpaddb %zmm6, %zmm6, %zmm4
+; AVX512VBMI2-NEXT: vpaddb %zmm5, %zmm5, %zmm4
; AVX512VBMI2-NEXT: vpmovb2m %zmm4, %k1
; AVX512VBMI2-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
-; AVX512VBMI2-NEXT: vporq %zmm5, %zmm0, %zmm0
-; AVX512VBMI2-NEXT: vptestnmb %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: var_funnnel_v64i8:
; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512VLBW-NEXT: vpsllw $5, %zmm4, %zmm5
-; AVX512VLBW-NEXT: vpaddb %zmm5, %zmm5, %zmm6
-; AVX512VLBW-NEXT: vpmovb2m %zmm6, %k1
-; AVX512VLBW-NEXT: vpmovb2m %zmm5, %k2
-; AVX512VLBW-NEXT: vpsrlw $4, %zmm1, %zmm5
+; AVX512VLBW-NEXT: vpsrlw $4, %zmm1, %zmm3
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLBW-NEXT: vpsllw $5, %zmm2, %zmm4
+; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VLBW-NEXT: vpblendmb %zmm3, %zmm1, %zmm3 {%k1}
+; AVX512VLBW-NEXT: vpsrlw $2, %zmm3, %zmm5
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
+; AVX512VLBW-NEXT: vpaddb %zmm4, %zmm4, %zmm4
+; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VLBW-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
+; AVX512VLBW-NEXT: vpsrlw $1, %zmm3, %zmm5
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
-; AVX512VLBW-NEXT: vpblendmb %zmm5, %zmm1, %zmm5 {%k2}
-; AVX512VLBW-NEXT: vpsrlw $2, %zmm5, %zmm7
-; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm7, %zmm7
-; AVX512VLBW-NEXT: vmovdqu8 %zmm7, %zmm5 {%k1}
-; AVX512VLBW-NEXT: vpsrlw $1, %zmm5, %zmm7
-; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm7, %zmm7
-; AVX512VLBW-NEXT: vpaddb %zmm6, %zmm6, %zmm6
-; AVX512VLBW-NEXT: vpmovb2m %zmm6, %k1
-; AVX512VLBW-NEXT: vmovdqu8 %zmm7, %zmm5 {%k1}
-; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLBW-NEXT: vpsubb %zmm4, %zmm6, %zmm4
+; AVX512VLBW-NEXT: vpaddb %zmm4, %zmm4, %zmm4
+; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VLBW-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
+; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLBW-NEXT: vpsubb %zmm2, %zmm4, %zmm4
; AVX512VLBW-NEXT: vpsllw $5, %zmm4, %zmm4
-; AVX512VLBW-NEXT: vpaddb %zmm4, %zmm4, %zmm6
-; AVX512VLBW-NEXT: vpmovb2m %zmm6, %k1
+; AVX512VLBW-NEXT: vpaddb %zmm4, %zmm4, %zmm5
+; AVX512VLBW-NEXT: vpmovb2m %zmm5, %k1
; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k2
; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm4
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
@@ -501,38 +488,37 @@ define <64 x i8> @var_funnnel_v64i8(<64
; AVX512VLBW-NEXT: vpsllw $2, %zmm0, %zmm4
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
; AVX512VLBW-NEXT: vmovdqu8 %zmm4, %zmm0 {%k1}
-; AVX512VLBW-NEXT: vpaddb %zmm6, %zmm6, %zmm4
+; AVX512VLBW-NEXT: vpaddb %zmm5, %zmm5, %zmm4
; AVX512VLBW-NEXT: vpmovb2m %zmm4, %k1
; AVX512VLBW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
-; AVX512VLBW-NEXT: vporq %zmm5, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vptestnmb %zmm3, %zmm2, %k1
+; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: var_funnnel_v64i8:
; AVX512VLVBMI2: # %bb.0:
-; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLVBMI2-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512VLVBMI2-NEXT: vpsllw $5, %zmm4, %zmm5
-; AVX512VLVBMI2-NEXT: vpaddb %zmm5, %zmm5, %zmm6
-; AVX512VLVBMI2-NEXT: vpmovb2m %zmm6, %k1
-; AVX512VLVBMI2-NEXT: vpmovb2m %zmm5, %k2
-; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm1, %zmm5
+; AVX512VLVBMI2-NEXT: vpsrlw $4, %zmm1, %zmm3
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLVBMI2-NEXT: vpsllw $5, %zmm2, %zmm4
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VLVBMI2-NEXT: vpblendmb %zmm3, %zmm1, %zmm3 {%k1}
+; AVX512VLVBMI2-NEXT: vpsrlw $2, %zmm3, %zmm5
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
+; AVX512VLVBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
+; AVX512VLVBMI2-NEXT: vpsrlw $1, %zmm3, %zmm5
; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm5, %zmm5
-; AVX512VLVBMI2-NEXT: vpblendmb %zmm5, %zmm1, %zmm5 {%k2}
-; AVX512VLVBMI2-NEXT: vpsrlw $2, %zmm5, %zmm7
-; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm7, %zmm7
-; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm7, %zmm5 {%k1}
-; AVX512VLVBMI2-NEXT: vpsrlw $1, %zmm5, %zmm7
-; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm7, %zmm7
-; AVX512VLVBMI2-NEXT: vpaddb %zmm6, %zmm6, %zmm6
-; AVX512VLVBMI2-NEXT: vpmovb2m %zmm6, %k1
-; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm7, %zmm5 {%k1}
-; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLVBMI2-NEXT: vpsubb %zmm4, %zmm6, %zmm4
+; AVX512VLVBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm4
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1
+; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm5, %zmm3 {%k1}
+; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLVBMI2-NEXT: vpsubb %zmm2, %zmm4, %zmm4
; AVX512VLVBMI2-NEXT: vpsllw $5, %zmm4, %zmm4
-; AVX512VLVBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm6
-; AVX512VLVBMI2-NEXT: vpmovb2m %zmm6, %k1
+; AVX512VLVBMI2-NEXT: vpaddb %zmm4, %zmm4, %zmm5
+; AVX512VLVBMI2-NEXT: vpmovb2m %zmm5, %k1
; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k2
; AVX512VLVBMI2-NEXT: vpsllw $4, %zmm0, %zmm4
; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
@@ -540,11 +526,11 @@ define <64 x i8> @var_funnnel_v64i8(<64
; AVX512VLVBMI2-NEXT: vpsllw $2, %zmm0, %zmm4
; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm4, %zmm4
; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm4, %zmm0 {%k1}
-; AVX512VLVBMI2-NEXT: vpaddb %zmm6, %zmm6, %zmm4
+; AVX512VLVBMI2-NEXT: vpaddb %zmm5, %zmm5, %zmm4
; AVX512VLVBMI2-NEXT: vpmovb2m %zmm4, %k1
; AVX512VLVBMI2-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1}
-; AVX512VLVBMI2-NEXT: vporq %zmm5, %zmm0, %zmm0
-; AVX512VLVBMI2-NEXT: vptestnmb %zmm3, %zmm2, %k1
+; AVX512VLVBMI2-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512VLVBMI2-NEXT: retq
%res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> %amt)
@@ -559,42 +545,39 @@ define <8 x i64> @splatvar_funnnel_v8i64
; AVX512F-LABEL: splatvar_funnnel_v8i64:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastq %xmm2, %zmm2
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
-; AVX512F-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512F-NEXT: vpsrlq %xmm4, %zmm1, %zmm5
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512F-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq %xmm2, %zmm1, %zmm3
+; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512F-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512F-NEXT: vpsllq %xmm4, %zmm0, %zmm0
-; AVX512F-NEXT: vporq %zmm5, %zmm0, %zmm0
-; AVX512F-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512F-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512F-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v8i64:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastq %xmm2, %zmm2
-; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
-; AVX512VL-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512VL-NEXT: vpsrlq %xmm4, %zmm1, %zmm5
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VL-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VL-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512VL-NEXT: vpsrlq %xmm2, %zmm1, %zmm3
+; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VL-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512VL-NEXT: vpsllq %xmm4, %zmm0, %zmm0
-; AVX512VL-NEXT: vporq %zmm5, %zmm0, %zmm0
-; AVX512VL-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512VL-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512VL-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512VL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v8i64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastq %xmm2, %zmm2
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
-; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512BW-NEXT: vpsrlq %xmm4, %zmm1, %zmm5
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512BW-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512BW-NEXT: vpsrlq %xmm2, %zmm1, %zmm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512BW-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512BW-NEXT: vpsllq %xmm4, %zmm0, %zmm0
-; AVX512BW-NEXT: vporq %zmm5, %zmm0, %zmm0
-; AVX512BW-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: retq
;
@@ -608,14 +591,13 @@ define <8 x i64> @splatvar_funnnel_v8i64
; AVX512VLBW-LABEL: splatvar_funnnel_v8i64:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastq %xmm2, %zmm2
-; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} zmm3 = [63,63,63,63,63,63,63,63]
-; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512VLBW-NEXT: vpsrlq %xmm4, %zmm1, %zmm5
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [64,64]
-; AVX512VLBW-NEXT: vpsubq %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2
+; AVX512VLBW-NEXT: vpsrlq %xmm2, %zmm1, %zmm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [64,64]
+; AVX512VLBW-NEXT: vpsubq %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpsllq %xmm4, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vporq %zmm5, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vptestnmq %zmm3, %zmm2, %k1
+; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vptestnmq %zmm2, %zmm2, %k1
; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512VLBW-NEXT: retq
;
@@ -634,48 +616,45 @@ define <16 x i32> @splatvar_funnnel_v16i
; AVX512F-LABEL: splatvar_funnnel_v16i32:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpbroadcastd %xmm2, %zmm2
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
-; AVX512F-NEXT: vpandd %zmm3, %zmm2, %zmm4
-; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512F-NEXT: vpsrld %xmm5, %zmm1, %zmm5
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512F-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512F-NEXT: vpsrld %xmm3, %zmm1, %zmm3
+; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512F-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512F-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX512F-NEXT: vpslld %xmm4, %zmm0, %zmm0
-; AVX512F-NEXT: vpord %zmm5, %zmm0, %zmm0
-; AVX512F-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512F-NEXT: vpord %zmm3, %zmm0, %zmm0
+; AVX512F-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: splatvar_funnnel_v16i32:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vpbroadcastd %xmm2, %zmm2
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
-; AVX512VL-NEXT: vpandd %zmm3, %zmm2, %zmm4
-; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512VL-NEXT: vpsrld %xmm5, %zmm1, %zmm5
-; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VL-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VL-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512VL-NEXT: vpsrld %xmm3, %zmm1, %zmm3
+; AVX512VL-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VL-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX512VL-NEXT: vpslld %xmm4, %zmm0, %zmm0
-; AVX512VL-NEXT: vpord %zmm5, %zmm0, %zmm0
-; AVX512VL-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512VL-NEXT: vpord %zmm3, %zmm0, %zmm0
+; AVX512VL-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512VL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: splatvar_funnnel_v16i32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastd %xmm2, %zmm2
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
-; AVX512BW-NEXT: vpandd %zmm3, %zmm2, %zmm4
-; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512BW-NEXT: vpsrld %xmm5, %zmm1, %zmm5
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512BW-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512BW-NEXT: vpsrld %xmm3, %zmm1, %zmm3
+; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512BW-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512BW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX512BW-NEXT: vpslld %xmm4, %zmm0, %zmm0
-; AVX512BW-NEXT: vpord %zmm5, %zmm0, %zmm0
-; AVX512BW-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vpord %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: retq
;
@@ -689,16 +668,15 @@ define <16 x i32> @splatvar_funnnel_v16i
; AVX512VLBW-LABEL: splatvar_funnnel_v16i32:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastd %xmm2, %zmm2
-; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} zmm3 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
-; AVX512VLBW-NEXT: vpandd %zmm3, %zmm2, %zmm4
-; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero
-; AVX512VLBW-NEXT: vpsrld %xmm5, %zmm1, %zmm5
-; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm6 = [32,32,32,32]
-; AVX512VLBW-NEXT: vpsubd %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2
+; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero
+; AVX512VLBW-NEXT: vpsrld %xmm3, %zmm1, %zmm3
+; AVX512VLBW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
+; AVX512VLBW-NEXT: vpsubd %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX512VLBW-NEXT: vpslld %xmm4, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vpord %zmm5, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vptestnmd %zmm3, %zmm2, %k1
+; AVX512VLBW-NEXT: vpord %zmm3, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vptestnmd %zmm2, %zmm2, %k1
; AVX512VLBW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512VLBW-NEXT: retq
;
@@ -757,16 +735,15 @@ define <32 x i16> @splatvar_funnnel_v32i
; AVX512BW-LABEL: splatvar_funnnel_v32i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastw %xmm2, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512BW-NEXT: vpsrlw %xmm5, %zmm1, %zmm5
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512BW-NEXT: vpsubw %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512BW-NEXT: vpsrlw %xmm3, %zmm1, %zmm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT: vpsubw %xmm2, %xmm4, %xmm4
; AVX512BW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
; AVX512BW-NEXT: vpsllw %xmm4, %zmm0, %zmm0
-; AVX512BW-NEXT: vporq %zmm5, %zmm0, %zmm0
-; AVX512BW-NEXT: vptestnmw %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT: vptestnmw %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: retq
;
@@ -780,16 +757,15 @@ define <32 x i16> @splatvar_funnnel_v32i
; AVX512VLBW-LABEL: splatvar_funnnel_v32i16:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastw %xmm2, %zmm2
-; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
-; AVX512VLBW-NEXT: vpsrlw %xmm5, %zmm1, %zmm5
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [16,16,16,16,16,16,16,16]
-; AVX512VLBW-NEXT: vpsubw %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
+; AVX512VLBW-NEXT: vpsrlw %xmm3, %zmm1, %zmm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT: vpsubw %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero
; AVX512VLBW-NEXT: vpsllw %xmm4, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vporq %zmm5, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vptestnmw %zmm3, %zmm2, %k1
+; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vptestnmw %zmm2, %zmm2, %k1
; AVX512VLBW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
; AVX512VLBW-NEXT: retq
;
@@ -868,96 +844,92 @@ define <64 x i8> @splatvar_funnnel_v64i8
; AVX512BW-LABEL: splatvar_funnnel_v64i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpbroadcastb %xmm2, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT: vpsrlw %xmm5, %zmm1, %zmm6
-; AVX512BW-NEXT: vpternlogd $255, %zmm7, %zmm7, %zmm7
-; AVX512BW-NEXT: vpsrlw %xmm5, %zmm7, %zmm5
-; AVX512BW-NEXT: vpsrlw $8, %zmm5, %zmm5
-; AVX512BW-NEXT: vpbroadcastb %xmm5, %zmm5
-; AVX512BW-NEXT: vpandq %zmm5, %zmm6, %zmm5
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512BW-NEXT: vpsubb %xmm4, %xmm6, %xmm4
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpsrlw %xmm3, %zmm1, %zmm4
+; AVX512BW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5
+; AVX512BW-NEXT: vpsrlw %xmm3, %zmm5, %zmm3
+; AVX512BW-NEXT: vpsrlw $8, %zmm3, %zmm3
+; AVX512BW-NEXT: vpbroadcastb %xmm3, %zmm3
+; AVX512BW-NEXT: vpandq %zmm3, %zmm4, %zmm3
+; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT: vpsllw %xmm4, %zmm0, %zmm0
-; AVX512BW-NEXT: vpsllw %xmm4, %zmm7, %zmm4
+; AVX512BW-NEXT: vpsllw %xmm4, %zmm5, %zmm4
; AVX512BW-NEXT: vpbroadcastb %xmm4, %zmm4
; AVX512BW-NEXT: vpandq %zmm4, %zmm0, %zmm0
-; AVX512BW-NEXT: vporq %zmm5, %zmm0, %zmm0
-; AVX512BW-NEXT: vptestnmb %zmm3, %zmm2, %k1
+; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512BW-NEXT: retq
;
; AVX512VBMI2-LABEL: splatvar_funnnel_v64i8:
; AVX512VBMI2: # %bb.0:
; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %zmm2
-; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VBMI2-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VBMI2-NEXT: vpsrlw %xmm5, %zmm1, %zmm6
-; AVX512VBMI2-NEXT: vpternlogd $255, %zmm7, %zmm7, %zmm7
-; AVX512VBMI2-NEXT: vpsrlw %xmm5, %zmm7, %zmm5
-; AVX512VBMI2-NEXT: vpsrlw $8, %zmm5, %zmm5
-; AVX512VBMI2-NEXT: vpbroadcastb %xmm5, %zmm5
-; AVX512VBMI2-NEXT: vpandq %zmm5, %zmm6, %zmm5
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VBMI2-NEXT: vpsubb %xmm4, %xmm6, %xmm4
+; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VBMI2-NEXT: vpsrlw %xmm3, %zmm1, %zmm4
+; AVX512VBMI2-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5
+; AVX512VBMI2-NEXT: vpsrlw %xmm3, %zmm5, %zmm3
+; AVX512VBMI2-NEXT: vpsrlw $8, %zmm3, %zmm3
+; AVX512VBMI2-NEXT: vpbroadcastb %xmm3, %zmm3
+; AVX512VBMI2-NEXT: vpandq %zmm3, %zmm4, %zmm3
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VBMI2-NEXT: vpsllw %xmm4, %zmm0, %zmm0
-; AVX512VBMI2-NEXT: vpsllw %xmm4, %zmm7, %zmm4
+; AVX512VBMI2-NEXT: vpsllw %xmm4, %zmm5, %zmm4
; AVX512VBMI2-NEXT: vpbroadcastb %xmm4, %zmm4
; AVX512VBMI2-NEXT: vpandq %zmm4, %zmm0, %zmm0
-; AVX512VBMI2-NEXT: vporq %zmm5, %zmm0, %zmm0
-; AVX512VBMI2-NEXT: vptestnmb %zmm3, %zmm2, %k1
+; AVX512VBMI2-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512VBMI2-NEXT: retq
;
; AVX512VLBW-LABEL: splatvar_funnnel_v64i8:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %zmm2
-; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLBW-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VLBW-NEXT: vpsrlw %xmm5, %zmm1, %zmm6
-; AVX512VLBW-NEXT: vpternlogd $255, %zmm7, %zmm7, %zmm7
-; AVX512VLBW-NEXT: vpsrlw %xmm5, %zmm7, %zmm5
-; AVX512VLBW-NEXT: vpsrlw $8, %zmm5, %zmm5
-; AVX512VLBW-NEXT: vpbroadcastb %xmm5, %zmm5
-; AVX512VLBW-NEXT: vpandq %zmm5, %zmm6, %zmm5
-; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLBW-NEXT: vpsubb %xmm4, %xmm6, %xmm4
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLBW-NEXT: vpsrlw %xmm3, %zmm1, %zmm4
+; AVX512VLBW-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5
+; AVX512VLBW-NEXT: vpsrlw %xmm3, %zmm5, %zmm3
+; AVX512VLBW-NEXT: vpsrlw $8, %zmm3, %zmm3
+; AVX512VLBW-NEXT: vpbroadcastb %xmm3, %zmm3
+; AVX512VLBW-NEXT: vpandq %zmm3, %zmm4, %zmm3
+; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VLBW-NEXT: vpsllw %xmm4, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vpsllw %xmm4, %zmm7, %zmm4
+; AVX512VLBW-NEXT: vpsllw %xmm4, %zmm5, %zmm4
; AVX512VLBW-NEXT: vpbroadcastb %xmm4, %zmm4
; AVX512VLBW-NEXT: vpandq %zmm4, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vporq %zmm5, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vptestnmb %zmm3, %zmm2, %k1
+; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512VLBW-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatvar_funnnel_v64i8:
; AVX512VLVBMI2: # %bb.0:
; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %zmm2
-; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
-; AVX512VLVBMI2-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VLVBMI2-NEXT: vpsrlw %xmm5, %zmm1, %zmm6
-; AVX512VLVBMI2-NEXT: vpternlogd $255, %zmm7, %zmm7, %zmm7
-; AVX512VLVBMI2-NEXT: vpsrlw %xmm5, %zmm7, %zmm5
-; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm5, %zmm5
-; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm5, %zmm5
-; AVX512VLVBMI2-NEXT: vpandq %zmm5, %zmm6, %zmm5
-; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm6 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
-; AVX512VLVBMI2-NEXT: vpsubb %xmm4, %xmm6, %xmm4
+; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
+; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
+; AVX512VLVBMI2-NEXT: vpsrlw %xmm3, %zmm1, %zmm4
+; AVX512VLVBMI2-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5
+; AVX512VLVBMI2-NEXT: vpsrlw %xmm3, %zmm5, %zmm3
+; AVX512VLVBMI2-NEXT: vpsrlw $8, %zmm3, %zmm3
+; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm3, %zmm3
+; AVX512VLVBMI2-NEXT: vpandq %zmm3, %zmm4, %zmm3
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4
; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero
; AVX512VLVBMI2-NEXT: vpsllw %xmm4, %zmm0, %zmm0
-; AVX512VLVBMI2-NEXT: vpsllw %xmm4, %zmm7, %zmm4
+; AVX512VLVBMI2-NEXT: vpsllw %xmm4, %zmm5, %zmm4
; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm4, %zmm4
; AVX512VLVBMI2-NEXT: vpandq %zmm4, %zmm0, %zmm0
-; AVX512VLVBMI2-NEXT: vporq %zmm5, %zmm0, %zmm0
-; AVX512VLVBMI2-NEXT: vptestnmb %zmm3, %zmm2, %k1
+; AVX512VLVBMI2-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1
; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
; AVX512VLVBMI2-NEXT: retq
%splat = shufflevector <64 x i8> %amt, <64 x i8> undef, <64 x i32> zeroinitializer
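The funnel-shift variants above all mask the shift amount and then select the unshifted operand for lanes where the masked amount is zero. Here is a minimal IR sketch of that tail pattern, simplified to v8i64 with a made-up function name; it is illustrative only and not code from the test file:

define <8 x i64> @select_when_amount_zero(<8 x i64> %shifted, <8 x i64> %y, <8 x i64> %amt) {
  ; Mask the amount to the element bit width (63 for i64).
  %masked = and <8 x i64> %amt, <i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63>
  ; Lanes with a zero masked amount keep the original value.
  %iszero = icmp eq <8 x i64> %masked, zeroinitializer
  %res = select <8 x i1> %iszero, <8 x i64> %y, <8 x i64> %shifted
  ret <8 x i64> %res
}

The icmp-eq-against-zero of the AND result is what now selects to the vptestnmq %zmm2, %zmm2 seen above: the masked amount is tested against itself, so the [63,63,...] constant no longer needs a register of its own and the AND can take a (%rip){1to8} broadcast memory operand instead.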
Modified: llvm/trunk/test/CodeGen/X86/vector-lzcnt-512.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-lzcnt-512.ll?rev=358359&r1=358358&r2=358359&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-lzcnt-512.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-lzcnt-512.ll Sun Apr 14 11:26:11 2019
@@ -359,16 +359,15 @@ define <32 x i16> @testv32i16(<32 x i16>
;
; AVX512BW-LABEL: testv32i16:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm2
-; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm3
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT: vptestnmb %zmm4, %zmm3, %k0
-; AVX512BW-NEXT: vpmovm2b %k0, %zmm5
-; AVX512BW-NEXT: vpandq %zmm5, %zmm2, %zmm2
-; AVX512BW-NEXT: vpandq %zmm4, %zmm3, %zmm3
-; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm1
-; AVX512BW-NEXT: vpaddb %zmm1, %zmm2, %zmm1
+; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm3
+; AVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm2
+; AVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k0
+; AVX512BW-NEXT: vpmovm2b %k0, %zmm1
+; AVX512BW-NEXT: vpandq %zmm1, %zmm2, %zmm1
+; AVX512BW-NEXT: vpaddb %zmm3, %zmm1, %zmm1
; AVX512BW-NEXT: vptestnmb %zmm0, %zmm0, %k0
; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
@@ -442,16 +441,15 @@ define <32 x i16> @testv32i16u(<32 x i16
;
; AVX512BW-LABEL: testv32i16u:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm2
-; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm3
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT: vptestnmb %zmm4, %zmm3, %k0
-; AVX512BW-NEXT: vpmovm2b %k0, %zmm5
-; AVX512BW-NEXT: vpandq %zmm5, %zmm2, %zmm2
-; AVX512BW-NEXT: vpandq %zmm4, %zmm3, %zmm3
-; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm1
-; AVX512BW-NEXT: vpaddb %zmm1, %zmm2, %zmm1
+; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm3
+; AVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm2
+; AVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k0
+; AVX512BW-NEXT: vpmovm2b %k0, %zmm1
+; AVX512BW-NEXT: vpandq %zmm1, %zmm2, %zmm1
+; AVX512BW-NEXT: vpaddb %zmm3, %zmm1, %zmm1
; AVX512BW-NEXT: vptestnmb %zmm0, %zmm0, %k0
; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0
@@ -549,16 +547,15 @@ define <64 x i8> @testv64i8(<64 x i8> %i
;
; AVX512BW-LABEL: testv64i8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm2
-; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT: vptestnmb %zmm3, %zmm0, %k0
-; AVX512BW-NEXT: vpmovm2b %k0, %zmm4
-; AVX512BW-NEXT: vpandq %zmm4, %zmm2, %zmm2
-; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0
-; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT: vpaddb %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm3
+; AVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k0
+; AVX512BW-NEXT: vpmovm2b %k0, %zmm1
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddb %zmm3, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: testv64i8:
@@ -640,16 +637,15 @@ define <64 x i8> @testv64i8u(<64 x i8> %
;
; AVX512BW-LABEL: testv64i8u:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm2
-; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX512BW-NEXT: vptestnmb %zmm3, %zmm0, %k0
-; AVX512BW-NEXT: vpmovm2b %k0, %zmm4
-; AVX512BW-NEXT: vpandq %zmm4, %zmm2, %zmm2
-; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0
-; AVX512BW-NEXT: vpshufb %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT: vpaddb %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm1
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
+; AVX512BW-NEXT: vpshufb %zmm1, %zmm2, %zmm3
+; AVX512BW-NEXT: vpshufb %zmm0, %zmm2, %zmm0
+; AVX512BW-NEXT: vptestnmb %zmm1, %zmm1, %k0
+; AVX512BW-NEXT: vpmovm2b %k0, %zmm1
+; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpaddb %zmm3, %zmm0, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: testv64i8u:
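The vptestnmb changes in the lzcnt tests follow the same compare-of-masked-bits shape. For reference, a short sketch of the two generic polarities (hypothetical function names, simplified to v8i64): VPTESTM sets a mask bit where the AND of its operands is nonzero, and VPTESTNM where it is zero, so with AVX512 these compares should select to vptestmq and vptestnmq respectively.

; icmp ne (and a, b), 0 -- the VPTESTM polarity
define <8 x i1> @bits_any_set(<8 x i64> %a, <8 x i64> %b) {
  %m = and <8 x i64> %a, %b
  %r = icmp ne <8 x i64> %m, zeroinitializer
  ret <8 x i1> %r
}

; icmp eq (and a, b), 0 -- the VPTESTNM polarity
define <8 x i1> @bits_all_clear(<8 x i64> %a, <8 x i64> %b) {
  %m = and <8 x i64> %a, %b
  %r = icmp eq <8 x i64> %m, zeroinitializer
  ret <8 x i1> %r
}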